diff --git "a/checkpoint-18445/trainer_state.json" "b/checkpoint-18445/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-18445/trainer_state.json" @@ -0,0 +1,129199 @@ +{ + "best_global_step": 18445, + "best_metric": 67.47337717729287, + "best_model_checkpoint": "whisper-tiny-bfloat16-sada/checkpoints/checkpoint-18445", + "epoch": 1.2497459177451047, + "eval_steps": 3689, + "global_step": 18445, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 6.775526797208483e-05, + "grad_norm": 132.70848083496094, + "learning_rate": 0.0, + "loss": 4.4561, + "step": 1 + }, + { + "epoch": 0.00013551053594416967, + "grad_norm": 133.01596069335938, + "learning_rate": 1.3333333333333334e-07, + "loss": 4.8642, + "step": 2 + }, + { + "epoch": 0.0002032658039162545, + "grad_norm": 106.03809356689453, + "learning_rate": 2.6666666666666667e-07, + "loss": 4.3778, + "step": 3 + }, + { + "epoch": 0.00027102107188833934, + "grad_norm": 88.75433349609375, + "learning_rate": 4.0000000000000003e-07, + "loss": 4.0385, + "step": 4 + }, + { + "epoch": 0.00033877633986042414, + "grad_norm": 101.19609832763672, + "learning_rate": 5.333333333333333e-07, + "loss": 3.9092, + "step": 5 + }, + { + "epoch": 0.000406531607832509, + "grad_norm": 67.02723693847656, + "learning_rate": 6.666666666666667e-07, + "loss": 3.7556, + "step": 6 + }, + { + "epoch": 0.0004742868758045938, + "grad_norm": 67.52012634277344, + "learning_rate": 8.000000000000001e-07, + "loss": 3.7119, + "step": 7 + }, + { + "epoch": 0.0005420421437766787, + "grad_norm": 83.27348327636719, + "learning_rate": 9.333333333333334e-07, + "loss": 3.6125, + "step": 8 + }, + { + "epoch": 0.0006097974117487635, + "grad_norm": 84.20647430419922, + "learning_rate": 1.0666666666666667e-06, + "loss": 4.1354, + "step": 9 + }, + { + "epoch": 0.0006775526797208483, + "grad_norm": 110.12113952636719, + "learning_rate": 1.2000000000000002e-06, + "loss": 3.9594, + "step": 10 + }, + { + "epoch": 0.0007453079476929331, + "grad_norm": 84.61614990234375, + "learning_rate": 1.3333333333333334e-06, + "loss": 4.1415, + "step": 11 + }, + { + "epoch": 0.000813063215665018, + "grad_norm": 119.0740966796875, + "learning_rate": 1.4666666666666667e-06, + "loss": 4.0689, + "step": 12 + }, + { + "epoch": 0.0008808184836371028, + "grad_norm": 122.41352081298828, + "learning_rate": 1.6000000000000001e-06, + "loss": 4.0834, + "step": 13 + }, + { + "epoch": 0.0009485737516091876, + "grad_norm": 99.06126403808594, + "learning_rate": 1.7333333333333334e-06, + "loss": 4.4224, + "step": 14 + }, + { + "epoch": 0.0010163290195812724, + "grad_norm": 113.89531707763672, + "learning_rate": 1.8666666666666669e-06, + "loss": 4.0188, + "step": 15 + }, + { + "epoch": 0.0010840842875533573, + "grad_norm": 70.24198150634766, + "learning_rate": 2.0000000000000003e-06, + "loss": 3.3951, + "step": 16 + }, + { + "epoch": 0.001151839555525442, + "grad_norm": 67.99130249023438, + "learning_rate": 2.1333333333333334e-06, + "loss": 3.7385, + "step": 17 + }, + { + "epoch": 0.001219594823497527, + "grad_norm": 54.554779052734375, + "learning_rate": 2.266666666666667e-06, + "loss": 2.9009, + "step": 18 + }, + { + "epoch": 0.0012873500914696117, + "grad_norm": 47.79149627685547, + "learning_rate": 2.4000000000000003e-06, + "loss": 3.0572, + "step": 19 + }, + { + "epoch": 0.0013551053594416966, + "grad_norm": 50.05776596069336, + "learning_rate": 2.5333333333333334e-06, + "loss": 2.8678, + "step": 20 + }, + { + "epoch": 0.0014228606274137815, + "grad_norm": 55.41701889038086, + "learning_rate": 2.666666666666667e-06, + "loss": 3.3981, + "step": 21 + }, + { + "epoch": 0.0014906158953858662, + "grad_norm": 57.1912727355957, + "learning_rate": 2.8000000000000003e-06, + "loss": 3.3247, + "step": 22 + }, + { + "epoch": 0.001558371163357951, + "grad_norm": 39.77119445800781, + "learning_rate": 2.9333333333333333e-06, + "loss": 3.1466, + "step": 23 + }, + { + "epoch": 0.001626126431330036, + "grad_norm": 37.258392333984375, + "learning_rate": 3.066666666666667e-06, + "loss": 3.0137, + "step": 24 + }, + { + "epoch": 0.0016938816993021207, + "grad_norm": 41.15428924560547, + "learning_rate": 3.2000000000000003e-06, + "loss": 2.9908, + "step": 25 + }, + { + "epoch": 0.0017616369672742056, + "grad_norm": 44.59917449951172, + "learning_rate": 3.3333333333333333e-06, + "loss": 3.6947, + "step": 26 + }, + { + "epoch": 0.0018293922352462903, + "grad_norm": 32.7044792175293, + "learning_rate": 3.466666666666667e-06, + "loss": 2.9421, + "step": 27 + }, + { + "epoch": 0.0018971475032183752, + "grad_norm": 28.445444107055664, + "learning_rate": 3.6e-06, + "loss": 2.9849, + "step": 28 + }, + { + "epoch": 0.00196490277119046, + "grad_norm": 30.879247665405273, + "learning_rate": 3.7333333333333337e-06, + "loss": 2.9007, + "step": 29 + }, + { + "epoch": 0.002032658039162545, + "grad_norm": 22.723188400268555, + "learning_rate": 3.866666666666667e-06, + "loss": 2.3549, + "step": 30 + }, + { + "epoch": 0.0021004133071346296, + "grad_norm": 23.59457778930664, + "learning_rate": 4.000000000000001e-06, + "loss": 2.8564, + "step": 31 + }, + { + "epoch": 0.0021681685751067147, + "grad_norm": 23.313579559326172, + "learning_rate": 4.133333333333333e-06, + "loss": 2.453, + "step": 32 + }, + { + "epoch": 0.0022359238430787994, + "grad_norm": 23.877405166625977, + "learning_rate": 4.266666666666667e-06, + "loss": 2.2042, + "step": 33 + }, + { + "epoch": 0.002303679111050884, + "grad_norm": 29.79376792907715, + "learning_rate": 4.4e-06, + "loss": 2.6679, + "step": 34 + }, + { + "epoch": 0.002371434379022969, + "grad_norm": 21.018878936767578, + "learning_rate": 4.533333333333334e-06, + "loss": 2.3983, + "step": 35 + }, + { + "epoch": 0.002439189646995054, + "grad_norm": 30.22182846069336, + "learning_rate": 4.666666666666667e-06, + "loss": 2.8668, + "step": 36 + }, + { + "epoch": 0.0025069449149671386, + "grad_norm": 24.619905471801758, + "learning_rate": 4.800000000000001e-06, + "loss": 2.5986, + "step": 37 + }, + { + "epoch": 0.0025747001829392233, + "grad_norm": 25.470375061035156, + "learning_rate": 4.933333333333333e-06, + "loss": 2.4936, + "step": 38 + }, + { + "epoch": 0.0026424554509113084, + "grad_norm": 19.872745513916016, + "learning_rate": 5.066666666666667e-06, + "loss": 2.1633, + "step": 39 + }, + { + "epoch": 0.002710210718883393, + "grad_norm": 24.880964279174805, + "learning_rate": 5.2e-06, + "loss": 2.6798, + "step": 40 + }, + { + "epoch": 0.002777965986855478, + "grad_norm": 22.790157318115234, + "learning_rate": 5.333333333333334e-06, + "loss": 2.487, + "step": 41 + }, + { + "epoch": 0.002845721254827563, + "grad_norm": 21.404136657714844, + "learning_rate": 5.466666666666667e-06, + "loss": 2.3383, + "step": 42 + }, + { + "epoch": 0.0029134765227996477, + "grad_norm": 22.80590057373047, + "learning_rate": 5.600000000000001e-06, + "loss": 2.6347, + "step": 43 + }, + { + "epoch": 0.0029812317907717324, + "grad_norm": 20.625160217285156, + "learning_rate": 5.733333333333333e-06, + "loss": 2.066, + "step": 44 + }, + { + "epoch": 0.0030489870587438175, + "grad_norm": 21.700828552246094, + "learning_rate": 5.866666666666667e-06, + "loss": 2.188, + "step": 45 + }, + { + "epoch": 0.003116742326715902, + "grad_norm": 18.8945255279541, + "learning_rate": 6e-06, + "loss": 2.1531, + "step": 46 + }, + { + "epoch": 0.003184497594687987, + "grad_norm": 20.238525390625, + "learning_rate": 6.133333333333334e-06, + "loss": 1.9306, + "step": 47 + }, + { + "epoch": 0.003252252862660072, + "grad_norm": 23.74344825744629, + "learning_rate": 6.266666666666666e-06, + "loss": 2.3206, + "step": 48 + }, + { + "epoch": 0.0033200081306321567, + "grad_norm": 22.429964065551758, + "learning_rate": 6.4000000000000006e-06, + "loss": 2.05, + "step": 49 + }, + { + "epoch": 0.0033877633986042414, + "grad_norm": 22.078792572021484, + "learning_rate": 6.533333333333333e-06, + "loss": 2.1155, + "step": 50 + }, + { + "epoch": 0.003455518666576326, + "grad_norm": 19.691747665405273, + "learning_rate": 6.666666666666667e-06, + "loss": 1.8719, + "step": 51 + }, + { + "epoch": 0.0035232739345484113, + "grad_norm": 27.412866592407227, + "learning_rate": 6.800000000000001e-06, + "loss": 2.0943, + "step": 52 + }, + { + "epoch": 0.003591029202520496, + "grad_norm": 21.631580352783203, + "learning_rate": 6.933333333333334e-06, + "loss": 2.246, + "step": 53 + }, + { + "epoch": 0.0036587844704925807, + "grad_norm": 16.729095458984375, + "learning_rate": 7.066666666666667e-06, + "loss": 1.8101, + "step": 54 + }, + { + "epoch": 0.003726539738464666, + "grad_norm": 18.381364822387695, + "learning_rate": 7.2e-06, + "loss": 1.8668, + "step": 55 + }, + { + "epoch": 0.0037942950064367505, + "grad_norm": 19.661123275756836, + "learning_rate": 7.333333333333334e-06, + "loss": 2.1912, + "step": 56 + }, + { + "epoch": 0.003862050274408835, + "grad_norm": 13.772311210632324, + "learning_rate": 7.4666666666666675e-06, + "loss": 1.7408, + "step": 57 + }, + { + "epoch": 0.00392980554238092, + "grad_norm": 20.559425354003906, + "learning_rate": 7.6e-06, + "loss": 1.9717, + "step": 58 + }, + { + "epoch": 0.003997560810353005, + "grad_norm": 17.385791778564453, + "learning_rate": 7.733333333333334e-06, + "loss": 1.7681, + "step": 59 + }, + { + "epoch": 0.00406531607832509, + "grad_norm": 18.455888748168945, + "learning_rate": 7.866666666666667e-06, + "loss": 1.8822, + "step": 60 + }, + { + "epoch": 0.004133071346297174, + "grad_norm": 17.53643798828125, + "learning_rate": 8.000000000000001e-06, + "loss": 1.8158, + "step": 61 + }, + { + "epoch": 0.004200826614269259, + "grad_norm": 15.640396118164062, + "learning_rate": 8.133333333333332e-06, + "loss": 1.8786, + "step": 62 + }, + { + "epoch": 0.004268581882241345, + "grad_norm": 67.14771270751953, + "learning_rate": 8.266666666666667e-06, + "loss": 1.845, + "step": 63 + }, + { + "epoch": 0.004336337150213429, + "grad_norm": 16.778207778930664, + "learning_rate": 8.400000000000001e-06, + "loss": 1.7286, + "step": 64 + }, + { + "epoch": 0.004404092418185514, + "grad_norm": 20.485218048095703, + "learning_rate": 8.533333333333334e-06, + "loss": 1.882, + "step": 65 + }, + { + "epoch": 0.004471847686157599, + "grad_norm": 15.77828598022461, + "learning_rate": 8.666666666666668e-06, + "loss": 1.7788, + "step": 66 + }, + { + "epoch": 0.0045396029541296835, + "grad_norm": 20.225608825683594, + "learning_rate": 8.8e-06, + "loss": 1.9842, + "step": 67 + }, + { + "epoch": 0.004607358222101768, + "grad_norm": 16.230987548828125, + "learning_rate": 8.933333333333333e-06, + "loss": 1.4581, + "step": 68 + }, + { + "epoch": 0.004675113490073853, + "grad_norm": 13.831804275512695, + "learning_rate": 9.066666666666667e-06, + "loss": 1.6173, + "step": 69 + }, + { + "epoch": 0.004742868758045938, + "grad_norm": 16.4691104888916, + "learning_rate": 9.2e-06, + "loss": 1.6055, + "step": 70 + }, + { + "epoch": 0.004810624026018023, + "grad_norm": 17.788105010986328, + "learning_rate": 9.333333333333334e-06, + "loss": 1.7996, + "step": 71 + }, + { + "epoch": 0.004878379293990108, + "grad_norm": 20.043182373046875, + "learning_rate": 9.466666666666667e-06, + "loss": 1.7224, + "step": 72 + }, + { + "epoch": 0.0049461345619621925, + "grad_norm": 14.918837547302246, + "learning_rate": 9.600000000000001e-06, + "loss": 1.4448, + "step": 73 + }, + { + "epoch": 0.005013889829934277, + "grad_norm": 15.70170783996582, + "learning_rate": 9.733333333333334e-06, + "loss": 1.7469, + "step": 74 + }, + { + "epoch": 0.005081645097906362, + "grad_norm": 19.128456115722656, + "learning_rate": 9.866666666666667e-06, + "loss": 1.9505, + "step": 75 + }, + { + "epoch": 0.005149400365878447, + "grad_norm": 34.82925796508789, + "learning_rate": 1e-05, + "loss": 1.6543, + "step": 76 + }, + { + "epoch": 0.005217155633850532, + "grad_norm": 19.61214256286621, + "learning_rate": 1.0133333333333333e-05, + "loss": 1.9051, + "step": 77 + }, + { + "epoch": 0.005284910901822617, + "grad_norm": 20.190847396850586, + "learning_rate": 1.0266666666666668e-05, + "loss": 1.6016, + "step": 78 + }, + { + "epoch": 0.005352666169794702, + "grad_norm": 15.873608589172363, + "learning_rate": 1.04e-05, + "loss": 1.8286, + "step": 79 + }, + { + "epoch": 0.005420421437766786, + "grad_norm": 17.513385772705078, + "learning_rate": 1.0533333333333335e-05, + "loss": 1.7122, + "step": 80 + }, + { + "epoch": 0.005488176705738871, + "grad_norm": 13.794294357299805, + "learning_rate": 1.0666666666666667e-05, + "loss": 1.5369, + "step": 81 + }, + { + "epoch": 0.005555931973710956, + "grad_norm": 16.763822555541992, + "learning_rate": 1.08e-05, + "loss": 1.688, + "step": 82 + }, + { + "epoch": 0.005623687241683041, + "grad_norm": 15.182608604431152, + "learning_rate": 1.0933333333333334e-05, + "loss": 1.9076, + "step": 83 + }, + { + "epoch": 0.005691442509655126, + "grad_norm": 19.11581802368164, + "learning_rate": 1.1066666666666667e-05, + "loss": 1.6034, + "step": 84 + }, + { + "epoch": 0.005759197777627211, + "grad_norm": 18.303098678588867, + "learning_rate": 1.1200000000000001e-05, + "loss": 1.8248, + "step": 85 + }, + { + "epoch": 0.005826953045599295, + "grad_norm": 17.912731170654297, + "learning_rate": 1.1333333333333334e-05, + "loss": 1.737, + "step": 86 + }, + { + "epoch": 0.00589470831357138, + "grad_norm": 18.4353084564209, + "learning_rate": 1.1466666666666666e-05, + "loss": 1.7141, + "step": 87 + }, + { + "epoch": 0.005962463581543465, + "grad_norm": 17.006074905395508, + "learning_rate": 1.16e-05, + "loss": 1.6568, + "step": 88 + }, + { + "epoch": 0.0060302188495155494, + "grad_norm": 15.084410667419434, + "learning_rate": 1.1733333333333333e-05, + "loss": 1.3369, + "step": 89 + }, + { + "epoch": 0.006097974117487635, + "grad_norm": 14.984724044799805, + "learning_rate": 1.1866666666666668e-05, + "loss": 1.777, + "step": 90 + }, + { + "epoch": 0.00616572938545972, + "grad_norm": 19.447574615478516, + "learning_rate": 1.2e-05, + "loss": 1.4225, + "step": 91 + }, + { + "epoch": 0.006233484653431804, + "grad_norm": 15.212474822998047, + "learning_rate": 1.2133333333333335e-05, + "loss": 1.4896, + "step": 92 + }, + { + "epoch": 0.006301239921403889, + "grad_norm": 17.97924041748047, + "learning_rate": 1.2266666666666667e-05, + "loss": 1.5947, + "step": 93 + }, + { + "epoch": 0.006368995189375974, + "grad_norm": 13.695365905761719, + "learning_rate": 1.24e-05, + "loss": 1.471, + "step": 94 + }, + { + "epoch": 0.0064367504573480585, + "grad_norm": 15.574562072753906, + "learning_rate": 1.2533333333333332e-05, + "loss": 1.5803, + "step": 95 + }, + { + "epoch": 0.006504505725320144, + "grad_norm": 16.71872329711914, + "learning_rate": 1.2666666666666668e-05, + "loss": 1.7408, + "step": 96 + }, + { + "epoch": 0.006572260993292229, + "grad_norm": 14.232036590576172, + "learning_rate": 1.2800000000000001e-05, + "loss": 1.4414, + "step": 97 + }, + { + "epoch": 0.0066400162612643135, + "grad_norm": 14.430774688720703, + "learning_rate": 1.2933333333333334e-05, + "loss": 1.712, + "step": 98 + }, + { + "epoch": 0.006707771529236398, + "grad_norm": 15.887603759765625, + "learning_rate": 1.3066666666666666e-05, + "loss": 1.6097, + "step": 99 + }, + { + "epoch": 0.006775526797208483, + "grad_norm": 21.155643463134766, + "learning_rate": 1.32e-05, + "loss": 2.0581, + "step": 100 + }, + { + "epoch": 0.0068432820651805676, + "grad_norm": 17.30326271057129, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.2823, + "step": 101 + }, + { + "epoch": 0.006911037333152652, + "grad_norm": 18.184707641601562, + "learning_rate": 1.3466666666666666e-05, + "loss": 1.5165, + "step": 102 + }, + { + "epoch": 0.006978792601124738, + "grad_norm": 23.46563148498535, + "learning_rate": 1.3600000000000002e-05, + "loss": 1.406, + "step": 103 + }, + { + "epoch": 0.0070465478690968225, + "grad_norm": 14.586087226867676, + "learning_rate": 1.3733333333333335e-05, + "loss": 1.8416, + "step": 104 + }, + { + "epoch": 0.007114303137068907, + "grad_norm": 14.55811595916748, + "learning_rate": 1.3866666666666667e-05, + "loss": 1.5466, + "step": 105 + }, + { + "epoch": 0.007182058405040992, + "grad_norm": 20.244586944580078, + "learning_rate": 1.4000000000000001e-05, + "loss": 1.755, + "step": 106 + }, + { + "epoch": 0.007249813673013077, + "grad_norm": 11.932172775268555, + "learning_rate": 1.4133333333333334e-05, + "loss": 1.4543, + "step": 107 + }, + { + "epoch": 0.007317568940985161, + "grad_norm": 17.406888961791992, + "learning_rate": 1.4266666666666667e-05, + "loss": 1.4436, + "step": 108 + }, + { + "epoch": 0.007385324208957246, + "grad_norm": 14.591705322265625, + "learning_rate": 1.44e-05, + "loss": 1.4088, + "step": 109 + }, + { + "epoch": 0.007453079476929332, + "grad_norm": 18.028337478637695, + "learning_rate": 1.4533333333333335e-05, + "loss": 1.6675, + "step": 110 + }, + { + "epoch": 0.007520834744901416, + "grad_norm": 16.37979507446289, + "learning_rate": 1.4666666666666668e-05, + "loss": 1.6646, + "step": 111 + }, + { + "epoch": 0.007588590012873501, + "grad_norm": 19.028308868408203, + "learning_rate": 1.48e-05, + "loss": 1.701, + "step": 112 + }, + { + "epoch": 0.007656345280845586, + "grad_norm": 17.28181266784668, + "learning_rate": 1.4933333333333335e-05, + "loss": 1.3984, + "step": 113 + }, + { + "epoch": 0.00772410054881767, + "grad_norm": 16.78407859802246, + "learning_rate": 1.5066666666666668e-05, + "loss": 1.6851, + "step": 114 + }, + { + "epoch": 0.007791855816789755, + "grad_norm": 15.634642601013184, + "learning_rate": 1.52e-05, + "loss": 1.6484, + "step": 115 + }, + { + "epoch": 0.00785961108476184, + "grad_norm": 13.879626274108887, + "learning_rate": 1.5333333333333334e-05, + "loss": 1.4609, + "step": 116 + }, + { + "epoch": 0.007927366352733925, + "grad_norm": 16.247541427612305, + "learning_rate": 1.546666666666667e-05, + "loss": 1.5219, + "step": 117 + }, + { + "epoch": 0.00799512162070601, + "grad_norm": 20.57497215270996, + "learning_rate": 1.56e-05, + "loss": 1.6038, + "step": 118 + }, + { + "epoch": 0.008062876888678095, + "grad_norm": 16.32591438293457, + "learning_rate": 1.5733333333333334e-05, + "loss": 1.5315, + "step": 119 + }, + { + "epoch": 0.00813063215665018, + "grad_norm": 18.236255645751953, + "learning_rate": 1.586666666666667e-05, + "loss": 1.6589, + "step": 120 + }, + { + "epoch": 0.008198387424622264, + "grad_norm": 20.850723266601562, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.5556, + "step": 121 + }, + { + "epoch": 0.008266142692594349, + "grad_norm": 17.931947708129883, + "learning_rate": 1.6133333333333334e-05, + "loss": 1.8209, + "step": 122 + }, + { + "epoch": 0.008333897960566434, + "grad_norm": 20.531381607055664, + "learning_rate": 1.6266666666666665e-05, + "loss": 1.6421, + "step": 123 + }, + { + "epoch": 0.008401653228538518, + "grad_norm": 15.6729736328125, + "learning_rate": 1.6400000000000002e-05, + "loss": 1.5652, + "step": 124 + }, + { + "epoch": 0.008469408496510603, + "grad_norm": 18.139949798583984, + "learning_rate": 1.6533333333333333e-05, + "loss": 1.5578, + "step": 125 + }, + { + "epoch": 0.00853716376448269, + "grad_norm": 15.77291202545166, + "learning_rate": 1.6666666666666667e-05, + "loss": 1.6091, + "step": 126 + }, + { + "epoch": 0.008604919032454774, + "grad_norm": 18.00165557861328, + "learning_rate": 1.6800000000000002e-05, + "loss": 1.4725, + "step": 127 + }, + { + "epoch": 0.008672674300426859, + "grad_norm": 17.33936309814453, + "learning_rate": 1.6933333333333333e-05, + "loss": 1.4272, + "step": 128 + }, + { + "epoch": 0.008740429568398943, + "grad_norm": 16.75677490234375, + "learning_rate": 1.7066666666666667e-05, + "loss": 1.602, + "step": 129 + }, + { + "epoch": 0.008808184836371028, + "grad_norm": 17.761140823364258, + "learning_rate": 1.7199999999999998e-05, + "loss": 1.8523, + "step": 130 + }, + { + "epoch": 0.008875940104343113, + "grad_norm": 15.888497352600098, + "learning_rate": 1.7333333333333336e-05, + "loss": 1.5594, + "step": 131 + }, + { + "epoch": 0.008943695372315198, + "grad_norm": 19.62826156616211, + "learning_rate": 1.7466666666666667e-05, + "loss": 1.6909, + "step": 132 + }, + { + "epoch": 0.009011450640287282, + "grad_norm": 15.62592887878418, + "learning_rate": 1.76e-05, + "loss": 1.5873, + "step": 133 + }, + { + "epoch": 0.009079205908259367, + "grad_norm": 17.64893913269043, + "learning_rate": 1.7733333333333335e-05, + "loss": 1.5681, + "step": 134 + }, + { + "epoch": 0.009146961176231452, + "grad_norm": 15.14854907989502, + "learning_rate": 1.7866666666666666e-05, + "loss": 1.3324, + "step": 135 + }, + { + "epoch": 0.009214716444203536, + "grad_norm": 18.115419387817383, + "learning_rate": 1.8e-05, + "loss": 1.5512, + "step": 136 + }, + { + "epoch": 0.009282471712175621, + "grad_norm": 18.2511043548584, + "learning_rate": 1.8133333333333335e-05, + "loss": 1.6405, + "step": 137 + }, + { + "epoch": 0.009350226980147706, + "grad_norm": 13.296855926513672, + "learning_rate": 1.826666666666667e-05, + "loss": 1.2269, + "step": 138 + }, + { + "epoch": 0.009417982248119792, + "grad_norm": 17.17320442199707, + "learning_rate": 1.84e-05, + "loss": 1.4031, + "step": 139 + }, + { + "epoch": 0.009485737516091877, + "grad_norm": 15.399650573730469, + "learning_rate": 1.8533333333333334e-05, + "loss": 1.4793, + "step": 140 + }, + { + "epoch": 0.009553492784063962, + "grad_norm": 15.0385160446167, + "learning_rate": 1.866666666666667e-05, + "loss": 1.4129, + "step": 141 + }, + { + "epoch": 0.009621248052036046, + "grad_norm": 14.586984634399414, + "learning_rate": 1.88e-05, + "loss": 1.1959, + "step": 142 + }, + { + "epoch": 0.009689003320008131, + "grad_norm": 14.939247131347656, + "learning_rate": 1.8933333333333334e-05, + "loss": 1.6728, + "step": 143 + }, + { + "epoch": 0.009756758587980216, + "grad_norm": 18.1126651763916, + "learning_rate": 1.9066666666666668e-05, + "loss": 1.5035, + "step": 144 + }, + { + "epoch": 0.0098245138559523, + "grad_norm": 16.0823974609375, + "learning_rate": 1.9200000000000003e-05, + "loss": 1.544, + "step": 145 + }, + { + "epoch": 0.009892269123924385, + "grad_norm": 17.45062828063965, + "learning_rate": 1.9333333333333333e-05, + "loss": 1.4593, + "step": 146 + }, + { + "epoch": 0.00996002439189647, + "grad_norm": 21.030731201171875, + "learning_rate": 1.9466666666666668e-05, + "loss": 1.6544, + "step": 147 + }, + { + "epoch": 0.010027779659868554, + "grad_norm": 18.31300163269043, + "learning_rate": 1.9600000000000002e-05, + "loss": 1.7682, + "step": 148 + }, + { + "epoch": 0.01009553492784064, + "grad_norm": 13.541064262390137, + "learning_rate": 1.9733333333333333e-05, + "loss": 1.333, + "step": 149 + }, + { + "epoch": 0.010163290195812724, + "grad_norm": 13.538237571716309, + "learning_rate": 1.9866666666666667e-05, + "loss": 1.3629, + "step": 150 + }, + { + "epoch": 0.010231045463784809, + "grad_norm": 18.393657684326172, + "learning_rate": 2e-05, + "loss": 1.4467, + "step": 151 + }, + { + "epoch": 0.010298800731756893, + "grad_norm": 19.18985939025879, + "learning_rate": 2.0133333333333336e-05, + "loss": 1.612, + "step": 152 + }, + { + "epoch": 0.01036655599972898, + "grad_norm": 14.230488777160645, + "learning_rate": 2.0266666666666667e-05, + "loss": 1.4047, + "step": 153 + }, + { + "epoch": 0.010434311267701064, + "grad_norm": 13.807863235473633, + "learning_rate": 2.04e-05, + "loss": 1.4977, + "step": 154 + }, + { + "epoch": 0.010502066535673149, + "grad_norm": 15.149568557739258, + "learning_rate": 2.0533333333333336e-05, + "loss": 1.3856, + "step": 155 + }, + { + "epoch": 0.010569821803645234, + "grad_norm": 16.095308303833008, + "learning_rate": 2.0666666666666666e-05, + "loss": 1.3687, + "step": 156 + }, + { + "epoch": 0.010637577071617318, + "grad_norm": 14.864961624145508, + "learning_rate": 2.08e-05, + "loss": 1.5707, + "step": 157 + }, + { + "epoch": 0.010705332339589403, + "grad_norm": 15.9810152053833, + "learning_rate": 2.0933333333333335e-05, + "loss": 1.6841, + "step": 158 + }, + { + "epoch": 0.010773087607561488, + "grad_norm": 16.706584930419922, + "learning_rate": 2.106666666666667e-05, + "loss": 1.5017, + "step": 159 + }, + { + "epoch": 0.010840842875533573, + "grad_norm": 18.726655960083008, + "learning_rate": 2.12e-05, + "loss": 1.7177, + "step": 160 + }, + { + "epoch": 0.010908598143505657, + "grad_norm": 17.22323989868164, + "learning_rate": 2.1333333333333335e-05, + "loss": 1.5452, + "step": 161 + }, + { + "epoch": 0.010976353411477742, + "grad_norm": 14.823019027709961, + "learning_rate": 2.146666666666667e-05, + "loss": 1.2856, + "step": 162 + }, + { + "epoch": 0.011044108679449827, + "grad_norm": 14.756085395812988, + "learning_rate": 2.16e-05, + "loss": 1.4942, + "step": 163 + }, + { + "epoch": 0.011111863947421911, + "grad_norm": 18.523683547973633, + "learning_rate": 2.1733333333333334e-05, + "loss": 1.4832, + "step": 164 + }, + { + "epoch": 0.011179619215393996, + "grad_norm": 16.75520896911621, + "learning_rate": 2.186666666666667e-05, + "loss": 1.1847, + "step": 165 + }, + { + "epoch": 0.011247374483366082, + "grad_norm": 17.045665740966797, + "learning_rate": 2.2000000000000003e-05, + "loss": 1.6358, + "step": 166 + }, + { + "epoch": 0.011315129751338167, + "grad_norm": 17.235095977783203, + "learning_rate": 2.2133333333333334e-05, + "loss": 1.6984, + "step": 167 + }, + { + "epoch": 0.011382885019310252, + "grad_norm": 18.120485305786133, + "learning_rate": 2.2266666666666668e-05, + "loss": 1.5544, + "step": 168 + }, + { + "epoch": 0.011450640287282337, + "grad_norm": 15.252264976501465, + "learning_rate": 2.2400000000000002e-05, + "loss": 1.4386, + "step": 169 + }, + { + "epoch": 0.011518395555254421, + "grad_norm": 13.91727066040039, + "learning_rate": 2.2533333333333333e-05, + "loss": 1.2083, + "step": 170 + }, + { + "epoch": 0.011586150823226506, + "grad_norm": 16.50945472717285, + "learning_rate": 2.2666666666666668e-05, + "loss": 1.5878, + "step": 171 + }, + { + "epoch": 0.01165390609119859, + "grad_norm": 12.260165214538574, + "learning_rate": 2.2800000000000002e-05, + "loss": 1.4173, + "step": 172 + }, + { + "epoch": 0.011721661359170675, + "grad_norm": 17.12419891357422, + "learning_rate": 2.2933333333333333e-05, + "loss": 1.3117, + "step": 173 + }, + { + "epoch": 0.01178941662714276, + "grad_norm": 17.508407592773438, + "learning_rate": 2.3066666666666667e-05, + "loss": 1.4294, + "step": 174 + }, + { + "epoch": 0.011857171895114845, + "grad_norm": 17.78769302368164, + "learning_rate": 2.32e-05, + "loss": 1.8437, + "step": 175 + }, + { + "epoch": 0.01192492716308693, + "grad_norm": 13.79132080078125, + "learning_rate": 2.3333333333333336e-05, + "loss": 1.3969, + "step": 176 + }, + { + "epoch": 0.011992682431059014, + "grad_norm": 16.519269943237305, + "learning_rate": 2.3466666666666667e-05, + "loss": 1.5833, + "step": 177 + }, + { + "epoch": 0.012060437699031099, + "grad_norm": 15.920942306518555, + "learning_rate": 2.36e-05, + "loss": 1.4783, + "step": 178 + }, + { + "epoch": 0.012128192967003185, + "grad_norm": 17.00782585144043, + "learning_rate": 2.3733333333333335e-05, + "loss": 1.4038, + "step": 179 + }, + { + "epoch": 0.01219594823497527, + "grad_norm": 18.844655990600586, + "learning_rate": 2.3866666666666666e-05, + "loss": 1.653, + "step": 180 + }, + { + "epoch": 0.012263703502947355, + "grad_norm": 18.05002784729004, + "learning_rate": 2.4e-05, + "loss": 1.4137, + "step": 181 + }, + { + "epoch": 0.01233145877091944, + "grad_norm": 14.200697898864746, + "learning_rate": 2.4133333333333335e-05, + "loss": 1.3067, + "step": 182 + }, + { + "epoch": 0.012399214038891524, + "grad_norm": 16.28152084350586, + "learning_rate": 2.426666666666667e-05, + "loss": 1.3425, + "step": 183 + }, + { + "epoch": 0.012466969306863609, + "grad_norm": 18.691898345947266, + "learning_rate": 2.44e-05, + "loss": 1.5995, + "step": 184 + }, + { + "epoch": 0.012534724574835693, + "grad_norm": 16.62773895263672, + "learning_rate": 2.4533333333333334e-05, + "loss": 1.5722, + "step": 185 + }, + { + "epoch": 0.012602479842807778, + "grad_norm": 19.650226593017578, + "learning_rate": 2.466666666666667e-05, + "loss": 1.5445, + "step": 186 + }, + { + "epoch": 0.012670235110779863, + "grad_norm": 14.582402229309082, + "learning_rate": 2.48e-05, + "loss": 1.4822, + "step": 187 + }, + { + "epoch": 0.012737990378751948, + "grad_norm": 16.5295352935791, + "learning_rate": 2.4933333333333334e-05, + "loss": 1.5981, + "step": 188 + }, + { + "epoch": 0.012805745646724032, + "grad_norm": 17.096031188964844, + "learning_rate": 2.5066666666666665e-05, + "loss": 1.2527, + "step": 189 + }, + { + "epoch": 0.012873500914696117, + "grad_norm": 15.424555778503418, + "learning_rate": 2.5200000000000003e-05, + "loss": 1.5284, + "step": 190 + }, + { + "epoch": 0.012941256182668202, + "grad_norm": 14.505122184753418, + "learning_rate": 2.5333333333333337e-05, + "loss": 1.5726, + "step": 191 + }, + { + "epoch": 0.013009011450640288, + "grad_norm": 13.430007934570312, + "learning_rate": 2.5466666666666668e-05, + "loss": 1.5905, + "step": 192 + }, + { + "epoch": 0.013076766718612373, + "grad_norm": 12.291808128356934, + "learning_rate": 2.5600000000000002e-05, + "loss": 1.3122, + "step": 193 + }, + { + "epoch": 0.013144521986584458, + "grad_norm": 16.053747177124023, + "learning_rate": 2.5733333333333337e-05, + "loss": 1.3164, + "step": 194 + }, + { + "epoch": 0.013212277254556542, + "grad_norm": 17.762910842895508, + "learning_rate": 2.5866666666666667e-05, + "loss": 1.4727, + "step": 195 + }, + { + "epoch": 0.013280032522528627, + "grad_norm": 15.695332527160645, + "learning_rate": 2.6000000000000002e-05, + "loss": 1.5584, + "step": 196 + }, + { + "epoch": 0.013347787790500712, + "grad_norm": 14.789410591125488, + "learning_rate": 2.6133333333333333e-05, + "loss": 1.3253, + "step": 197 + }, + { + "epoch": 0.013415543058472796, + "grad_norm": 15.248231887817383, + "learning_rate": 2.6266666666666667e-05, + "loss": 1.3087, + "step": 198 + }, + { + "epoch": 0.013483298326444881, + "grad_norm": 18.894878387451172, + "learning_rate": 2.64e-05, + "loss": 1.2755, + "step": 199 + }, + { + "epoch": 0.013551053594416966, + "grad_norm": 16.611616134643555, + "learning_rate": 2.6533333333333332e-05, + "loss": 1.4218, + "step": 200 + }, + { + "epoch": 0.01361880886238905, + "grad_norm": 17.056982040405273, + "learning_rate": 2.6666666666666667e-05, + "loss": 1.4376, + "step": 201 + }, + { + "epoch": 0.013686564130361135, + "grad_norm": 15.91457748413086, + "learning_rate": 2.6800000000000004e-05, + "loss": 1.4836, + "step": 202 + }, + { + "epoch": 0.01375431939833322, + "grad_norm": 16.371660232543945, + "learning_rate": 2.6933333333333332e-05, + "loss": 1.7374, + "step": 203 + }, + { + "epoch": 0.013822074666305304, + "grad_norm": 14.45882511138916, + "learning_rate": 2.706666666666667e-05, + "loss": 1.3324, + "step": 204 + }, + { + "epoch": 0.013889829934277391, + "grad_norm": 18.779190063476562, + "learning_rate": 2.7200000000000004e-05, + "loss": 1.4845, + "step": 205 + }, + { + "epoch": 0.013957585202249476, + "grad_norm": 19.7078914642334, + "learning_rate": 2.733333333333333e-05, + "loss": 1.6681, + "step": 206 + }, + { + "epoch": 0.01402534047022156, + "grad_norm": 20.498266220092773, + "learning_rate": 2.746666666666667e-05, + "loss": 1.559, + "step": 207 + }, + { + "epoch": 0.014093095738193645, + "grad_norm": 20.2978515625, + "learning_rate": 2.7600000000000003e-05, + "loss": 1.5821, + "step": 208 + }, + { + "epoch": 0.01416085100616573, + "grad_norm": 18.85460090637207, + "learning_rate": 2.7733333333333334e-05, + "loss": 1.3, + "step": 209 + }, + { + "epoch": 0.014228606274137814, + "grad_norm": 13.912948608398438, + "learning_rate": 2.786666666666667e-05, + "loss": 1.4833, + "step": 210 + }, + { + "epoch": 0.014296361542109899, + "grad_norm": 19.084806442260742, + "learning_rate": 2.8000000000000003e-05, + "loss": 1.3379, + "step": 211 + }, + { + "epoch": 0.014364116810081984, + "grad_norm": 15.123217582702637, + "learning_rate": 2.8133333333333334e-05, + "loss": 1.354, + "step": 212 + }, + { + "epoch": 0.014431872078054069, + "grad_norm": 14.127299308776855, + "learning_rate": 2.8266666666666668e-05, + "loss": 1.2341, + "step": 213 + }, + { + "epoch": 0.014499627346026153, + "grad_norm": 15.609139442443848, + "learning_rate": 2.84e-05, + "loss": 1.3474, + "step": 214 + }, + { + "epoch": 0.014567382613998238, + "grad_norm": 20.588394165039062, + "learning_rate": 2.8533333333333333e-05, + "loss": 1.4881, + "step": 215 + }, + { + "epoch": 0.014635137881970323, + "grad_norm": 12.27170467376709, + "learning_rate": 2.8666666666666668e-05, + "loss": 1.2255, + "step": 216 + }, + { + "epoch": 0.014702893149942407, + "grad_norm": 18.14453125, + "learning_rate": 2.88e-05, + "loss": 1.6328, + "step": 217 + }, + { + "epoch": 0.014770648417914492, + "grad_norm": 17.722230911254883, + "learning_rate": 2.8933333333333333e-05, + "loss": 2.0172, + "step": 218 + }, + { + "epoch": 0.014838403685886578, + "grad_norm": 14.454687118530273, + "learning_rate": 2.906666666666667e-05, + "loss": 1.2936, + "step": 219 + }, + { + "epoch": 0.014906158953858663, + "grad_norm": 16.369625091552734, + "learning_rate": 2.9199999999999998e-05, + "loss": 1.6289, + "step": 220 + }, + { + "epoch": 0.014973914221830748, + "grad_norm": 17.64832305908203, + "learning_rate": 2.9333333333333336e-05, + "loss": 1.3135, + "step": 221 + }, + { + "epoch": 0.015041669489802833, + "grad_norm": 14.401778221130371, + "learning_rate": 2.946666666666667e-05, + "loss": 1.2614, + "step": 222 + }, + { + "epoch": 0.015109424757774917, + "grad_norm": 17.145814895629883, + "learning_rate": 2.96e-05, + "loss": 1.3473, + "step": 223 + }, + { + "epoch": 0.015177180025747002, + "grad_norm": 17.273954391479492, + "learning_rate": 2.9733333333333336e-05, + "loss": 1.3747, + "step": 224 + }, + { + "epoch": 0.015244935293719087, + "grad_norm": 17.190818786621094, + "learning_rate": 2.986666666666667e-05, + "loss": 1.5452, + "step": 225 + }, + { + "epoch": 0.015312690561691171, + "grad_norm": 19.573585510253906, + "learning_rate": 3e-05, + "loss": 1.4942, + "step": 226 + }, + { + "epoch": 0.015380445829663256, + "grad_norm": 15.349540710449219, + "learning_rate": 3.0133333333333335e-05, + "loss": 1.4277, + "step": 227 + }, + { + "epoch": 0.01544820109763534, + "grad_norm": 19.786211013793945, + "learning_rate": 3.0266666666666666e-05, + "loss": 1.5014, + "step": 228 + }, + { + "epoch": 0.015515956365607425, + "grad_norm": 17.51229476928711, + "learning_rate": 3.04e-05, + "loss": 1.695, + "step": 229 + }, + { + "epoch": 0.01558371163357951, + "grad_norm": 19.894756317138672, + "learning_rate": 3.0533333333333335e-05, + "loss": 1.4994, + "step": 230 + }, + { + "epoch": 0.015651466901551597, + "grad_norm": 13.575235366821289, + "learning_rate": 3.066666666666667e-05, + "loss": 1.4037, + "step": 231 + }, + { + "epoch": 0.01571922216952368, + "grad_norm": 18.516822814941406, + "learning_rate": 3.08e-05, + "loss": 1.5494, + "step": 232 + }, + { + "epoch": 0.015786977437495766, + "grad_norm": 17.343368530273438, + "learning_rate": 3.093333333333334e-05, + "loss": 1.1708, + "step": 233 + }, + { + "epoch": 0.01585473270546785, + "grad_norm": 18.78635597229004, + "learning_rate": 3.1066666666666665e-05, + "loss": 1.6543, + "step": 234 + }, + { + "epoch": 0.015922487973439935, + "grad_norm": 16.219989776611328, + "learning_rate": 3.12e-05, + "loss": 1.381, + "step": 235 + }, + { + "epoch": 0.01599024324141202, + "grad_norm": 15.987724304199219, + "learning_rate": 3.1333333333333334e-05, + "loss": 1.3981, + "step": 236 + }, + { + "epoch": 0.016057998509384105, + "grad_norm": 17.251537322998047, + "learning_rate": 3.146666666666667e-05, + "loss": 1.8181, + "step": 237 + }, + { + "epoch": 0.01612575377735619, + "grad_norm": 15.804817199707031, + "learning_rate": 3.16e-05, + "loss": 1.4755, + "step": 238 + }, + { + "epoch": 0.016193509045328274, + "grad_norm": 16.508943557739258, + "learning_rate": 3.173333333333334e-05, + "loss": 1.2167, + "step": 239 + }, + { + "epoch": 0.01626126431330036, + "grad_norm": 17.382783889770508, + "learning_rate": 3.1866666666666664e-05, + "loss": 1.6186, + "step": 240 + }, + { + "epoch": 0.016329019581272444, + "grad_norm": 17.938161849975586, + "learning_rate": 3.2000000000000005e-05, + "loss": 1.3685, + "step": 241 + }, + { + "epoch": 0.016396774849244528, + "grad_norm": 21.103981018066406, + "learning_rate": 3.213333333333334e-05, + "loss": 1.6275, + "step": 242 + }, + { + "epoch": 0.016464530117216613, + "grad_norm": 15.145224571228027, + "learning_rate": 3.226666666666667e-05, + "loss": 1.2058, + "step": 243 + }, + { + "epoch": 0.016532285385188698, + "grad_norm": 15.368849754333496, + "learning_rate": 3.24e-05, + "loss": 1.3436, + "step": 244 + }, + { + "epoch": 0.016600040653160782, + "grad_norm": 16.95122718811035, + "learning_rate": 3.253333333333333e-05, + "loss": 1.4346, + "step": 245 + }, + { + "epoch": 0.016667795921132867, + "grad_norm": 14.612030029296875, + "learning_rate": 3.266666666666667e-05, + "loss": 1.4759, + "step": 246 + }, + { + "epoch": 0.016735551189104952, + "grad_norm": 17.6989688873291, + "learning_rate": 3.2800000000000004e-05, + "loss": 1.3612, + "step": 247 + }, + { + "epoch": 0.016803306457077036, + "grad_norm": 19.985427856445312, + "learning_rate": 3.293333333333333e-05, + "loss": 1.2718, + "step": 248 + }, + { + "epoch": 0.01687106172504912, + "grad_norm": 12.162117958068848, + "learning_rate": 3.3066666666666666e-05, + "loss": 1.2133, + "step": 249 + }, + { + "epoch": 0.016938816993021206, + "grad_norm": 19.053850173950195, + "learning_rate": 3.32e-05, + "loss": 1.1345, + "step": 250 + }, + { + "epoch": 0.01700657226099329, + "grad_norm": 15.729182243347168, + "learning_rate": 3.3333333333333335e-05, + "loss": 1.3079, + "step": 251 + }, + { + "epoch": 0.01707432752896538, + "grad_norm": 18.984880447387695, + "learning_rate": 3.346666666666667e-05, + "loss": 1.5124, + "step": 252 + }, + { + "epoch": 0.017142082796937463, + "grad_norm": 14.444266319274902, + "learning_rate": 3.3600000000000004e-05, + "loss": 1.4279, + "step": 253 + }, + { + "epoch": 0.017209838064909548, + "grad_norm": 17.174089431762695, + "learning_rate": 3.373333333333333e-05, + "loss": 1.1192, + "step": 254 + }, + { + "epoch": 0.017277593332881633, + "grad_norm": 15.885781288146973, + "learning_rate": 3.3866666666666665e-05, + "loss": 1.3918, + "step": 255 + }, + { + "epoch": 0.017345348600853717, + "grad_norm": 14.934944152832031, + "learning_rate": 3.4000000000000007e-05, + "loss": 1.3348, + "step": 256 + }, + { + "epoch": 0.017413103868825802, + "grad_norm": 17.977643966674805, + "learning_rate": 3.4133333333333334e-05, + "loss": 1.392, + "step": 257 + }, + { + "epoch": 0.017480859136797887, + "grad_norm": 17.527130126953125, + "learning_rate": 3.426666666666667e-05, + "loss": 1.2939, + "step": 258 + }, + { + "epoch": 0.01754861440476997, + "grad_norm": 17.823453903198242, + "learning_rate": 3.4399999999999996e-05, + "loss": 1.3118, + "step": 259 + }, + { + "epoch": 0.017616369672742056, + "grad_norm": 15.746111869812012, + "learning_rate": 3.453333333333334e-05, + "loss": 1.3865, + "step": 260 + }, + { + "epoch": 0.01768412494071414, + "grad_norm": 18.798080444335938, + "learning_rate": 3.466666666666667e-05, + "loss": 1.399, + "step": 261 + }, + { + "epoch": 0.017751880208686226, + "grad_norm": 15.139904975891113, + "learning_rate": 3.48e-05, + "loss": 1.5248, + "step": 262 + }, + { + "epoch": 0.01781963547665831, + "grad_norm": 16.150434494018555, + "learning_rate": 3.493333333333333e-05, + "loss": 1.5033, + "step": 263 + }, + { + "epoch": 0.017887390744630395, + "grad_norm": 19.125490188598633, + "learning_rate": 3.506666666666667e-05, + "loss": 1.4695, + "step": 264 + }, + { + "epoch": 0.01795514601260248, + "grad_norm": 15.708895683288574, + "learning_rate": 3.52e-05, + "loss": 1.6189, + "step": 265 + }, + { + "epoch": 0.018022901280574564, + "grad_norm": 14.185934066772461, + "learning_rate": 3.5333333333333336e-05, + "loss": 1.3536, + "step": 266 + }, + { + "epoch": 0.01809065654854665, + "grad_norm": 15.22371768951416, + "learning_rate": 3.546666666666667e-05, + "loss": 1.4652, + "step": 267 + }, + { + "epoch": 0.018158411816518734, + "grad_norm": 15.854277610778809, + "learning_rate": 3.56e-05, + "loss": 1.7211, + "step": 268 + }, + { + "epoch": 0.01822616708449082, + "grad_norm": 18.41203498840332, + "learning_rate": 3.573333333333333e-05, + "loss": 1.54, + "step": 269 + }, + { + "epoch": 0.018293922352462903, + "grad_norm": 15.323198318481445, + "learning_rate": 3.586666666666667e-05, + "loss": 1.3582, + "step": 270 + }, + { + "epoch": 0.018361677620434988, + "grad_norm": 15.056265830993652, + "learning_rate": 3.6e-05, + "loss": 1.2959, + "step": 271 + }, + { + "epoch": 0.018429432888407073, + "grad_norm": 18.912555694580078, + "learning_rate": 3.6133333333333335e-05, + "loss": 1.5964, + "step": 272 + }, + { + "epoch": 0.018497188156379157, + "grad_norm": 15.093083381652832, + "learning_rate": 3.626666666666667e-05, + "loss": 1.3403, + "step": 273 + }, + { + "epoch": 0.018564943424351242, + "grad_norm": 17.38453483581543, + "learning_rate": 3.6400000000000004e-05, + "loss": 1.5082, + "step": 274 + }, + { + "epoch": 0.018632698692323327, + "grad_norm": 16.53204345703125, + "learning_rate": 3.653333333333334e-05, + "loss": 1.2373, + "step": 275 + }, + { + "epoch": 0.01870045396029541, + "grad_norm": 13.423516273498535, + "learning_rate": 3.6666666666666666e-05, + "loss": 1.3111, + "step": 276 + }, + { + "epoch": 0.018768209228267496, + "grad_norm": 14.92001724243164, + "learning_rate": 3.68e-05, + "loss": 1.5767, + "step": 277 + }, + { + "epoch": 0.018835964496239584, + "grad_norm": 15.229578971862793, + "learning_rate": 3.6933333333333334e-05, + "loss": 1.19, + "step": 278 + }, + { + "epoch": 0.01890371976421167, + "grad_norm": 17.159273147583008, + "learning_rate": 3.706666666666667e-05, + "loss": 1.335, + "step": 279 + }, + { + "epoch": 0.018971475032183754, + "grad_norm": 13.7230806350708, + "learning_rate": 3.72e-05, + "loss": 1.2587, + "step": 280 + }, + { + "epoch": 0.01903923030015584, + "grad_norm": 15.375404357910156, + "learning_rate": 3.733333333333334e-05, + "loss": 1.2744, + "step": 281 + }, + { + "epoch": 0.019106985568127923, + "grad_norm": 21.054534912109375, + "learning_rate": 3.7466666666666665e-05, + "loss": 1.5581, + "step": 282 + }, + { + "epoch": 0.019174740836100008, + "grad_norm": 17.954307556152344, + "learning_rate": 3.76e-05, + "loss": 1.6617, + "step": 283 + }, + { + "epoch": 0.019242496104072093, + "grad_norm": 17.954139709472656, + "learning_rate": 3.773333333333334e-05, + "loss": 1.7486, + "step": 284 + }, + { + "epoch": 0.019310251372044177, + "grad_norm": 16.210704803466797, + "learning_rate": 3.786666666666667e-05, + "loss": 1.661, + "step": 285 + }, + { + "epoch": 0.019378006640016262, + "grad_norm": 18.14916229248047, + "learning_rate": 3.8e-05, + "loss": 1.8154, + "step": 286 + }, + { + "epoch": 0.019445761907988347, + "grad_norm": 16.372896194458008, + "learning_rate": 3.8133333333333336e-05, + "loss": 1.5464, + "step": 287 + }, + { + "epoch": 0.01951351717596043, + "grad_norm": 16.4403076171875, + "learning_rate": 3.8266666666666664e-05, + "loss": 1.0815, + "step": 288 + }, + { + "epoch": 0.019581272443932516, + "grad_norm": 16.612815856933594, + "learning_rate": 3.8400000000000005e-05, + "loss": 1.4338, + "step": 289 + }, + { + "epoch": 0.0196490277119046, + "grad_norm": 17.0092716217041, + "learning_rate": 3.853333333333334e-05, + "loss": 1.5257, + "step": 290 + }, + { + "epoch": 0.019716782979876685, + "grad_norm": 13.920248031616211, + "learning_rate": 3.866666666666667e-05, + "loss": 1.4595, + "step": 291 + }, + { + "epoch": 0.01978453824784877, + "grad_norm": 12.699677467346191, + "learning_rate": 3.88e-05, + "loss": 1.2602, + "step": 292 + }, + { + "epoch": 0.019852293515820855, + "grad_norm": 17.31343650817871, + "learning_rate": 3.8933333333333336e-05, + "loss": 1.2934, + "step": 293 + }, + { + "epoch": 0.01992004878379294, + "grad_norm": 16.7819881439209, + "learning_rate": 3.906666666666667e-05, + "loss": 1.5385, + "step": 294 + }, + { + "epoch": 0.019987804051765024, + "grad_norm": 14.175074577331543, + "learning_rate": 3.9200000000000004e-05, + "loss": 1.1644, + "step": 295 + }, + { + "epoch": 0.02005555931973711, + "grad_norm": 22.863780975341797, + "learning_rate": 3.933333333333333e-05, + "loss": 1.5932, + "step": 296 + }, + { + "epoch": 0.020123314587709194, + "grad_norm": 15.278692245483398, + "learning_rate": 3.9466666666666666e-05, + "loss": 1.3997, + "step": 297 + }, + { + "epoch": 0.02019106985568128, + "grad_norm": 15.878965377807617, + "learning_rate": 3.960000000000001e-05, + "loss": 1.5992, + "step": 298 + }, + { + "epoch": 0.020258825123653363, + "grad_norm": 17.444494247436523, + "learning_rate": 3.9733333333333335e-05, + "loss": 1.5224, + "step": 299 + }, + { + "epoch": 0.020326580391625448, + "grad_norm": 15.011860847473145, + "learning_rate": 3.986666666666667e-05, + "loss": 1.2609, + "step": 300 + }, + { + "epoch": 0.020394335659597532, + "grad_norm": 17.057064056396484, + "learning_rate": 4e-05, + "loss": 1.2724, + "step": 301 + }, + { + "epoch": 0.020462090927569617, + "grad_norm": 14.946621894836426, + "learning_rate": 4.013333333333333e-05, + "loss": 1.273, + "step": 302 + }, + { + "epoch": 0.020529846195541702, + "grad_norm": 15.072589874267578, + "learning_rate": 4.026666666666667e-05, + "loss": 1.2029, + "step": 303 + }, + { + "epoch": 0.020597601463513786, + "grad_norm": 14.304384231567383, + "learning_rate": 4.0400000000000006e-05, + "loss": 1.3601, + "step": 304 + }, + { + "epoch": 0.020665356731485875, + "grad_norm": 16.18627166748047, + "learning_rate": 4.0533333333333334e-05, + "loss": 1.5844, + "step": 305 + }, + { + "epoch": 0.02073311199945796, + "grad_norm": 14.144999504089355, + "learning_rate": 4.066666666666667e-05, + "loss": 1.1638, + "step": 306 + }, + { + "epoch": 0.020800867267430044, + "grad_norm": 13.793155670166016, + "learning_rate": 4.08e-05, + "loss": 1.2101, + "step": 307 + }, + { + "epoch": 0.02086862253540213, + "grad_norm": 19.344085693359375, + "learning_rate": 4.093333333333334e-05, + "loss": 1.5899, + "step": 308 + }, + { + "epoch": 0.020936377803374213, + "grad_norm": 13.670002937316895, + "learning_rate": 4.106666666666667e-05, + "loss": 1.4271, + "step": 309 + }, + { + "epoch": 0.021004133071346298, + "grad_norm": 17.72041893005371, + "learning_rate": 4.12e-05, + "loss": 1.5276, + "step": 310 + }, + { + "epoch": 0.021071888339318383, + "grad_norm": 14.103938102722168, + "learning_rate": 4.133333333333333e-05, + "loss": 1.2355, + "step": 311 + }, + { + "epoch": 0.021139643607290468, + "grad_norm": 16.439529418945312, + "learning_rate": 4.146666666666667e-05, + "loss": 1.3393, + "step": 312 + }, + { + "epoch": 0.021207398875262552, + "grad_norm": 13.821382522583008, + "learning_rate": 4.16e-05, + "loss": 1.275, + "step": 313 + }, + { + "epoch": 0.021275154143234637, + "grad_norm": 19.987564086914062, + "learning_rate": 4.1733333333333336e-05, + "loss": 1.415, + "step": 314 + }, + { + "epoch": 0.02134290941120672, + "grad_norm": 12.660404205322266, + "learning_rate": 4.186666666666667e-05, + "loss": 1.2378, + "step": 315 + }, + { + "epoch": 0.021410664679178806, + "grad_norm": 14.009933471679688, + "learning_rate": 4.2e-05, + "loss": 1.1615, + "step": 316 + }, + { + "epoch": 0.02147841994715089, + "grad_norm": 16.11932945251465, + "learning_rate": 4.213333333333334e-05, + "loss": 1.0653, + "step": 317 + }, + { + "epoch": 0.021546175215122976, + "grad_norm": 15.943892478942871, + "learning_rate": 4.226666666666667e-05, + "loss": 1.6208, + "step": 318 + }, + { + "epoch": 0.02161393048309506, + "grad_norm": 16.01548194885254, + "learning_rate": 4.24e-05, + "loss": 1.4052, + "step": 319 + }, + { + "epoch": 0.021681685751067145, + "grad_norm": 15.703156471252441, + "learning_rate": 4.2533333333333335e-05, + "loss": 1.3736, + "step": 320 + }, + { + "epoch": 0.02174944101903923, + "grad_norm": 20.367021560668945, + "learning_rate": 4.266666666666667e-05, + "loss": 1.5758, + "step": 321 + }, + { + "epoch": 0.021817196287011315, + "grad_norm": 15.678637504577637, + "learning_rate": 4.2800000000000004e-05, + "loss": 1.4458, + "step": 322 + }, + { + "epoch": 0.0218849515549834, + "grad_norm": 15.707669258117676, + "learning_rate": 4.293333333333334e-05, + "loss": 1.3065, + "step": 323 + }, + { + "epoch": 0.021952706822955484, + "grad_norm": 16.07583999633789, + "learning_rate": 4.3066666666666665e-05, + "loss": 1.5875, + "step": 324 + }, + { + "epoch": 0.02202046209092757, + "grad_norm": 15.248065948486328, + "learning_rate": 4.32e-05, + "loss": 1.5972, + "step": 325 + }, + { + "epoch": 0.022088217358899653, + "grad_norm": 17.77379608154297, + "learning_rate": 4.3333333333333334e-05, + "loss": 1.3385, + "step": 326 + }, + { + "epoch": 0.022155972626871738, + "grad_norm": 24.381973266601562, + "learning_rate": 4.346666666666667e-05, + "loss": 1.6428, + "step": 327 + }, + { + "epoch": 0.022223727894843823, + "grad_norm": 17.3684024810791, + "learning_rate": 4.36e-05, + "loss": 1.3634, + "step": 328 + }, + { + "epoch": 0.022291483162815907, + "grad_norm": 20.45623207092285, + "learning_rate": 4.373333333333334e-05, + "loss": 1.3947, + "step": 329 + }, + { + "epoch": 0.022359238430787992, + "grad_norm": 17.434114456176758, + "learning_rate": 4.3866666666666665e-05, + "loss": 1.4269, + "step": 330 + }, + { + "epoch": 0.02242699369876008, + "grad_norm": 13.574700355529785, + "learning_rate": 4.4000000000000006e-05, + "loss": 1.2807, + "step": 331 + }, + { + "epoch": 0.022494748966732165, + "grad_norm": 16.404966354370117, + "learning_rate": 4.413333333333334e-05, + "loss": 1.4549, + "step": 332 + }, + { + "epoch": 0.02256250423470425, + "grad_norm": 14.14947509765625, + "learning_rate": 4.426666666666667e-05, + "loss": 1.4187, + "step": 333 + }, + { + "epoch": 0.022630259502676334, + "grad_norm": 17.632492065429688, + "learning_rate": 4.44e-05, + "loss": 1.3515, + "step": 334 + }, + { + "epoch": 0.02269801477064842, + "grad_norm": 18.149261474609375, + "learning_rate": 4.4533333333333336e-05, + "loss": 1.4875, + "step": 335 + }, + { + "epoch": 0.022765770038620504, + "grad_norm": 19.864049911499023, + "learning_rate": 4.466666666666667e-05, + "loss": 1.5622, + "step": 336 + }, + { + "epoch": 0.02283352530659259, + "grad_norm": 16.7431583404541, + "learning_rate": 4.4800000000000005e-05, + "loss": 1.4344, + "step": 337 + }, + { + "epoch": 0.022901280574564673, + "grad_norm": 16.53998565673828, + "learning_rate": 4.493333333333333e-05, + "loss": 1.1745, + "step": 338 + }, + { + "epoch": 0.022969035842536758, + "grad_norm": 17.786945343017578, + "learning_rate": 4.5066666666666667e-05, + "loss": 1.4335, + "step": 339 + }, + { + "epoch": 0.023036791110508843, + "grad_norm": 17.730606079101562, + "learning_rate": 4.52e-05, + "loss": 1.341, + "step": 340 + }, + { + "epoch": 0.023104546378480927, + "grad_norm": 15.536660194396973, + "learning_rate": 4.5333333333333335e-05, + "loss": 1.4935, + "step": 341 + }, + { + "epoch": 0.023172301646453012, + "grad_norm": 22.405893325805664, + "learning_rate": 4.546666666666667e-05, + "loss": 1.5538, + "step": 342 + }, + { + "epoch": 0.023240056914425097, + "grad_norm": 15.963828086853027, + "learning_rate": 4.5600000000000004e-05, + "loss": 1.373, + "step": 343 + }, + { + "epoch": 0.02330781218239718, + "grad_norm": 13.041223526000977, + "learning_rate": 4.573333333333333e-05, + "loss": 1.346, + "step": 344 + }, + { + "epoch": 0.023375567450369266, + "grad_norm": 14.552989959716797, + "learning_rate": 4.5866666666666666e-05, + "loss": 1.2723, + "step": 345 + }, + { + "epoch": 0.02344332271834135, + "grad_norm": 16.034404754638672, + "learning_rate": 4.600000000000001e-05, + "loss": 1.3671, + "step": 346 + }, + { + "epoch": 0.023511077986313435, + "grad_norm": 17.5505428314209, + "learning_rate": 4.6133333333333334e-05, + "loss": 1.4553, + "step": 347 + }, + { + "epoch": 0.02357883325428552, + "grad_norm": 14.029112815856934, + "learning_rate": 4.626666666666667e-05, + "loss": 1.3639, + "step": 348 + }, + { + "epoch": 0.023646588522257605, + "grad_norm": 14.633806228637695, + "learning_rate": 4.64e-05, + "loss": 1.61, + "step": 349 + }, + { + "epoch": 0.02371434379022969, + "grad_norm": 15.407472610473633, + "learning_rate": 4.653333333333334e-05, + "loss": 1.2449, + "step": 350 + }, + { + "epoch": 0.023782099058201774, + "grad_norm": 13.801481246948242, + "learning_rate": 4.666666666666667e-05, + "loss": 1.3105, + "step": 351 + }, + { + "epoch": 0.02384985432617386, + "grad_norm": 17.675159454345703, + "learning_rate": 4.6800000000000006e-05, + "loss": 1.3142, + "step": 352 + }, + { + "epoch": 0.023917609594145944, + "grad_norm": 15.383625984191895, + "learning_rate": 4.6933333333333333e-05, + "loss": 1.2976, + "step": 353 + }, + { + "epoch": 0.02398536486211803, + "grad_norm": 18.087081909179688, + "learning_rate": 4.706666666666667e-05, + "loss": 1.3391, + "step": 354 + }, + { + "epoch": 0.024053120130090113, + "grad_norm": 15.790519714355469, + "learning_rate": 4.72e-05, + "loss": 1.7103, + "step": 355 + }, + { + "epoch": 0.024120875398062198, + "grad_norm": 15.931246757507324, + "learning_rate": 4.7333333333333336e-05, + "loss": 1.4792, + "step": 356 + }, + { + "epoch": 0.024188630666034282, + "grad_norm": 18.39167022705078, + "learning_rate": 4.746666666666667e-05, + "loss": 1.5085, + "step": 357 + }, + { + "epoch": 0.02425638593400637, + "grad_norm": 13.518149375915527, + "learning_rate": 4.76e-05, + "loss": 1.2711, + "step": 358 + }, + { + "epoch": 0.024324141201978455, + "grad_norm": 18.02762794494629, + "learning_rate": 4.773333333333333e-05, + "loss": 1.4113, + "step": 359 + }, + { + "epoch": 0.02439189646995054, + "grad_norm": 15.260608673095703, + "learning_rate": 4.7866666666666674e-05, + "loss": 1.3286, + "step": 360 + }, + { + "epoch": 0.024459651737922625, + "grad_norm": 16.26275062561035, + "learning_rate": 4.8e-05, + "loss": 1.4436, + "step": 361 + }, + { + "epoch": 0.02452740700589471, + "grad_norm": 14.213051795959473, + "learning_rate": 4.8133333333333336e-05, + "loss": 1.1339, + "step": 362 + }, + { + "epoch": 0.024595162273866794, + "grad_norm": 16.478811264038086, + "learning_rate": 4.826666666666667e-05, + "loss": 1.5534, + "step": 363 + }, + { + "epoch": 0.02466291754183888, + "grad_norm": 13.24108600616455, + "learning_rate": 4.8400000000000004e-05, + "loss": 1.4767, + "step": 364 + }, + { + "epoch": 0.024730672809810963, + "grad_norm": 15.167949676513672, + "learning_rate": 4.853333333333334e-05, + "loss": 1.1592, + "step": 365 + }, + { + "epoch": 0.024798428077783048, + "grad_norm": 18.150367736816406, + "learning_rate": 4.866666666666667e-05, + "loss": 1.4172, + "step": 366 + }, + { + "epoch": 0.024866183345755133, + "grad_norm": 13.250176429748535, + "learning_rate": 4.88e-05, + "loss": 1.5022, + "step": 367 + }, + { + "epoch": 0.024933938613727218, + "grad_norm": 14.780989646911621, + "learning_rate": 4.8933333333333335e-05, + "loss": 1.3243, + "step": 368 + }, + { + "epoch": 0.025001693881699302, + "grad_norm": 13.34748649597168, + "learning_rate": 4.906666666666667e-05, + "loss": 1.285, + "step": 369 + }, + { + "epoch": 0.025069449149671387, + "grad_norm": 14.677448272705078, + "learning_rate": 4.92e-05, + "loss": 1.1982, + "step": 370 + }, + { + "epoch": 0.02513720441764347, + "grad_norm": 18.211145401000977, + "learning_rate": 4.933333333333334e-05, + "loss": 1.4018, + "step": 371 + }, + { + "epoch": 0.025204959685615556, + "grad_norm": 17.68402099609375, + "learning_rate": 4.9466666666666665e-05, + "loss": 1.6153, + "step": 372 + }, + { + "epoch": 0.02527271495358764, + "grad_norm": 14.44299030303955, + "learning_rate": 4.96e-05, + "loss": 1.4092, + "step": 373 + }, + { + "epoch": 0.025340470221559726, + "grad_norm": 15.467754364013672, + "learning_rate": 4.973333333333334e-05, + "loss": 1.5647, + "step": 374 + }, + { + "epoch": 0.02540822548953181, + "grad_norm": 14.037840843200684, + "learning_rate": 4.986666666666667e-05, + "loss": 1.2106, + "step": 375 + }, + { + "epoch": 0.025475980757503895, + "grad_norm": 15.72461223602295, + "learning_rate": 5e-05, + "loss": 1.1832, + "step": 376 + }, + { + "epoch": 0.02554373602547598, + "grad_norm": 15.03637981414795, + "learning_rate": 5.013333333333333e-05, + "loss": 1.5803, + "step": 377 + }, + { + "epoch": 0.025611491293448065, + "grad_norm": 15.995429992675781, + "learning_rate": 5.026666666666667e-05, + "loss": 1.3837, + "step": 378 + }, + { + "epoch": 0.02567924656142015, + "grad_norm": 17.96516227722168, + "learning_rate": 5.0400000000000005e-05, + "loss": 1.5114, + "step": 379 + }, + { + "epoch": 0.025747001829392234, + "grad_norm": 17.960783004760742, + "learning_rate": 5.053333333333333e-05, + "loss": 1.4012, + "step": 380 + }, + { + "epoch": 0.02581475709736432, + "grad_norm": 14.399024963378906, + "learning_rate": 5.0666666666666674e-05, + "loss": 1.2906, + "step": 381 + }, + { + "epoch": 0.025882512365336403, + "grad_norm": 14.268051147460938, + "learning_rate": 5.08e-05, + "loss": 1.2354, + "step": 382 + }, + { + "epoch": 0.025950267633308488, + "grad_norm": 16.070646286010742, + "learning_rate": 5.0933333333333336e-05, + "loss": 1.272, + "step": 383 + }, + { + "epoch": 0.026018022901280576, + "grad_norm": 18.698619842529297, + "learning_rate": 5.106666666666668e-05, + "loss": 1.352, + "step": 384 + }, + { + "epoch": 0.02608577816925266, + "grad_norm": 13.932543754577637, + "learning_rate": 5.1200000000000004e-05, + "loss": 1.3249, + "step": 385 + }, + { + "epoch": 0.026153533437224746, + "grad_norm": 14.76308822631836, + "learning_rate": 5.133333333333333e-05, + "loss": 1.1807, + "step": 386 + }, + { + "epoch": 0.02622128870519683, + "grad_norm": 17.602182388305664, + "learning_rate": 5.146666666666667e-05, + "loss": 1.4017, + "step": 387 + }, + { + "epoch": 0.026289043973168915, + "grad_norm": 13.589346885681152, + "learning_rate": 5.16e-05, + "loss": 1.4205, + "step": 388 + }, + { + "epoch": 0.026356799241141, + "grad_norm": 16.37788200378418, + "learning_rate": 5.1733333333333335e-05, + "loss": 1.4255, + "step": 389 + }, + { + "epoch": 0.026424554509113084, + "grad_norm": 16.24977684020996, + "learning_rate": 5.1866666666666676e-05, + "loss": 1.2526, + "step": 390 + }, + { + "epoch": 0.02649230977708517, + "grad_norm": 15.02576732635498, + "learning_rate": 5.2000000000000004e-05, + "loss": 1.2147, + "step": 391 + }, + { + "epoch": 0.026560065045057254, + "grad_norm": 16.754850387573242, + "learning_rate": 5.213333333333333e-05, + "loss": 1.5725, + "step": 392 + }, + { + "epoch": 0.02662782031302934, + "grad_norm": 17.120729446411133, + "learning_rate": 5.2266666666666665e-05, + "loss": 1.351, + "step": 393 + }, + { + "epoch": 0.026695575581001423, + "grad_norm": 18.83492660522461, + "learning_rate": 5.2400000000000007e-05, + "loss": 1.3044, + "step": 394 + }, + { + "epoch": 0.026763330848973508, + "grad_norm": 18.66061019897461, + "learning_rate": 5.2533333333333334e-05, + "loss": 1.4889, + "step": 395 + }, + { + "epoch": 0.026831086116945593, + "grad_norm": 18.493236541748047, + "learning_rate": 5.266666666666666e-05, + "loss": 1.5015, + "step": 396 + }, + { + "epoch": 0.026898841384917677, + "grad_norm": 19.523067474365234, + "learning_rate": 5.28e-05, + "loss": 1.5179, + "step": 397 + }, + { + "epoch": 0.026966596652889762, + "grad_norm": 18.207366943359375, + "learning_rate": 5.293333333333334e-05, + "loss": 1.374, + "step": 398 + }, + { + "epoch": 0.027034351920861847, + "grad_norm": 13.021437644958496, + "learning_rate": 5.3066666666666665e-05, + "loss": 1.0259, + "step": 399 + }, + { + "epoch": 0.02710210718883393, + "grad_norm": 14.108929634094238, + "learning_rate": 5.3200000000000006e-05, + "loss": 1.3914, + "step": 400 + }, + { + "epoch": 0.027169862456806016, + "grad_norm": 15.087890625, + "learning_rate": 5.333333333333333e-05, + "loss": 1.3143, + "step": 401 + }, + { + "epoch": 0.0272376177247781, + "grad_norm": 16.389965057373047, + "learning_rate": 5.346666666666667e-05, + "loss": 1.5286, + "step": 402 + }, + { + "epoch": 0.027305372992750186, + "grad_norm": 17.855867385864258, + "learning_rate": 5.360000000000001e-05, + "loss": 1.4221, + "step": 403 + }, + { + "epoch": 0.02737312826072227, + "grad_norm": 14.016682624816895, + "learning_rate": 5.3733333333333336e-05, + "loss": 1.0922, + "step": 404 + }, + { + "epoch": 0.027440883528694355, + "grad_norm": 16.08883285522461, + "learning_rate": 5.3866666666666664e-05, + "loss": 1.4932, + "step": 405 + }, + { + "epoch": 0.02750863879666644, + "grad_norm": 14.883580207824707, + "learning_rate": 5.4000000000000005e-05, + "loss": 1.0806, + "step": 406 + }, + { + "epoch": 0.027576394064638524, + "grad_norm": 13.443008422851562, + "learning_rate": 5.413333333333334e-05, + "loss": 1.2653, + "step": 407 + }, + { + "epoch": 0.02764414933261061, + "grad_norm": 13.391244888305664, + "learning_rate": 5.4266666666666667e-05, + "loss": 1.2788, + "step": 408 + }, + { + "epoch": 0.027711904600582694, + "grad_norm": 13.449023246765137, + "learning_rate": 5.440000000000001e-05, + "loss": 1.133, + "step": 409 + }, + { + "epoch": 0.027779659868554782, + "grad_norm": 13.777191162109375, + "learning_rate": 5.4533333333333335e-05, + "loss": 1.1509, + "step": 410 + }, + { + "epoch": 0.027847415136526867, + "grad_norm": 19.04452896118164, + "learning_rate": 5.466666666666666e-05, + "loss": 1.3306, + "step": 411 + }, + { + "epoch": 0.02791517040449895, + "grad_norm": 16.583112716674805, + "learning_rate": 5.4800000000000004e-05, + "loss": 1.3018, + "step": 412 + }, + { + "epoch": 0.027982925672471036, + "grad_norm": 16.8561954498291, + "learning_rate": 5.493333333333334e-05, + "loss": 1.257, + "step": 413 + }, + { + "epoch": 0.02805068094044312, + "grad_norm": 15.321064949035645, + "learning_rate": 5.5066666666666666e-05, + "loss": 1.2498, + "step": 414 + }, + { + "epoch": 0.028118436208415205, + "grad_norm": 15.825779914855957, + "learning_rate": 5.520000000000001e-05, + "loss": 1.2177, + "step": 415 + }, + { + "epoch": 0.02818619147638729, + "grad_norm": 15.665361404418945, + "learning_rate": 5.5333333333333334e-05, + "loss": 1.244, + "step": 416 + }, + { + "epoch": 0.028253946744359375, + "grad_norm": 15.516046524047852, + "learning_rate": 5.546666666666667e-05, + "loss": 1.4957, + "step": 417 + }, + { + "epoch": 0.02832170201233146, + "grad_norm": 19.148746490478516, + "learning_rate": 5.560000000000001e-05, + "loss": 1.4346, + "step": 418 + }, + { + "epoch": 0.028389457280303544, + "grad_norm": 15.215538024902344, + "learning_rate": 5.573333333333334e-05, + "loss": 1.081, + "step": 419 + }, + { + "epoch": 0.02845721254827563, + "grad_norm": 15.612996101379395, + "learning_rate": 5.5866666666666665e-05, + "loss": 1.1671, + "step": 420 + }, + { + "epoch": 0.028524967816247714, + "grad_norm": 13.754039764404297, + "learning_rate": 5.6000000000000006e-05, + "loss": 1.2679, + "step": 421 + }, + { + "epoch": 0.028592723084219798, + "grad_norm": 16.350305557250977, + "learning_rate": 5.613333333333334e-05, + "loss": 1.3034, + "step": 422 + }, + { + "epoch": 0.028660478352191883, + "grad_norm": 16.560344696044922, + "learning_rate": 5.626666666666667e-05, + "loss": 1.3633, + "step": 423 + }, + { + "epoch": 0.028728233620163968, + "grad_norm": 20.391889572143555, + "learning_rate": 5.6399999999999995e-05, + "loss": 1.3533, + "step": 424 + }, + { + "epoch": 0.028795988888136052, + "grad_norm": 16.13326072692871, + "learning_rate": 5.6533333333333336e-05, + "loss": 1.3678, + "step": 425 + }, + { + "epoch": 0.028863744156108137, + "grad_norm": 21.544612884521484, + "learning_rate": 5.666666666666667e-05, + "loss": 1.6275, + "step": 426 + }, + { + "epoch": 0.028931499424080222, + "grad_norm": 17.097408294677734, + "learning_rate": 5.68e-05, + "loss": 1.6119, + "step": 427 + }, + { + "epoch": 0.028999254692052306, + "grad_norm": 15.699347496032715, + "learning_rate": 5.693333333333334e-05, + "loss": 1.2644, + "step": 428 + }, + { + "epoch": 0.02906700996002439, + "grad_norm": 16.643613815307617, + "learning_rate": 5.706666666666667e-05, + "loss": 1.3497, + "step": 429 + }, + { + "epoch": 0.029134765227996476, + "grad_norm": 20.941123962402344, + "learning_rate": 5.72e-05, + "loss": 1.4232, + "step": 430 + }, + { + "epoch": 0.02920252049596856, + "grad_norm": 17.549013137817383, + "learning_rate": 5.7333333333333336e-05, + "loss": 1.3429, + "step": 431 + }, + { + "epoch": 0.029270275763940645, + "grad_norm": 19.702617645263672, + "learning_rate": 5.746666666666667e-05, + "loss": 1.7924, + "step": 432 + }, + { + "epoch": 0.02933803103191273, + "grad_norm": 16.396209716796875, + "learning_rate": 5.76e-05, + "loss": 1.3296, + "step": 433 + }, + { + "epoch": 0.029405786299884815, + "grad_norm": 15.823278427124023, + "learning_rate": 5.773333333333334e-05, + "loss": 1.4228, + "step": 434 + }, + { + "epoch": 0.0294735415678569, + "grad_norm": 19.61952018737793, + "learning_rate": 5.7866666666666666e-05, + "loss": 1.3522, + "step": 435 + }, + { + "epoch": 0.029541296835828984, + "grad_norm": 14.721433639526367, + "learning_rate": 5.8e-05, + "loss": 1.284, + "step": 436 + }, + { + "epoch": 0.029609052103801072, + "grad_norm": 16.594276428222656, + "learning_rate": 5.813333333333334e-05, + "loss": 1.4636, + "step": 437 + }, + { + "epoch": 0.029676807371773157, + "grad_norm": 15.98005199432373, + "learning_rate": 5.826666666666667e-05, + "loss": 1.2825, + "step": 438 + }, + { + "epoch": 0.02974456263974524, + "grad_norm": 15.501729965209961, + "learning_rate": 5.8399999999999997e-05, + "loss": 1.3736, + "step": 439 + }, + { + "epoch": 0.029812317907717326, + "grad_norm": 18.077552795410156, + "learning_rate": 5.853333333333334e-05, + "loss": 1.4115, + "step": 440 + }, + { + "epoch": 0.02988007317568941, + "grad_norm": 16.016721725463867, + "learning_rate": 5.866666666666667e-05, + "loss": 1.3177, + "step": 441 + }, + { + "epoch": 0.029947828443661496, + "grad_norm": 16.39783477783203, + "learning_rate": 5.88e-05, + "loss": 1.4465, + "step": 442 + }, + { + "epoch": 0.03001558371163358, + "grad_norm": 13.970220565795898, + "learning_rate": 5.893333333333334e-05, + "loss": 1.2956, + "step": 443 + }, + { + "epoch": 0.030083338979605665, + "grad_norm": 16.5622615814209, + "learning_rate": 5.906666666666667e-05, + "loss": 1.2144, + "step": 444 + }, + { + "epoch": 0.03015109424757775, + "grad_norm": 18.191911697387695, + "learning_rate": 5.92e-05, + "loss": 1.623, + "step": 445 + }, + { + "epoch": 0.030218849515549834, + "grad_norm": 15.483613967895508, + "learning_rate": 5.9333333333333343e-05, + "loss": 1.3232, + "step": 446 + }, + { + "epoch": 0.03028660478352192, + "grad_norm": 15.633200645446777, + "learning_rate": 5.946666666666667e-05, + "loss": 1.544, + "step": 447 + }, + { + "epoch": 0.030354360051494004, + "grad_norm": 16.025054931640625, + "learning_rate": 5.96e-05, + "loss": 1.4837, + "step": 448 + }, + { + "epoch": 0.03042211531946609, + "grad_norm": 15.954922676086426, + "learning_rate": 5.973333333333334e-05, + "loss": 1.341, + "step": 449 + }, + { + "epoch": 0.030489870587438173, + "grad_norm": 14.852401733398438, + "learning_rate": 5.9866666666666674e-05, + "loss": 1.2346, + "step": 450 + }, + { + "epoch": 0.030557625855410258, + "grad_norm": 14.887676239013672, + "learning_rate": 6e-05, + "loss": 1.2789, + "step": 451 + }, + { + "epoch": 0.030625381123382343, + "grad_norm": 17.25469970703125, + "learning_rate": 6.013333333333334e-05, + "loss": 1.4446, + "step": 452 + }, + { + "epoch": 0.030693136391354427, + "grad_norm": 19.356597900390625, + "learning_rate": 6.026666666666667e-05, + "loss": 1.3765, + "step": 453 + }, + { + "epoch": 0.030760891659326512, + "grad_norm": 14.16335391998291, + "learning_rate": 6.04e-05, + "loss": 1.4213, + "step": 454 + }, + { + "epoch": 0.030828646927298597, + "grad_norm": 15.812528610229492, + "learning_rate": 6.053333333333333e-05, + "loss": 1.4028, + "step": 455 + }, + { + "epoch": 0.03089640219527068, + "grad_norm": 15.10727596282959, + "learning_rate": 6.066666666666667e-05, + "loss": 1.3841, + "step": 456 + }, + { + "epoch": 0.030964157463242766, + "grad_norm": 19.91744613647461, + "learning_rate": 6.08e-05, + "loss": 1.4706, + "step": 457 + }, + { + "epoch": 0.03103191273121485, + "grad_norm": 14.597813606262207, + "learning_rate": 6.093333333333333e-05, + "loss": 1.2428, + "step": 458 + }, + { + "epoch": 0.031099667999186936, + "grad_norm": 14.952363014221191, + "learning_rate": 6.106666666666667e-05, + "loss": 1.3431, + "step": 459 + }, + { + "epoch": 0.03116742326715902, + "grad_norm": 16.519468307495117, + "learning_rate": 6.12e-05, + "loss": 1.5399, + "step": 460 + }, + { + "epoch": 0.031235178535131105, + "grad_norm": 14.55786418914795, + "learning_rate": 6.133333333333334e-05, + "loss": 1.3844, + "step": 461 + }, + { + "epoch": 0.03130293380310319, + "grad_norm": 12.52665901184082, + "learning_rate": 6.146666666666668e-05, + "loss": 1.2085, + "step": 462 + }, + { + "epoch": 0.03137068907107528, + "grad_norm": 17.549148559570312, + "learning_rate": 6.16e-05, + "loss": 1.588, + "step": 463 + }, + { + "epoch": 0.03143844433904736, + "grad_norm": 14.761232376098633, + "learning_rate": 6.173333333333333e-05, + "loss": 1.1685, + "step": 464 + }, + { + "epoch": 0.03150619960701945, + "grad_norm": 13.1014404296875, + "learning_rate": 6.186666666666668e-05, + "loss": 1.1718, + "step": 465 + }, + { + "epoch": 0.03157395487499153, + "grad_norm": 15.998059272766113, + "learning_rate": 6.2e-05, + "loss": 1.3177, + "step": 466 + }, + { + "epoch": 0.03164171014296362, + "grad_norm": 16.402875900268555, + "learning_rate": 6.213333333333333e-05, + "loss": 1.4821, + "step": 467 + }, + { + "epoch": 0.0317094654109357, + "grad_norm": 12.871467590332031, + "learning_rate": 6.226666666666667e-05, + "loss": 1.0176, + "step": 468 + }, + { + "epoch": 0.031777220678907786, + "grad_norm": 15.602563858032227, + "learning_rate": 6.24e-05, + "loss": 1.3418, + "step": 469 + }, + { + "epoch": 0.03184497594687987, + "grad_norm": 13.5369234085083, + "learning_rate": 6.253333333333333e-05, + "loss": 1.4051, + "step": 470 + }, + { + "epoch": 0.031912731214851955, + "grad_norm": 16.841650009155273, + "learning_rate": 6.266666666666667e-05, + "loss": 1.5438, + "step": 471 + }, + { + "epoch": 0.03198048648282404, + "grad_norm": 14.581981658935547, + "learning_rate": 6.280000000000001e-05, + "loss": 1.2767, + "step": 472 + }, + { + "epoch": 0.032048241750796125, + "grad_norm": 14.084460258483887, + "learning_rate": 6.293333333333334e-05, + "loss": 1.2224, + "step": 473 + }, + { + "epoch": 0.03211599701876821, + "grad_norm": 15.54316234588623, + "learning_rate": 6.306666666666668e-05, + "loss": 1.6084, + "step": 474 + }, + { + "epoch": 0.032183752286740294, + "grad_norm": 15.498743057250977, + "learning_rate": 6.32e-05, + "loss": 1.3937, + "step": 475 + }, + { + "epoch": 0.03225150755471238, + "grad_norm": 17.728227615356445, + "learning_rate": 6.333333333333333e-05, + "loss": 1.2996, + "step": 476 + }, + { + "epoch": 0.032319262822684464, + "grad_norm": 13.956122398376465, + "learning_rate": 6.346666666666667e-05, + "loss": 1.4472, + "step": 477 + }, + { + "epoch": 0.03238701809065655, + "grad_norm": 18.578060150146484, + "learning_rate": 6.36e-05, + "loss": 1.4659, + "step": 478 + }, + { + "epoch": 0.03245477335862863, + "grad_norm": 14.055442810058594, + "learning_rate": 6.373333333333333e-05, + "loss": 1.1979, + "step": 479 + }, + { + "epoch": 0.03252252862660072, + "grad_norm": 15.902195930480957, + "learning_rate": 6.386666666666667e-05, + "loss": 1.3367, + "step": 480 + }, + { + "epoch": 0.0325902838945728, + "grad_norm": 14.212138175964355, + "learning_rate": 6.400000000000001e-05, + "loss": 1.2716, + "step": 481 + }, + { + "epoch": 0.03265803916254489, + "grad_norm": 17.711475372314453, + "learning_rate": 6.413333333333334e-05, + "loss": 1.3016, + "step": 482 + }, + { + "epoch": 0.03272579443051697, + "grad_norm": 13.068260192871094, + "learning_rate": 6.426666666666668e-05, + "loss": 1.1346, + "step": 483 + }, + { + "epoch": 0.032793549698489056, + "grad_norm": 17.422321319580078, + "learning_rate": 6.440000000000001e-05, + "loss": 1.0931, + "step": 484 + }, + { + "epoch": 0.03286130496646114, + "grad_norm": 14.98038101196289, + "learning_rate": 6.453333333333333e-05, + "loss": 1.22, + "step": 485 + }, + { + "epoch": 0.032929060234433226, + "grad_norm": 13.300479888916016, + "learning_rate": 6.466666666666666e-05, + "loss": 1.2647, + "step": 486 + }, + { + "epoch": 0.03299681550240531, + "grad_norm": 14.611360549926758, + "learning_rate": 6.48e-05, + "loss": 1.2478, + "step": 487 + }, + { + "epoch": 0.033064570770377395, + "grad_norm": 19.244020462036133, + "learning_rate": 6.493333333333333e-05, + "loss": 1.2126, + "step": 488 + }, + { + "epoch": 0.03313232603834948, + "grad_norm": 16.285396575927734, + "learning_rate": 6.506666666666666e-05, + "loss": 1.1694, + "step": 489 + }, + { + "epoch": 0.033200081306321565, + "grad_norm": 19.613059997558594, + "learning_rate": 6.52e-05, + "loss": 1.2118, + "step": 490 + }, + { + "epoch": 0.03326783657429365, + "grad_norm": 19.19630241394043, + "learning_rate": 6.533333333333334e-05, + "loss": 1.3093, + "step": 491 + }, + { + "epoch": 0.033335591842265734, + "grad_norm": 12.653651237487793, + "learning_rate": 6.546666666666667e-05, + "loss": 1.1545, + "step": 492 + }, + { + "epoch": 0.03340334711023782, + "grad_norm": 13.434443473815918, + "learning_rate": 6.560000000000001e-05, + "loss": 1.2543, + "step": 493 + }, + { + "epoch": 0.033471102378209903, + "grad_norm": 18.893598556518555, + "learning_rate": 6.573333333333334e-05, + "loss": 1.3632, + "step": 494 + }, + { + "epoch": 0.03353885764618199, + "grad_norm": 18.339479446411133, + "learning_rate": 6.586666666666666e-05, + "loss": 1.4369, + "step": 495 + }, + { + "epoch": 0.03360661291415407, + "grad_norm": 12.303078651428223, + "learning_rate": 6.6e-05, + "loss": 1.4032, + "step": 496 + }, + { + "epoch": 0.03367436818212616, + "grad_norm": 17.041015625, + "learning_rate": 6.613333333333333e-05, + "loss": 1.2395, + "step": 497 + }, + { + "epoch": 0.03374212345009824, + "grad_norm": 14.035640716552734, + "learning_rate": 6.626666666666666e-05, + "loss": 1.3541, + "step": 498 + }, + { + "epoch": 0.03380987871807033, + "grad_norm": 16.93412208557129, + "learning_rate": 6.64e-05, + "loss": 1.3503, + "step": 499 + }, + { + "epoch": 0.03387763398604241, + "grad_norm": 16.02039909362793, + "learning_rate": 6.653333333333334e-05, + "loss": 1.3226, + "step": 500 + }, + { + "epoch": 0.033945389254014496, + "grad_norm": 16.061542510986328, + "learning_rate": 6.666666666666667e-05, + "loss": 1.433, + "step": 501 + }, + { + "epoch": 0.03401314452198658, + "grad_norm": 14.810320854187012, + "learning_rate": 6.680000000000001e-05, + "loss": 1.3715, + "step": 502 + }, + { + "epoch": 0.03408089978995867, + "grad_norm": 15.132527351379395, + "learning_rate": 6.693333333333334e-05, + "loss": 1.0683, + "step": 503 + }, + { + "epoch": 0.03414865505793076, + "grad_norm": 13.768165588378906, + "learning_rate": 6.706666666666667e-05, + "loss": 1.234, + "step": 504 + }, + { + "epoch": 0.03421641032590284, + "grad_norm": 16.227920532226562, + "learning_rate": 6.720000000000001e-05, + "loss": 1.4181, + "step": 505 + }, + { + "epoch": 0.03428416559387493, + "grad_norm": 14.205772399902344, + "learning_rate": 6.733333333333333e-05, + "loss": 1.4342, + "step": 506 + }, + { + "epoch": 0.03435192086184701, + "grad_norm": 17.451099395751953, + "learning_rate": 6.746666666666666e-05, + "loss": 1.3338, + "step": 507 + }, + { + "epoch": 0.034419676129819096, + "grad_norm": 13.7041015625, + "learning_rate": 6.76e-05, + "loss": 1.3972, + "step": 508 + }, + { + "epoch": 0.03448743139779118, + "grad_norm": 13.206351280212402, + "learning_rate": 6.773333333333333e-05, + "loss": 1.4233, + "step": 509 + }, + { + "epoch": 0.034555186665763266, + "grad_norm": 11.867975234985352, + "learning_rate": 6.786666666666667e-05, + "loss": 1.111, + "step": 510 + }, + { + "epoch": 0.03462294193373535, + "grad_norm": 19.726648330688477, + "learning_rate": 6.800000000000001e-05, + "loss": 1.2918, + "step": 511 + }, + { + "epoch": 0.034690697201707435, + "grad_norm": 18.914016723632812, + "learning_rate": 6.813333333333334e-05, + "loss": 1.5998, + "step": 512 + }, + { + "epoch": 0.03475845246967952, + "grad_norm": 18.065937042236328, + "learning_rate": 6.826666666666667e-05, + "loss": 1.4952, + "step": 513 + }, + { + "epoch": 0.034826207737651604, + "grad_norm": 16.42376136779785, + "learning_rate": 6.840000000000001e-05, + "loss": 1.2807, + "step": 514 + }, + { + "epoch": 0.03489396300562369, + "grad_norm": 15.537372589111328, + "learning_rate": 6.853333333333334e-05, + "loss": 1.3524, + "step": 515 + }, + { + "epoch": 0.034961718273595774, + "grad_norm": 18.395238876342773, + "learning_rate": 6.866666666666666e-05, + "loss": 1.3022, + "step": 516 + }, + { + "epoch": 0.03502947354156786, + "grad_norm": 15.80531120300293, + "learning_rate": 6.879999999999999e-05, + "loss": 1.4201, + "step": 517 + }, + { + "epoch": 0.03509722880953994, + "grad_norm": 15.361504554748535, + "learning_rate": 6.893333333333333e-05, + "loss": 1.4351, + "step": 518 + }, + { + "epoch": 0.03516498407751203, + "grad_norm": 12.501819610595703, + "learning_rate": 6.906666666666667e-05, + "loss": 1.1894, + "step": 519 + }, + { + "epoch": 0.03523273934548411, + "grad_norm": 14.53589916229248, + "learning_rate": 6.92e-05, + "loss": 1.3435, + "step": 520 + }, + { + "epoch": 0.0353004946134562, + "grad_norm": 15.057633399963379, + "learning_rate": 6.933333333333334e-05, + "loss": 1.5467, + "step": 521 + }, + { + "epoch": 0.03536824988142828, + "grad_norm": 16.27166175842285, + "learning_rate": 6.946666666666667e-05, + "loss": 1.5171, + "step": 522 + }, + { + "epoch": 0.03543600514940037, + "grad_norm": 18.050413131713867, + "learning_rate": 6.96e-05, + "loss": 1.2246, + "step": 523 + }, + { + "epoch": 0.03550376041737245, + "grad_norm": 15.316112518310547, + "learning_rate": 6.973333333333334e-05, + "loss": 1.2551, + "step": 524 + }, + { + "epoch": 0.035571515685344536, + "grad_norm": 15.416853904724121, + "learning_rate": 6.986666666666667e-05, + "loss": 1.1895, + "step": 525 + }, + { + "epoch": 0.03563927095331662, + "grad_norm": 15.019004821777344, + "learning_rate": 7e-05, + "loss": 1.5957, + "step": 526 + }, + { + "epoch": 0.035707026221288705, + "grad_norm": 15.33927059173584, + "learning_rate": 7.013333333333333e-05, + "loss": 1.3536, + "step": 527 + }, + { + "epoch": 0.03577478148926079, + "grad_norm": 16.403501510620117, + "learning_rate": 7.026666666666668e-05, + "loss": 1.6104, + "step": 528 + }, + { + "epoch": 0.035842536757232875, + "grad_norm": 13.962714195251465, + "learning_rate": 7.04e-05, + "loss": 1.2122, + "step": 529 + }, + { + "epoch": 0.03591029202520496, + "grad_norm": 14.530926704406738, + "learning_rate": 7.053333333333334e-05, + "loss": 1.2879, + "step": 530 + }, + { + "epoch": 0.035978047293177044, + "grad_norm": 13.99305248260498, + "learning_rate": 7.066666666666667e-05, + "loss": 1.2063, + "step": 531 + }, + { + "epoch": 0.03604580256114913, + "grad_norm": 17.54342269897461, + "learning_rate": 7.08e-05, + "loss": 1.2823, + "step": 532 + }, + { + "epoch": 0.036113557829121214, + "grad_norm": 11.097670555114746, + "learning_rate": 7.093333333333334e-05, + "loss": 1.1658, + "step": 533 + }, + { + "epoch": 0.0361813130970933, + "grad_norm": 19.069889068603516, + "learning_rate": 7.106666666666667e-05, + "loss": 1.3368, + "step": 534 + }, + { + "epoch": 0.03624906836506538, + "grad_norm": 12.675487518310547, + "learning_rate": 7.12e-05, + "loss": 1.1586, + "step": 535 + }, + { + "epoch": 0.03631682363303747, + "grad_norm": 16.971637725830078, + "learning_rate": 7.133333333333334e-05, + "loss": 1.2458, + "step": 536 + }, + { + "epoch": 0.03638457890100955, + "grad_norm": 14.513422012329102, + "learning_rate": 7.146666666666666e-05, + "loss": 1.236, + "step": 537 + }, + { + "epoch": 0.03645233416898164, + "grad_norm": 14.271836280822754, + "learning_rate": 7.16e-05, + "loss": 1.0951, + "step": 538 + }, + { + "epoch": 0.03652008943695372, + "grad_norm": 14.286022186279297, + "learning_rate": 7.173333333333335e-05, + "loss": 1.2411, + "step": 539 + }, + { + "epoch": 0.03658784470492581, + "grad_norm": 16.81059455871582, + "learning_rate": 7.186666666666667e-05, + "loss": 1.2453, + "step": 540 + }, + { + "epoch": 0.03665559997289789, + "grad_norm": 16.818300247192383, + "learning_rate": 7.2e-05, + "loss": 1.2302, + "step": 541 + }, + { + "epoch": 0.036723355240869976, + "grad_norm": 14.839925765991211, + "learning_rate": 7.213333333333334e-05, + "loss": 1.3445, + "step": 542 + }, + { + "epoch": 0.03679111050884206, + "grad_norm": 14.917344093322754, + "learning_rate": 7.226666666666667e-05, + "loss": 1.3658, + "step": 543 + }, + { + "epoch": 0.036858865776814145, + "grad_norm": 15.310625076293945, + "learning_rate": 7.24e-05, + "loss": 1.2565, + "step": 544 + }, + { + "epoch": 0.03692662104478623, + "grad_norm": 16.7796630859375, + "learning_rate": 7.253333333333334e-05, + "loss": 1.265, + "step": 545 + }, + { + "epoch": 0.036994376312758315, + "grad_norm": 19.4224796295166, + "learning_rate": 7.266666666666667e-05, + "loss": 1.6152, + "step": 546 + }, + { + "epoch": 0.0370621315807304, + "grad_norm": 15.8001127243042, + "learning_rate": 7.280000000000001e-05, + "loss": 1.4313, + "step": 547 + }, + { + "epoch": 0.037129886848702484, + "grad_norm": 13.597784042358398, + "learning_rate": 7.293333333333334e-05, + "loss": 1.3812, + "step": 548 + }, + { + "epoch": 0.03719764211667457, + "grad_norm": 13.139778137207031, + "learning_rate": 7.306666666666668e-05, + "loss": 1.1888, + "step": 549 + }, + { + "epoch": 0.037265397384646654, + "grad_norm": 17.083406448364258, + "learning_rate": 7.32e-05, + "loss": 1.1668, + "step": 550 + }, + { + "epoch": 0.03733315265261874, + "grad_norm": 14.46076774597168, + "learning_rate": 7.333333333333333e-05, + "loss": 1.2394, + "step": 551 + }, + { + "epoch": 0.03740090792059082, + "grad_norm": 16.217782974243164, + "learning_rate": 7.346666666666667e-05, + "loss": 1.3068, + "step": 552 + }, + { + "epoch": 0.03746866318856291, + "grad_norm": 16.06308937072754, + "learning_rate": 7.36e-05, + "loss": 0.9678, + "step": 553 + }, + { + "epoch": 0.03753641845653499, + "grad_norm": 14.652907371520996, + "learning_rate": 7.373333333333333e-05, + "loss": 1.5375, + "step": 554 + }, + { + "epoch": 0.03760417372450708, + "grad_norm": 13.870722770690918, + "learning_rate": 7.386666666666667e-05, + "loss": 1.29, + "step": 555 + }, + { + "epoch": 0.03767192899247917, + "grad_norm": 14.649571418762207, + "learning_rate": 7.4e-05, + "loss": 1.3588, + "step": 556 + }, + { + "epoch": 0.03773968426045125, + "grad_norm": 13.698415756225586, + "learning_rate": 7.413333333333334e-05, + "loss": 1.0801, + "step": 557 + }, + { + "epoch": 0.03780743952842334, + "grad_norm": 19.412424087524414, + "learning_rate": 7.426666666666668e-05, + "loss": 1.4474, + "step": 558 + }, + { + "epoch": 0.03787519479639542, + "grad_norm": 18.58599853515625, + "learning_rate": 7.44e-05, + "loss": 1.3294, + "step": 559 + }, + { + "epoch": 0.03794295006436751, + "grad_norm": 15.283289909362793, + "learning_rate": 7.453333333333333e-05, + "loss": 1.1554, + "step": 560 + }, + { + "epoch": 0.03801070533233959, + "grad_norm": 16.289731979370117, + "learning_rate": 7.466666666666667e-05, + "loss": 1.222, + "step": 561 + }, + { + "epoch": 0.03807846060031168, + "grad_norm": 25.105520248413086, + "learning_rate": 7.48e-05, + "loss": 1.2213, + "step": 562 + }, + { + "epoch": 0.03814621586828376, + "grad_norm": 15.428693771362305, + "learning_rate": 7.493333333333333e-05, + "loss": 1.2165, + "step": 563 + }, + { + "epoch": 0.038213971136255846, + "grad_norm": 18.79145622253418, + "learning_rate": 7.506666666666667e-05, + "loss": 1.4799, + "step": 564 + }, + { + "epoch": 0.03828172640422793, + "grad_norm": 12.498862266540527, + "learning_rate": 7.52e-05, + "loss": 1.4348, + "step": 565 + }, + { + "epoch": 0.038349481672200016, + "grad_norm": 19.302845001220703, + "learning_rate": 7.533333333333334e-05, + "loss": 1.2656, + "step": 566 + }, + { + "epoch": 0.0384172369401721, + "grad_norm": 16.28862953186035, + "learning_rate": 7.546666666666668e-05, + "loss": 1.4779, + "step": 567 + }, + { + "epoch": 0.038484992208144185, + "grad_norm": 13.972156524658203, + "learning_rate": 7.560000000000001e-05, + "loss": 1.4775, + "step": 568 + }, + { + "epoch": 0.03855274747611627, + "grad_norm": 15.443683624267578, + "learning_rate": 7.573333333333334e-05, + "loss": 1.2671, + "step": 569 + }, + { + "epoch": 0.038620502744088354, + "grad_norm": 15.203113555908203, + "learning_rate": 7.586666666666668e-05, + "loss": 1.4209, + "step": 570 + }, + { + "epoch": 0.03868825801206044, + "grad_norm": 18.516462326049805, + "learning_rate": 7.6e-05, + "loss": 1.3315, + "step": 571 + }, + { + "epoch": 0.038756013280032524, + "grad_norm": 13.403026580810547, + "learning_rate": 7.613333333333333e-05, + "loss": 1.364, + "step": 572 + }, + { + "epoch": 0.03882376854800461, + "grad_norm": 12.339615821838379, + "learning_rate": 7.626666666666667e-05, + "loss": 1.2107, + "step": 573 + }, + { + "epoch": 0.03889152381597669, + "grad_norm": 15.647716522216797, + "learning_rate": 7.64e-05, + "loss": 1.3165, + "step": 574 + }, + { + "epoch": 0.03895927908394878, + "grad_norm": 15.854050636291504, + "learning_rate": 7.653333333333333e-05, + "loss": 1.517, + "step": 575 + }, + { + "epoch": 0.03902703435192086, + "grad_norm": 11.765929222106934, + "learning_rate": 7.666666666666667e-05, + "loss": 1.2676, + "step": 576 + }, + { + "epoch": 0.03909478961989295, + "grad_norm": 12.515352249145508, + "learning_rate": 7.680000000000001e-05, + "loss": 1.4472, + "step": 577 + }, + { + "epoch": 0.03916254488786503, + "grad_norm": 11.72417163848877, + "learning_rate": 7.693333333333334e-05, + "loss": 1.1874, + "step": 578 + }, + { + "epoch": 0.03923030015583712, + "grad_norm": 15.03148365020752, + "learning_rate": 7.706666666666668e-05, + "loss": 1.6381, + "step": 579 + }, + { + "epoch": 0.0392980554238092, + "grad_norm": 14.0188570022583, + "learning_rate": 7.72e-05, + "loss": 1.4351, + "step": 580 + }, + { + "epoch": 0.039365810691781286, + "grad_norm": 12.343233108520508, + "learning_rate": 7.733333333333333e-05, + "loss": 1.2293, + "step": 581 + }, + { + "epoch": 0.03943356595975337, + "grad_norm": 15.358945846557617, + "learning_rate": 7.746666666666666e-05, + "loss": 1.307, + "step": 582 + }, + { + "epoch": 0.039501321227725456, + "grad_norm": 14.131333351135254, + "learning_rate": 7.76e-05, + "loss": 1.0432, + "step": 583 + }, + { + "epoch": 0.03956907649569754, + "grad_norm": 16.46926498413086, + "learning_rate": 7.773333333333333e-05, + "loss": 1.4126, + "step": 584 + }, + { + "epoch": 0.039636831763669625, + "grad_norm": 15.209906578063965, + "learning_rate": 7.786666666666667e-05, + "loss": 1.3242, + "step": 585 + }, + { + "epoch": 0.03970458703164171, + "grad_norm": 15.657282829284668, + "learning_rate": 7.800000000000001e-05, + "loss": 1.4967, + "step": 586 + }, + { + "epoch": 0.039772342299613794, + "grad_norm": 11.466882705688477, + "learning_rate": 7.813333333333334e-05, + "loss": 1.3003, + "step": 587 + }, + { + "epoch": 0.03984009756758588, + "grad_norm": 19.529300689697266, + "learning_rate": 7.826666666666667e-05, + "loss": 1.3835, + "step": 588 + }, + { + "epoch": 0.039907852835557964, + "grad_norm": 17.22064971923828, + "learning_rate": 7.840000000000001e-05, + "loss": 1.2655, + "step": 589 + }, + { + "epoch": 0.03997560810353005, + "grad_norm": 14.788103103637695, + "learning_rate": 7.853333333333334e-05, + "loss": 1.1981, + "step": 590 + }, + { + "epoch": 0.04004336337150213, + "grad_norm": 13.705521583557129, + "learning_rate": 7.866666666666666e-05, + "loss": 1.2064, + "step": 591 + }, + { + "epoch": 0.04011111863947422, + "grad_norm": 14.901930809020996, + "learning_rate": 7.88e-05, + "loss": 1.2632, + "step": 592 + }, + { + "epoch": 0.0401788739074463, + "grad_norm": 18.520828247070312, + "learning_rate": 7.893333333333333e-05, + "loss": 1.4316, + "step": 593 + }, + { + "epoch": 0.04024662917541839, + "grad_norm": 15.101353645324707, + "learning_rate": 7.906666666666667e-05, + "loss": 1.3935, + "step": 594 + }, + { + "epoch": 0.04031438444339047, + "grad_norm": 13.961483001708984, + "learning_rate": 7.920000000000001e-05, + "loss": 1.1477, + "step": 595 + }, + { + "epoch": 0.04038213971136256, + "grad_norm": 16.015554428100586, + "learning_rate": 7.933333333333334e-05, + "loss": 1.5029, + "step": 596 + }, + { + "epoch": 0.04044989497933464, + "grad_norm": 15.009637832641602, + "learning_rate": 7.946666666666667e-05, + "loss": 1.3715, + "step": 597 + }, + { + "epoch": 0.040517650247306726, + "grad_norm": 16.295202255249023, + "learning_rate": 7.960000000000001e-05, + "loss": 1.5129, + "step": 598 + }, + { + "epoch": 0.04058540551527881, + "grad_norm": 88.3338623046875, + "learning_rate": 7.973333333333334e-05, + "loss": 1.2395, + "step": 599 + }, + { + "epoch": 0.040653160783250895, + "grad_norm": 16.769424438476562, + "learning_rate": 7.986666666666667e-05, + "loss": 1.4258, + "step": 600 + }, + { + "epoch": 0.04072091605122298, + "grad_norm": 16.044578552246094, + "learning_rate": 8e-05, + "loss": 1.365, + "step": 601 + }, + { + "epoch": 0.040788671319195065, + "grad_norm": 15.282588958740234, + "learning_rate": 8.013333333333333e-05, + "loss": 1.3212, + "step": 602 + }, + { + "epoch": 0.04085642658716715, + "grad_norm": 15.275490760803223, + "learning_rate": 8.026666666666666e-05, + "loss": 1.6048, + "step": 603 + }, + { + "epoch": 0.040924181855139234, + "grad_norm": 12.952759742736816, + "learning_rate": 8.04e-05, + "loss": 1.1982, + "step": 604 + }, + { + "epoch": 0.04099193712311132, + "grad_norm": 16.79343032836914, + "learning_rate": 8.053333333333334e-05, + "loss": 1.281, + "step": 605 + }, + { + "epoch": 0.041059692391083404, + "grad_norm": 13.70421314239502, + "learning_rate": 8.066666666666667e-05, + "loss": 1.2664, + "step": 606 + }, + { + "epoch": 0.04112744765905549, + "grad_norm": 14.420117378234863, + "learning_rate": 8.080000000000001e-05, + "loss": 1.2758, + "step": 607 + }, + { + "epoch": 0.04119520292702757, + "grad_norm": 16.421335220336914, + "learning_rate": 8.093333333333334e-05, + "loss": 1.3105, + "step": 608 + }, + { + "epoch": 0.041262958194999665, + "grad_norm": 16.96218490600586, + "learning_rate": 8.106666666666667e-05, + "loss": 1.4665, + "step": 609 + }, + { + "epoch": 0.04133071346297175, + "grad_norm": 13.73725700378418, + "learning_rate": 8.120000000000001e-05, + "loss": 1.2109, + "step": 610 + }, + { + "epoch": 0.041398468730943834, + "grad_norm": 14.718022346496582, + "learning_rate": 8.133333333333334e-05, + "loss": 1.2051, + "step": 611 + }, + { + "epoch": 0.04146622399891592, + "grad_norm": 14.119061470031738, + "learning_rate": 8.146666666666666e-05, + "loss": 1.0731, + "step": 612 + }, + { + "epoch": 0.041533979266888, + "grad_norm": 17.90053367614746, + "learning_rate": 8.16e-05, + "loss": 1.3804, + "step": 613 + }, + { + "epoch": 0.04160173453486009, + "grad_norm": 12.727055549621582, + "learning_rate": 8.173333333333335e-05, + "loss": 1.0926, + "step": 614 + }, + { + "epoch": 0.04166948980283217, + "grad_norm": 14.953054428100586, + "learning_rate": 8.186666666666667e-05, + "loss": 1.1974, + "step": 615 + }, + { + "epoch": 0.04173724507080426, + "grad_norm": 16.05322265625, + "learning_rate": 8.2e-05, + "loss": 1.2526, + "step": 616 + }, + { + "epoch": 0.04180500033877634, + "grad_norm": 15.028168678283691, + "learning_rate": 8.213333333333334e-05, + "loss": 1.3593, + "step": 617 + }, + { + "epoch": 0.04187275560674843, + "grad_norm": 13.127458572387695, + "learning_rate": 8.226666666666667e-05, + "loss": 1.3981, + "step": 618 + }, + { + "epoch": 0.04194051087472051, + "grad_norm": 17.323017120361328, + "learning_rate": 8.24e-05, + "loss": 1.4622, + "step": 619 + }, + { + "epoch": 0.042008266142692596, + "grad_norm": 16.106731414794922, + "learning_rate": 8.253333333333334e-05, + "loss": 1.257, + "step": 620 + }, + { + "epoch": 0.04207602141066468, + "grad_norm": 13.039103507995605, + "learning_rate": 8.266666666666667e-05, + "loss": 1.2905, + "step": 621 + }, + { + "epoch": 0.042143776678636766, + "grad_norm": 13.661933898925781, + "learning_rate": 8.28e-05, + "loss": 1.2779, + "step": 622 + }, + { + "epoch": 0.04221153194660885, + "grad_norm": 17.325756072998047, + "learning_rate": 8.293333333333333e-05, + "loss": 1.3934, + "step": 623 + }, + { + "epoch": 0.042279287214580935, + "grad_norm": 13.898777961730957, + "learning_rate": 8.306666666666668e-05, + "loss": 1.3735, + "step": 624 + }, + { + "epoch": 0.04234704248255302, + "grad_norm": 16.787601470947266, + "learning_rate": 8.32e-05, + "loss": 1.5228, + "step": 625 + }, + { + "epoch": 0.042414797750525104, + "grad_norm": 13.54299259185791, + "learning_rate": 8.333333333333334e-05, + "loss": 1.0538, + "step": 626 + }, + { + "epoch": 0.04248255301849719, + "grad_norm": 14.418194770812988, + "learning_rate": 8.346666666666667e-05, + "loss": 1.2932, + "step": 627 + }, + { + "epoch": 0.042550308286469274, + "grad_norm": 13.39255142211914, + "learning_rate": 8.36e-05, + "loss": 1.1989, + "step": 628 + }, + { + "epoch": 0.04261806355444136, + "grad_norm": 15.445747375488281, + "learning_rate": 8.373333333333334e-05, + "loss": 1.3557, + "step": 629 + }, + { + "epoch": 0.04268581882241344, + "grad_norm": 13.414338111877441, + "learning_rate": 8.386666666666667e-05, + "loss": 1.1729, + "step": 630 + }, + { + "epoch": 0.04275357409038553, + "grad_norm": 16.649137496948242, + "learning_rate": 8.4e-05, + "loss": 1.4212, + "step": 631 + }, + { + "epoch": 0.04282132935835761, + "grad_norm": 17.583528518676758, + "learning_rate": 8.413333333333334e-05, + "loss": 1.4838, + "step": 632 + }, + { + "epoch": 0.0428890846263297, + "grad_norm": 19.32307243347168, + "learning_rate": 8.426666666666668e-05, + "loss": 1.5958, + "step": 633 + }, + { + "epoch": 0.04295683989430178, + "grad_norm": 12.703327178955078, + "learning_rate": 8.44e-05, + "loss": 1.2752, + "step": 634 + }, + { + "epoch": 0.04302459516227387, + "grad_norm": 15.72768497467041, + "learning_rate": 8.453333333333335e-05, + "loss": 1.4059, + "step": 635 + }, + { + "epoch": 0.04309235043024595, + "grad_norm": 13.532344818115234, + "learning_rate": 8.466666666666667e-05, + "loss": 1.2252, + "step": 636 + }, + { + "epoch": 0.043160105698218036, + "grad_norm": 13.305481910705566, + "learning_rate": 8.48e-05, + "loss": 1.5084, + "step": 637 + }, + { + "epoch": 0.04322786096619012, + "grad_norm": 11.986043930053711, + "learning_rate": 8.493333333333334e-05, + "loss": 1.2006, + "step": 638 + }, + { + "epoch": 0.043295616234162206, + "grad_norm": 13.28003978729248, + "learning_rate": 8.506666666666667e-05, + "loss": 1.2442, + "step": 639 + }, + { + "epoch": 0.04336337150213429, + "grad_norm": 15.835545539855957, + "learning_rate": 8.52e-05, + "loss": 1.2466, + "step": 640 + }, + { + "epoch": 0.043431126770106375, + "grad_norm": 14.887584686279297, + "learning_rate": 8.533333333333334e-05, + "loss": 1.3828, + "step": 641 + }, + { + "epoch": 0.04349888203807846, + "grad_norm": 14.740251541137695, + "learning_rate": 8.546666666666667e-05, + "loss": 1.4847, + "step": 642 + }, + { + "epoch": 0.043566637306050544, + "grad_norm": 14.109213829040527, + "learning_rate": 8.560000000000001e-05, + "loss": 1.2251, + "step": 643 + }, + { + "epoch": 0.04363439257402263, + "grad_norm": 17.33670425415039, + "learning_rate": 8.573333333333333e-05, + "loss": 1.3817, + "step": 644 + }, + { + "epoch": 0.043702147841994714, + "grad_norm": 16.92241096496582, + "learning_rate": 8.586666666666668e-05, + "loss": 1.458, + "step": 645 + }, + { + "epoch": 0.0437699031099668, + "grad_norm": 15.369187355041504, + "learning_rate": 8.6e-05, + "loss": 1.28, + "step": 646 + }, + { + "epoch": 0.04383765837793888, + "grad_norm": 18.748065948486328, + "learning_rate": 8.613333333333333e-05, + "loss": 1.6275, + "step": 647 + }, + { + "epoch": 0.04390541364591097, + "grad_norm": 15.650605201721191, + "learning_rate": 8.626666666666667e-05, + "loss": 1.3742, + "step": 648 + }, + { + "epoch": 0.04397316891388305, + "grad_norm": 16.170730590820312, + "learning_rate": 8.64e-05, + "loss": 1.174, + "step": 649 + }, + { + "epoch": 0.04404092418185514, + "grad_norm": 12.61952018737793, + "learning_rate": 8.653333333333333e-05, + "loss": 1.2178, + "step": 650 + }, + { + "epoch": 0.04410867944982722, + "grad_norm": 18.86512565612793, + "learning_rate": 8.666666666666667e-05, + "loss": 1.4993, + "step": 651 + }, + { + "epoch": 0.04417643471779931, + "grad_norm": 16.412899017333984, + "learning_rate": 8.680000000000001e-05, + "loss": 1.2663, + "step": 652 + }, + { + "epoch": 0.04424418998577139, + "grad_norm": 14.11953353881836, + "learning_rate": 8.693333333333334e-05, + "loss": 1.3772, + "step": 653 + }, + { + "epoch": 0.044311945253743476, + "grad_norm": 17.232633590698242, + "learning_rate": 8.706666666666668e-05, + "loss": 1.3252, + "step": 654 + }, + { + "epoch": 0.04437970052171556, + "grad_norm": 13.833864212036133, + "learning_rate": 8.72e-05, + "loss": 1.279, + "step": 655 + }, + { + "epoch": 0.044447455789687645, + "grad_norm": 11.546829223632812, + "learning_rate": 8.733333333333333e-05, + "loss": 1.382, + "step": 656 + }, + { + "epoch": 0.04451521105765973, + "grad_norm": 13.163644790649414, + "learning_rate": 8.746666666666667e-05, + "loss": 1.2253, + "step": 657 + }, + { + "epoch": 0.044582966325631815, + "grad_norm": 17.037311553955078, + "learning_rate": 8.76e-05, + "loss": 1.2727, + "step": 658 + }, + { + "epoch": 0.0446507215936039, + "grad_norm": 14.610177040100098, + "learning_rate": 8.773333333333333e-05, + "loss": 1.3082, + "step": 659 + }, + { + "epoch": 0.044718476861575984, + "grad_norm": 16.305557250976562, + "learning_rate": 8.786666666666667e-05, + "loss": 1.3948, + "step": 660 + }, + { + "epoch": 0.04478623212954807, + "grad_norm": 13.207799911499023, + "learning_rate": 8.800000000000001e-05, + "loss": 1.2764, + "step": 661 + }, + { + "epoch": 0.04485398739752016, + "grad_norm": 11.451075553894043, + "learning_rate": 8.813333333333334e-05, + "loss": 1.0831, + "step": 662 + }, + { + "epoch": 0.044921742665492245, + "grad_norm": 13.555370330810547, + "learning_rate": 8.826666666666668e-05, + "loss": 1.4996, + "step": 663 + }, + { + "epoch": 0.04498949793346433, + "grad_norm": 13.544769287109375, + "learning_rate": 8.840000000000001e-05, + "loss": 1.2277, + "step": 664 + }, + { + "epoch": 0.045057253201436415, + "grad_norm": 18.05879783630371, + "learning_rate": 8.853333333333333e-05, + "loss": 1.3964, + "step": 665 + }, + { + "epoch": 0.0451250084694085, + "grad_norm": 17.309839248657227, + "learning_rate": 8.866666666666668e-05, + "loss": 1.6233, + "step": 666 + }, + { + "epoch": 0.045192763737380584, + "grad_norm": 12.732510566711426, + "learning_rate": 8.88e-05, + "loss": 1.3237, + "step": 667 + }, + { + "epoch": 0.04526051900535267, + "grad_norm": 13.541101455688477, + "learning_rate": 8.893333333333333e-05, + "loss": 1.2523, + "step": 668 + }, + { + "epoch": 0.04532827427332475, + "grad_norm": 17.54905891418457, + "learning_rate": 8.906666666666667e-05, + "loss": 1.5824, + "step": 669 + }, + { + "epoch": 0.04539602954129684, + "grad_norm": 12.52578353881836, + "learning_rate": 8.92e-05, + "loss": 1.2966, + "step": 670 + }, + { + "epoch": 0.04546378480926892, + "grad_norm": 13.279097557067871, + "learning_rate": 8.933333333333334e-05, + "loss": 1.4898, + "step": 671 + }, + { + "epoch": 0.04553154007724101, + "grad_norm": 15.892850875854492, + "learning_rate": 8.946666666666668e-05, + "loss": 1.3757, + "step": 672 + }, + { + "epoch": 0.04559929534521309, + "grad_norm": 14.108098983764648, + "learning_rate": 8.960000000000001e-05, + "loss": 1.1758, + "step": 673 + }, + { + "epoch": 0.04566705061318518, + "grad_norm": 17.15204429626465, + "learning_rate": 8.973333333333334e-05, + "loss": 1.3876, + "step": 674 + }, + { + "epoch": 0.04573480588115726, + "grad_norm": 14.453113555908203, + "learning_rate": 8.986666666666666e-05, + "loss": 1.2152, + "step": 675 + }, + { + "epoch": 0.045802561149129346, + "grad_norm": 17.9672794342041, + "learning_rate": 9e-05, + "loss": 1.2962, + "step": 676 + }, + { + "epoch": 0.04587031641710143, + "grad_norm": 19.810890197753906, + "learning_rate": 9.013333333333333e-05, + "loss": 1.3374, + "step": 677 + }, + { + "epoch": 0.045938071685073516, + "grad_norm": 16.13353729248047, + "learning_rate": 9.026666666666666e-05, + "loss": 1.4125, + "step": 678 + }, + { + "epoch": 0.0460058269530456, + "grad_norm": 15.257608413696289, + "learning_rate": 9.04e-05, + "loss": 1.1931, + "step": 679 + }, + { + "epoch": 0.046073582221017685, + "grad_norm": 16.88699722290039, + "learning_rate": 9.053333333333334e-05, + "loss": 1.362, + "step": 680 + }, + { + "epoch": 0.04614133748898977, + "grad_norm": 15.46777057647705, + "learning_rate": 9.066666666666667e-05, + "loss": 1.3522, + "step": 681 + }, + { + "epoch": 0.046209092756961855, + "grad_norm": 13.584056854248047, + "learning_rate": 9.080000000000001e-05, + "loss": 1.1998, + "step": 682 + }, + { + "epoch": 0.04627684802493394, + "grad_norm": 14.226449966430664, + "learning_rate": 9.093333333333334e-05, + "loss": 1.3312, + "step": 683 + }, + { + "epoch": 0.046344603292906024, + "grad_norm": 15.157097816467285, + "learning_rate": 9.106666666666667e-05, + "loss": 1.3422, + "step": 684 + }, + { + "epoch": 0.04641235856087811, + "grad_norm": 14.748275756835938, + "learning_rate": 9.120000000000001e-05, + "loss": 1.2902, + "step": 685 + }, + { + "epoch": 0.04648011382885019, + "grad_norm": 13.730619430541992, + "learning_rate": 9.133333333333334e-05, + "loss": 1.3034, + "step": 686 + }, + { + "epoch": 0.04654786909682228, + "grad_norm": 12.804062843322754, + "learning_rate": 9.146666666666666e-05, + "loss": 1.1484, + "step": 687 + }, + { + "epoch": 0.04661562436479436, + "grad_norm": 16.587923049926758, + "learning_rate": 9.16e-05, + "loss": 1.4875, + "step": 688 + }, + { + "epoch": 0.04668337963276645, + "grad_norm": 12.228131294250488, + "learning_rate": 9.173333333333333e-05, + "loss": 0.917, + "step": 689 + }, + { + "epoch": 0.04675113490073853, + "grad_norm": 18.680187225341797, + "learning_rate": 9.186666666666667e-05, + "loss": 1.497, + "step": 690 + }, + { + "epoch": 0.04681889016871062, + "grad_norm": 14.80630111694336, + "learning_rate": 9.200000000000001e-05, + "loss": 1.5623, + "step": 691 + }, + { + "epoch": 0.0468866454366827, + "grad_norm": 13.754642486572266, + "learning_rate": 9.213333333333334e-05, + "loss": 1.3921, + "step": 692 + }, + { + "epoch": 0.046954400704654786, + "grad_norm": 14.264236450195312, + "learning_rate": 9.226666666666667e-05, + "loss": 1.3615, + "step": 693 + }, + { + "epoch": 0.04702215597262687, + "grad_norm": 15.883113861083984, + "learning_rate": 9.240000000000001e-05, + "loss": 1.4528, + "step": 694 + }, + { + "epoch": 0.047089911240598956, + "grad_norm": 13.528610229492188, + "learning_rate": 9.253333333333334e-05, + "loss": 1.0995, + "step": 695 + }, + { + "epoch": 0.04715766650857104, + "grad_norm": 15.945343971252441, + "learning_rate": 9.266666666666666e-05, + "loss": 1.272, + "step": 696 + }, + { + "epoch": 0.047225421776543125, + "grad_norm": 14.378050804138184, + "learning_rate": 9.28e-05, + "loss": 1.3545, + "step": 697 + }, + { + "epoch": 0.04729317704451521, + "grad_norm": 13.457077980041504, + "learning_rate": 9.293333333333333e-05, + "loss": 1.1751, + "step": 698 + }, + { + "epoch": 0.047360932312487294, + "grad_norm": 19.285078048706055, + "learning_rate": 9.306666666666667e-05, + "loss": 1.4383, + "step": 699 + }, + { + "epoch": 0.04742868758045938, + "grad_norm": 16.683856964111328, + "learning_rate": 9.320000000000002e-05, + "loss": 1.2499, + "step": 700 + }, + { + "epoch": 0.047496442848431464, + "grad_norm": 13.818337440490723, + "learning_rate": 9.333333333333334e-05, + "loss": 1.2325, + "step": 701 + }, + { + "epoch": 0.04756419811640355, + "grad_norm": 12.51142406463623, + "learning_rate": 9.346666666666667e-05, + "loss": 1.3317, + "step": 702 + }, + { + "epoch": 0.04763195338437563, + "grad_norm": 14.697171211242676, + "learning_rate": 9.360000000000001e-05, + "loss": 1.6055, + "step": 703 + }, + { + "epoch": 0.04769970865234772, + "grad_norm": 16.942562103271484, + "learning_rate": 9.373333333333334e-05, + "loss": 1.4279, + "step": 704 + }, + { + "epoch": 0.0477674639203198, + "grad_norm": 16.739248275756836, + "learning_rate": 9.386666666666667e-05, + "loss": 1.2285, + "step": 705 + }, + { + "epoch": 0.04783521918829189, + "grad_norm": 14.479548454284668, + "learning_rate": 9.4e-05, + "loss": 1.1984, + "step": 706 + }, + { + "epoch": 0.04790297445626397, + "grad_norm": 14.384824752807617, + "learning_rate": 9.413333333333334e-05, + "loss": 1.2128, + "step": 707 + }, + { + "epoch": 0.04797072972423606, + "grad_norm": 15.41884994506836, + "learning_rate": 9.426666666666666e-05, + "loss": 1.1627, + "step": 708 + }, + { + "epoch": 0.04803848499220814, + "grad_norm": 13.267902374267578, + "learning_rate": 9.44e-05, + "loss": 1.2916, + "step": 709 + }, + { + "epoch": 0.048106240260180226, + "grad_norm": 14.909900665283203, + "learning_rate": 9.453333333333335e-05, + "loss": 1.1579, + "step": 710 + }, + { + "epoch": 0.04817399552815231, + "grad_norm": 12.639839172363281, + "learning_rate": 9.466666666666667e-05, + "loss": 1.2053, + "step": 711 + }, + { + "epoch": 0.048241750796124395, + "grad_norm": 14.957659721374512, + "learning_rate": 9.48e-05, + "loss": 1.4556, + "step": 712 + }, + { + "epoch": 0.04830950606409648, + "grad_norm": 14.008201599121094, + "learning_rate": 9.493333333333334e-05, + "loss": 1.1575, + "step": 713 + }, + { + "epoch": 0.048377261332068565, + "grad_norm": 13.857501983642578, + "learning_rate": 9.506666666666667e-05, + "loss": 1.4121, + "step": 714 + }, + { + "epoch": 0.048445016600040657, + "grad_norm": 13.867570877075195, + "learning_rate": 9.52e-05, + "loss": 1.4908, + "step": 715 + }, + { + "epoch": 0.04851277186801274, + "grad_norm": 14.3545503616333, + "learning_rate": 9.533333333333334e-05, + "loss": 1.2939, + "step": 716 + }, + { + "epoch": 0.048580527135984826, + "grad_norm": 15.173371315002441, + "learning_rate": 9.546666666666667e-05, + "loss": 1.3912, + "step": 717 + }, + { + "epoch": 0.04864828240395691, + "grad_norm": 13.799453735351562, + "learning_rate": 9.56e-05, + "loss": 1.4219, + "step": 718 + }, + { + "epoch": 0.048716037671928995, + "grad_norm": 18.264965057373047, + "learning_rate": 9.573333333333335e-05, + "loss": 1.4967, + "step": 719 + }, + { + "epoch": 0.04878379293990108, + "grad_norm": 13.809319496154785, + "learning_rate": 9.586666666666667e-05, + "loss": 1.3527, + "step": 720 + }, + { + "epoch": 0.048851548207873165, + "grad_norm": 13.931517601013184, + "learning_rate": 9.6e-05, + "loss": 1.4018, + "step": 721 + }, + { + "epoch": 0.04891930347584525, + "grad_norm": 11.74387264251709, + "learning_rate": 9.613333333333334e-05, + "loss": 1.1981, + "step": 722 + }, + { + "epoch": 0.048987058743817334, + "grad_norm": 14.960589408874512, + "learning_rate": 9.626666666666667e-05, + "loss": 1.3373, + "step": 723 + }, + { + "epoch": 0.04905481401178942, + "grad_norm": 14.170279502868652, + "learning_rate": 9.64e-05, + "loss": 1.2702, + "step": 724 + }, + { + "epoch": 0.049122569279761504, + "grad_norm": 15.532767295837402, + "learning_rate": 9.653333333333334e-05, + "loss": 1.1329, + "step": 725 + }, + { + "epoch": 0.04919032454773359, + "grad_norm": 12.141284942626953, + "learning_rate": 9.666666666666667e-05, + "loss": 1.2694, + "step": 726 + }, + { + "epoch": 0.04925807981570567, + "grad_norm": 12.379233360290527, + "learning_rate": 9.680000000000001e-05, + "loss": 1.1545, + "step": 727 + }, + { + "epoch": 0.04932583508367776, + "grad_norm": 14.45605182647705, + "learning_rate": 9.693333333333335e-05, + "loss": 1.4151, + "step": 728 + }, + { + "epoch": 0.04939359035164984, + "grad_norm": 15.656912803649902, + "learning_rate": 9.706666666666668e-05, + "loss": 1.3782, + "step": 729 + }, + { + "epoch": 0.04946134561962193, + "grad_norm": 13.992748260498047, + "learning_rate": 9.72e-05, + "loss": 1.2731, + "step": 730 + }, + { + "epoch": 0.04952910088759401, + "grad_norm": 16.253517150878906, + "learning_rate": 9.733333333333335e-05, + "loss": 1.4503, + "step": 731 + }, + { + "epoch": 0.049596856155566096, + "grad_norm": 14.88161563873291, + "learning_rate": 9.746666666666667e-05, + "loss": 1.3909, + "step": 732 + }, + { + "epoch": 0.04966461142353818, + "grad_norm": 14.60545539855957, + "learning_rate": 9.76e-05, + "loss": 1.1803, + "step": 733 + }, + { + "epoch": 0.049732366691510266, + "grad_norm": 14.589803695678711, + "learning_rate": 9.773333333333334e-05, + "loss": 1.3844, + "step": 734 + }, + { + "epoch": 0.04980012195948235, + "grad_norm": 14.781076431274414, + "learning_rate": 9.786666666666667e-05, + "loss": 1.2706, + "step": 735 + }, + { + "epoch": 0.049867877227454435, + "grad_norm": 13.717072486877441, + "learning_rate": 9.8e-05, + "loss": 1.2311, + "step": 736 + }, + { + "epoch": 0.04993563249542652, + "grad_norm": 17.14280891418457, + "learning_rate": 9.813333333333334e-05, + "loss": 1.3423, + "step": 737 + }, + { + "epoch": 0.050003387763398605, + "grad_norm": 14.521373748779297, + "learning_rate": 9.826666666666668e-05, + "loss": 1.3118, + "step": 738 + }, + { + "epoch": 0.05007114303137069, + "grad_norm": 14.79515266418457, + "learning_rate": 9.84e-05, + "loss": 1.3622, + "step": 739 + }, + { + "epoch": 0.050138898299342774, + "grad_norm": 13.63962459564209, + "learning_rate": 9.853333333333333e-05, + "loss": 1.3856, + "step": 740 + }, + { + "epoch": 0.05020665356731486, + "grad_norm": 13.788541793823242, + "learning_rate": 9.866666666666668e-05, + "loss": 1.3439, + "step": 741 + }, + { + "epoch": 0.05027440883528694, + "grad_norm": 14.015862464904785, + "learning_rate": 9.88e-05, + "loss": 1.3263, + "step": 742 + }, + { + "epoch": 0.05034216410325903, + "grad_norm": 15.407167434692383, + "learning_rate": 9.893333333333333e-05, + "loss": 1.5115, + "step": 743 + }, + { + "epoch": 0.05040991937123111, + "grad_norm": 14.542003631591797, + "learning_rate": 9.906666666666667e-05, + "loss": 1.3934, + "step": 744 + }, + { + "epoch": 0.0504776746392032, + "grad_norm": 16.733686447143555, + "learning_rate": 9.92e-05, + "loss": 1.2098, + "step": 745 + }, + { + "epoch": 0.05054542990717528, + "grad_norm": 14.979568481445312, + "learning_rate": 9.933333333333334e-05, + "loss": 1.3234, + "step": 746 + }, + { + "epoch": 0.05061318517514737, + "grad_norm": 13.699801445007324, + "learning_rate": 9.946666666666668e-05, + "loss": 1.3173, + "step": 747 + }, + { + "epoch": 0.05068094044311945, + "grad_norm": 13.262088775634766, + "learning_rate": 9.960000000000001e-05, + "loss": 1.3459, + "step": 748 + }, + { + "epoch": 0.050748695711091536, + "grad_norm": 14.694448471069336, + "learning_rate": 9.973333333333334e-05, + "loss": 1.3639, + "step": 749 + }, + { + "epoch": 0.05081645097906362, + "grad_norm": 13.999008178710938, + "learning_rate": 9.986666666666668e-05, + "loss": 1.4275, + "step": 750 + }, + { + "epoch": 0.050884206247035706, + "grad_norm": 11.588037490844727, + "learning_rate": 0.0001, + "loss": 1.2866, + "step": 751 + }, + { + "epoch": 0.05095196151500779, + "grad_norm": 16.174110412597656, + "learning_rate": 9.999863098090219e-05, + "loss": 1.5338, + "step": 752 + }, + { + "epoch": 0.051019716782979875, + "grad_norm": 17.8669376373291, + "learning_rate": 9.999726196180437e-05, + "loss": 1.3247, + "step": 753 + }, + { + "epoch": 0.05108747205095196, + "grad_norm": 13.849075317382812, + "learning_rate": 9.999589294270656e-05, + "loss": 1.1817, + "step": 754 + }, + { + "epoch": 0.051155227318924044, + "grad_norm": 14.888331413269043, + "learning_rate": 9.999452392360874e-05, + "loss": 1.4019, + "step": 755 + }, + { + "epoch": 0.05122298258689613, + "grad_norm": 18.68206024169922, + "learning_rate": 9.999315490451092e-05, + "loss": 1.7115, + "step": 756 + }, + { + "epoch": 0.051290737854868214, + "grad_norm": 14.762079238891602, + "learning_rate": 9.99917858854131e-05, + "loss": 1.2553, + "step": 757 + }, + { + "epoch": 0.0513584931228403, + "grad_norm": 14.649972915649414, + "learning_rate": 9.99904168663153e-05, + "loss": 1.0807, + "step": 758 + }, + { + "epoch": 0.05142624839081238, + "grad_norm": 13.172977447509766, + "learning_rate": 9.998904784721747e-05, + "loss": 1.4966, + "step": 759 + }, + { + "epoch": 0.05149400365878447, + "grad_norm": 14.144796371459961, + "learning_rate": 9.998767882811965e-05, + "loss": 1.234, + "step": 760 + }, + { + "epoch": 0.05156175892675655, + "grad_norm": 17.565507888793945, + "learning_rate": 9.998630980902184e-05, + "loss": 1.4061, + "step": 761 + }, + { + "epoch": 0.05162951419472864, + "grad_norm": 17.205589294433594, + "learning_rate": 9.998494078992402e-05, + "loss": 1.6626, + "step": 762 + }, + { + "epoch": 0.05169726946270072, + "grad_norm": 16.14542007446289, + "learning_rate": 9.998357177082621e-05, + "loss": 1.3145, + "step": 763 + }, + { + "epoch": 0.05176502473067281, + "grad_norm": 14.692976951599121, + "learning_rate": 9.998220275172839e-05, + "loss": 1.2085, + "step": 764 + }, + { + "epoch": 0.05183277999864489, + "grad_norm": 12.572774887084961, + "learning_rate": 9.998083373263057e-05, + "loss": 1.3743, + "step": 765 + }, + { + "epoch": 0.051900535266616976, + "grad_norm": 15.904753684997559, + "learning_rate": 9.997946471353275e-05, + "loss": 1.3863, + "step": 766 + }, + { + "epoch": 0.05196829053458906, + "grad_norm": 14.471494674682617, + "learning_rate": 9.997809569443493e-05, + "loss": 1.3615, + "step": 767 + }, + { + "epoch": 0.05203604580256115, + "grad_norm": 13.82172966003418, + "learning_rate": 9.997672667533712e-05, + "loss": 1.2104, + "step": 768 + }, + { + "epoch": 0.05210380107053324, + "grad_norm": 14.648073196411133, + "learning_rate": 9.99753576562393e-05, + "loss": 1.1166, + "step": 769 + }, + { + "epoch": 0.05217155633850532, + "grad_norm": 16.776750564575195, + "learning_rate": 9.997398863714149e-05, + "loss": 1.4242, + "step": 770 + }, + { + "epoch": 0.05223931160647741, + "grad_norm": 13.999717712402344, + "learning_rate": 9.997261961804367e-05, + "loss": 1.2493, + "step": 771 + }, + { + "epoch": 0.05230706687444949, + "grad_norm": 13.238443374633789, + "learning_rate": 9.997125059894586e-05, + "loss": 1.0469, + "step": 772 + }, + { + "epoch": 0.052374822142421576, + "grad_norm": 12.79283332824707, + "learning_rate": 9.996988157984804e-05, + "loss": 1.1975, + "step": 773 + }, + { + "epoch": 0.05244257741039366, + "grad_norm": 15.577791213989258, + "learning_rate": 9.996851256075022e-05, + "loss": 1.4972, + "step": 774 + }, + { + "epoch": 0.052510332678365745, + "grad_norm": 16.215747833251953, + "learning_rate": 9.996714354165241e-05, + "loss": 1.4285, + "step": 775 + }, + { + "epoch": 0.05257808794633783, + "grad_norm": 14.68825626373291, + "learning_rate": 9.99657745225546e-05, + "loss": 1.2028, + "step": 776 + }, + { + "epoch": 0.052645843214309915, + "grad_norm": 11.535130500793457, + "learning_rate": 9.996440550345677e-05, + "loss": 1.2466, + "step": 777 + }, + { + "epoch": 0.052713598482282, + "grad_norm": 14.865918159484863, + "learning_rate": 9.996303648435897e-05, + "loss": 1.3903, + "step": 778 + }, + { + "epoch": 0.052781353750254084, + "grad_norm": 13.626388549804688, + "learning_rate": 9.996166746526115e-05, + "loss": 1.5446, + "step": 779 + }, + { + "epoch": 0.05284910901822617, + "grad_norm": 12.957473754882812, + "learning_rate": 9.996029844616333e-05, + "loss": 1.1641, + "step": 780 + }, + { + "epoch": 0.052916864286198254, + "grad_norm": 14.928487777709961, + "learning_rate": 9.995892942706552e-05, + "loss": 1.0929, + "step": 781 + }, + { + "epoch": 0.05298461955417034, + "grad_norm": 13.592952728271484, + "learning_rate": 9.99575604079677e-05, + "loss": 1.4684, + "step": 782 + }, + { + "epoch": 0.05305237482214242, + "grad_norm": 13.836453437805176, + "learning_rate": 9.995619138886988e-05, + "loss": 1.0128, + "step": 783 + }, + { + "epoch": 0.05312013009011451, + "grad_norm": 16.654394149780273, + "learning_rate": 9.995482236977206e-05, + "loss": 1.2464, + "step": 784 + }, + { + "epoch": 0.05318788535808659, + "grad_norm": 14.4434232711792, + "learning_rate": 9.995345335067424e-05, + "loss": 1.3649, + "step": 785 + }, + { + "epoch": 0.05325564062605868, + "grad_norm": 14.15592098236084, + "learning_rate": 9.995208433157644e-05, + "loss": 1.4426, + "step": 786 + }, + { + "epoch": 0.05332339589403076, + "grad_norm": 12.97901439666748, + "learning_rate": 9.995071531247862e-05, + "loss": 1.227, + "step": 787 + }, + { + "epoch": 0.053391151162002846, + "grad_norm": 13.167902946472168, + "learning_rate": 9.99493462933808e-05, + "loss": 1.2432, + "step": 788 + }, + { + "epoch": 0.05345890642997493, + "grad_norm": 15.92297077178955, + "learning_rate": 9.994797727428298e-05, + "loss": 1.3345, + "step": 789 + }, + { + "epoch": 0.053526661697947016, + "grad_norm": 13.30277156829834, + "learning_rate": 9.994660825518517e-05, + "loss": 1.2361, + "step": 790 + }, + { + "epoch": 0.0535944169659191, + "grad_norm": 15.98779582977295, + "learning_rate": 9.994523923608735e-05, + "loss": 1.1202, + "step": 791 + }, + { + "epoch": 0.053662172233891185, + "grad_norm": 16.414382934570312, + "learning_rate": 9.994387021698953e-05, + "loss": 1.448, + "step": 792 + }, + { + "epoch": 0.05372992750186327, + "grad_norm": 14.491677284240723, + "learning_rate": 9.994250119789171e-05, + "loss": 1.0862, + "step": 793 + }, + { + "epoch": 0.053797682769835355, + "grad_norm": 13.155410766601562, + "learning_rate": 9.99411321787939e-05, + "loss": 1.312, + "step": 794 + }, + { + "epoch": 0.05386543803780744, + "grad_norm": 16.11139488220215, + "learning_rate": 9.993976315969609e-05, + "loss": 1.2043, + "step": 795 + }, + { + "epoch": 0.053933193305779524, + "grad_norm": 14.259698867797852, + "learning_rate": 9.993839414059827e-05, + "loss": 1.2256, + "step": 796 + }, + { + "epoch": 0.05400094857375161, + "grad_norm": 15.751099586486816, + "learning_rate": 9.993702512150045e-05, + "loss": 1.0732, + "step": 797 + }, + { + "epoch": 0.05406870384172369, + "grad_norm": 12.766170501708984, + "learning_rate": 9.993565610240263e-05, + "loss": 1.0362, + "step": 798 + }, + { + "epoch": 0.05413645910969578, + "grad_norm": 14.61483097076416, + "learning_rate": 9.993428708330481e-05, + "loss": 1.3107, + "step": 799 + }, + { + "epoch": 0.05420421437766786, + "grad_norm": 14.571990013122559, + "learning_rate": 9.9932918064207e-05, + "loss": 1.3616, + "step": 800 + }, + { + "epoch": 0.05427196964563995, + "grad_norm": 14.69124984741211, + "learning_rate": 9.993154904510918e-05, + "loss": 1.4273, + "step": 801 + }, + { + "epoch": 0.05433972491361203, + "grad_norm": 13.834383010864258, + "learning_rate": 9.993018002601136e-05, + "loss": 1.2886, + "step": 802 + }, + { + "epoch": 0.05440748018158412, + "grad_norm": 14.178943634033203, + "learning_rate": 9.992881100691355e-05, + "loss": 1.44, + "step": 803 + }, + { + "epoch": 0.0544752354495562, + "grad_norm": 13.185267448425293, + "learning_rate": 9.992744198781574e-05, + "loss": 1.5609, + "step": 804 + }, + { + "epoch": 0.054542990717528286, + "grad_norm": 15.211031913757324, + "learning_rate": 9.992607296871792e-05, + "loss": 1.8873, + "step": 805 + }, + { + "epoch": 0.05461074598550037, + "grad_norm": 11.855447769165039, + "learning_rate": 9.99247039496201e-05, + "loss": 1.4063, + "step": 806 + }, + { + "epoch": 0.054678501253472456, + "grad_norm": 14.352787017822266, + "learning_rate": 9.992333493052228e-05, + "loss": 1.56, + "step": 807 + }, + { + "epoch": 0.05474625652144454, + "grad_norm": 14.339908599853516, + "learning_rate": 9.992196591142446e-05, + "loss": 1.1985, + "step": 808 + }, + { + "epoch": 0.054814011789416625, + "grad_norm": 12.696648597717285, + "learning_rate": 9.992059689232665e-05, + "loss": 1.1807, + "step": 809 + }, + { + "epoch": 0.05488176705738871, + "grad_norm": 14.083525657653809, + "learning_rate": 9.991922787322883e-05, + "loss": 1.4012, + "step": 810 + }, + { + "epoch": 0.054949522325360795, + "grad_norm": 14.298514366149902, + "learning_rate": 9.991785885413101e-05, + "loss": 1.4047, + "step": 811 + }, + { + "epoch": 0.05501727759333288, + "grad_norm": 13.391324996948242, + "learning_rate": 9.99164898350332e-05, + "loss": 1.2292, + "step": 812 + }, + { + "epoch": 0.055085032861304964, + "grad_norm": 14.0011625289917, + "learning_rate": 9.991512081593539e-05, + "loss": 1.3194, + "step": 813 + }, + { + "epoch": 0.05515278812927705, + "grad_norm": 14.726574897766113, + "learning_rate": 9.991375179683757e-05, + "loss": 1.3245, + "step": 814 + }, + { + "epoch": 0.05522054339724913, + "grad_norm": 14.615242004394531, + "learning_rate": 9.991238277773975e-05, + "loss": 1.3704, + "step": 815 + }, + { + "epoch": 0.05528829866522122, + "grad_norm": 11.06546688079834, + "learning_rate": 9.991101375864193e-05, + "loss": 1.0841, + "step": 816 + }, + { + "epoch": 0.0553560539331933, + "grad_norm": 13.768633842468262, + "learning_rate": 9.990964473954411e-05, + "loss": 1.2632, + "step": 817 + }, + { + "epoch": 0.05542380920116539, + "grad_norm": 15.054973602294922, + "learning_rate": 9.99082757204463e-05, + "loss": 1.2931, + "step": 818 + }, + { + "epoch": 0.05549156446913747, + "grad_norm": 15.27096176147461, + "learning_rate": 9.990690670134848e-05, + "loss": 1.1789, + "step": 819 + }, + { + "epoch": 0.055559319737109564, + "grad_norm": 13.228281021118164, + "learning_rate": 9.990553768225067e-05, + "loss": 1.4458, + "step": 820 + }, + { + "epoch": 0.05562707500508165, + "grad_norm": 16.016782760620117, + "learning_rate": 9.990416866315286e-05, + "loss": 1.5121, + "step": 821 + }, + { + "epoch": 0.05569483027305373, + "grad_norm": 14.15912914276123, + "learning_rate": 9.990279964405504e-05, + "loss": 1.3702, + "step": 822 + }, + { + "epoch": 0.05576258554102582, + "grad_norm": 14.61017894744873, + "learning_rate": 9.990143062495722e-05, + "loss": 1.2774, + "step": 823 + }, + { + "epoch": 0.0558303408089979, + "grad_norm": 13.241140365600586, + "learning_rate": 9.990006160585941e-05, + "loss": 1.3857, + "step": 824 + }, + { + "epoch": 0.05589809607696999, + "grad_norm": 11.815064430236816, + "learning_rate": 9.98986925867616e-05, + "loss": 1.1053, + "step": 825 + }, + { + "epoch": 0.05596585134494207, + "grad_norm": 13.179222106933594, + "learning_rate": 9.989732356766377e-05, + "loss": 1.4686, + "step": 826 + }, + { + "epoch": 0.05603360661291416, + "grad_norm": 11.888179779052734, + "learning_rate": 9.989595454856597e-05, + "loss": 1.3191, + "step": 827 + }, + { + "epoch": 0.05610136188088624, + "grad_norm": 15.43813419342041, + "learning_rate": 9.989458552946815e-05, + "loss": 0.9732, + "step": 828 + }, + { + "epoch": 0.056169117148858326, + "grad_norm": 20.0246524810791, + "learning_rate": 9.989321651037033e-05, + "loss": 1.5398, + "step": 829 + }, + { + "epoch": 0.05623687241683041, + "grad_norm": 14.247052192687988, + "learning_rate": 9.989184749127251e-05, + "loss": 1.5093, + "step": 830 + }, + { + "epoch": 0.056304627684802495, + "grad_norm": 15.63775634765625, + "learning_rate": 9.989047847217469e-05, + "loss": 1.3488, + "step": 831 + }, + { + "epoch": 0.05637238295277458, + "grad_norm": 10.582650184631348, + "learning_rate": 9.988910945307688e-05, + "loss": 1.3888, + "step": 832 + }, + { + "epoch": 0.056440138220746665, + "grad_norm": 12.344864845275879, + "learning_rate": 9.988774043397906e-05, + "loss": 1.2876, + "step": 833 + }, + { + "epoch": 0.05650789348871875, + "grad_norm": 13.95814323425293, + "learning_rate": 9.988637141488124e-05, + "loss": 1.2648, + "step": 834 + }, + { + "epoch": 0.056575648756690834, + "grad_norm": 9.740105628967285, + "learning_rate": 9.988500239578342e-05, + "loss": 0.9777, + "step": 835 + }, + { + "epoch": 0.05664340402466292, + "grad_norm": 16.16588020324707, + "learning_rate": 9.988363337668562e-05, + "loss": 1.5859, + "step": 836 + }, + { + "epoch": 0.056711159292635004, + "grad_norm": 14.394731521606445, + "learning_rate": 9.98822643575878e-05, + "loss": 1.3395, + "step": 837 + }, + { + "epoch": 0.05677891456060709, + "grad_norm": 11.256571769714355, + "learning_rate": 9.988089533848998e-05, + "loss": 1.0528, + "step": 838 + }, + { + "epoch": 0.05684666982857917, + "grad_norm": 17.595510482788086, + "learning_rate": 9.987952631939216e-05, + "loss": 1.2678, + "step": 839 + }, + { + "epoch": 0.05691442509655126, + "grad_norm": 14.132645606994629, + "learning_rate": 9.987815730029434e-05, + "loss": 1.4129, + "step": 840 + }, + { + "epoch": 0.05698218036452334, + "grad_norm": 14.438119888305664, + "learning_rate": 9.987678828119653e-05, + "loss": 1.4141, + "step": 841 + }, + { + "epoch": 0.05704993563249543, + "grad_norm": 15.983094215393066, + "learning_rate": 9.987541926209871e-05, + "loss": 1.1959, + "step": 842 + }, + { + "epoch": 0.05711769090046751, + "grad_norm": 17.300403594970703, + "learning_rate": 9.98740502430009e-05, + "loss": 1.1613, + "step": 843 + }, + { + "epoch": 0.057185446168439596, + "grad_norm": 15.150660514831543, + "learning_rate": 9.987268122390307e-05, + "loss": 1.2253, + "step": 844 + }, + { + "epoch": 0.05725320143641168, + "grad_norm": 14.234586715698242, + "learning_rate": 9.987131220480525e-05, + "loss": 1.2517, + "step": 845 + }, + { + "epoch": 0.057320956704383766, + "grad_norm": 18.31337547302246, + "learning_rate": 9.986994318570745e-05, + "loss": 1.7506, + "step": 846 + }, + { + "epoch": 0.05738871197235585, + "grad_norm": 14.818669319152832, + "learning_rate": 9.986857416660963e-05, + "loss": 1.358, + "step": 847 + }, + { + "epoch": 0.057456467240327935, + "grad_norm": 14.640913009643555, + "learning_rate": 9.986720514751181e-05, + "loss": 1.4481, + "step": 848 + }, + { + "epoch": 0.05752422250830002, + "grad_norm": 12.277986526489258, + "learning_rate": 9.986583612841399e-05, + "loss": 1.3428, + "step": 849 + }, + { + "epoch": 0.057591977776272105, + "grad_norm": 13.817851066589355, + "learning_rate": 9.986446710931618e-05, + "loss": 1.3514, + "step": 850 + }, + { + "epoch": 0.05765973304424419, + "grad_norm": 13.706515312194824, + "learning_rate": 9.986309809021836e-05, + "loss": 1.4295, + "step": 851 + }, + { + "epoch": 0.057727488312216274, + "grad_norm": 16.649917602539062, + "learning_rate": 9.986172907112054e-05, + "loss": 1.3895, + "step": 852 + }, + { + "epoch": 0.05779524358018836, + "grad_norm": 13.659167289733887, + "learning_rate": 9.986036005202272e-05, + "loss": 1.1545, + "step": 853 + }, + { + "epoch": 0.057862998848160443, + "grad_norm": 11.264912605285645, + "learning_rate": 9.98589910329249e-05, + "loss": 1.0766, + "step": 854 + }, + { + "epoch": 0.05793075411613253, + "grad_norm": 12.930856704711914, + "learning_rate": 9.98576220138271e-05, + "loss": 1.1665, + "step": 855 + }, + { + "epoch": 0.05799850938410461, + "grad_norm": 15.999971389770508, + "learning_rate": 9.985625299472928e-05, + "loss": 1.4595, + "step": 856 + }, + { + "epoch": 0.0580662646520767, + "grad_norm": 14.566671371459961, + "learning_rate": 9.985488397563146e-05, + "loss": 1.283, + "step": 857 + }, + { + "epoch": 0.05813401992004878, + "grad_norm": 16.106964111328125, + "learning_rate": 9.985351495653364e-05, + "loss": 1.2842, + "step": 858 + }, + { + "epoch": 0.05820177518802087, + "grad_norm": 15.47492790222168, + "learning_rate": 9.985214593743583e-05, + "loss": 1.3949, + "step": 859 + }, + { + "epoch": 0.05826953045599295, + "grad_norm": 10.692886352539062, + "learning_rate": 9.985077691833801e-05, + "loss": 1.2896, + "step": 860 + }, + { + "epoch": 0.058337285723965036, + "grad_norm": 14.13198184967041, + "learning_rate": 9.98494078992402e-05, + "loss": 1.1566, + "step": 861 + }, + { + "epoch": 0.05840504099193712, + "grad_norm": 14.455452919006348, + "learning_rate": 9.984803888014237e-05, + "loss": 1.1949, + "step": 862 + }, + { + "epoch": 0.058472796259909206, + "grad_norm": 15.020733833312988, + "learning_rate": 9.984666986104456e-05, + "loss": 1.5202, + "step": 863 + }, + { + "epoch": 0.05854055152788129, + "grad_norm": 21.089344024658203, + "learning_rate": 9.984530084194675e-05, + "loss": 1.5083, + "step": 864 + }, + { + "epoch": 0.058608306795853375, + "grad_norm": 14.414257049560547, + "learning_rate": 9.984393182284893e-05, + "loss": 1.3977, + "step": 865 + }, + { + "epoch": 0.05867606206382546, + "grad_norm": 15.667798042297363, + "learning_rate": 9.984256280375111e-05, + "loss": 1.4044, + "step": 866 + }, + { + "epoch": 0.058743817331797545, + "grad_norm": 12.503005981445312, + "learning_rate": 9.98411937846533e-05, + "loss": 1.3369, + "step": 867 + }, + { + "epoch": 0.05881157259976963, + "grad_norm": 13.27022933959961, + "learning_rate": 9.983982476555548e-05, + "loss": 1.4134, + "step": 868 + }, + { + "epoch": 0.058879327867741714, + "grad_norm": 16.2034969329834, + "learning_rate": 9.983845574645766e-05, + "loss": 1.1196, + "step": 869 + }, + { + "epoch": 0.0589470831357138, + "grad_norm": 9.868896484375, + "learning_rate": 9.983708672735986e-05, + "loss": 1.0248, + "step": 870 + }, + { + "epoch": 0.05901483840368588, + "grad_norm": 15.588685989379883, + "learning_rate": 9.983571770826204e-05, + "loss": 1.5865, + "step": 871 + }, + { + "epoch": 0.05908259367165797, + "grad_norm": 14.688246726989746, + "learning_rate": 9.983434868916422e-05, + "loss": 1.2959, + "step": 872 + }, + { + "epoch": 0.05915034893963006, + "grad_norm": 14.252961158752441, + "learning_rate": 9.983297967006641e-05, + "loss": 1.3536, + "step": 873 + }, + { + "epoch": 0.059218104207602144, + "grad_norm": 11.79800033569336, + "learning_rate": 9.98316106509686e-05, + "loss": 1.331, + "step": 874 + }, + { + "epoch": 0.05928585947557423, + "grad_norm": 11.900074005126953, + "learning_rate": 9.983024163187077e-05, + "loss": 1.2241, + "step": 875 + }, + { + "epoch": 0.059353614743546314, + "grad_norm": 10.696773529052734, + "learning_rate": 9.982887261277295e-05, + "loss": 1.4041, + "step": 876 + }, + { + "epoch": 0.0594213700115184, + "grad_norm": 13.532305717468262, + "learning_rate": 9.982750359367513e-05, + "loss": 1.3621, + "step": 877 + }, + { + "epoch": 0.05948912527949048, + "grad_norm": 14.107857704162598, + "learning_rate": 9.982613457457733e-05, + "loss": 1.6327, + "step": 878 + }, + { + "epoch": 0.05955688054746257, + "grad_norm": 11.584097862243652, + "learning_rate": 9.982476555547951e-05, + "loss": 0.9606, + "step": 879 + }, + { + "epoch": 0.05962463581543465, + "grad_norm": 14.240161895751953, + "learning_rate": 9.982339653638169e-05, + "loss": 1.263, + "step": 880 + }, + { + "epoch": 0.05969239108340674, + "grad_norm": 14.461871147155762, + "learning_rate": 9.982202751728387e-05, + "loss": 1.4201, + "step": 881 + }, + { + "epoch": 0.05976014635137882, + "grad_norm": 14.072705268859863, + "learning_rate": 9.982065849818606e-05, + "loss": 1.5977, + "step": 882 + }, + { + "epoch": 0.05982790161935091, + "grad_norm": 14.928994178771973, + "learning_rate": 9.981928947908824e-05, + "loss": 1.3346, + "step": 883 + }, + { + "epoch": 0.05989565688732299, + "grad_norm": 14.898951530456543, + "learning_rate": 9.981792045999042e-05, + "loss": 1.312, + "step": 884 + }, + { + "epoch": 0.059963412155295076, + "grad_norm": 13.089646339416504, + "learning_rate": 9.98165514408926e-05, + "loss": 1.0833, + "step": 885 + }, + { + "epoch": 0.06003116742326716, + "grad_norm": 15.768043518066406, + "learning_rate": 9.981518242179478e-05, + "loss": 1.1246, + "step": 886 + }, + { + "epoch": 0.060098922691239245, + "grad_norm": 11.8709135055542, + "learning_rate": 9.981381340269698e-05, + "loss": 1.1822, + "step": 887 + }, + { + "epoch": 0.06016667795921133, + "grad_norm": 15.698454856872559, + "learning_rate": 9.981244438359916e-05, + "loss": 1.3733, + "step": 888 + }, + { + "epoch": 0.060234433227183415, + "grad_norm": 14.827208518981934, + "learning_rate": 9.981107536450134e-05, + "loss": 1.3121, + "step": 889 + }, + { + "epoch": 0.0603021884951555, + "grad_norm": 12.522045135498047, + "learning_rate": 9.980970634540352e-05, + "loss": 1.5584, + "step": 890 + }, + { + "epoch": 0.060369943763127584, + "grad_norm": 14.417738914489746, + "learning_rate": 9.980833732630571e-05, + "loss": 1.2389, + "step": 891 + }, + { + "epoch": 0.06043769903109967, + "grad_norm": 14.761930465698242, + "learning_rate": 9.98069683072079e-05, + "loss": 1.5007, + "step": 892 + }, + { + "epoch": 0.060505454299071754, + "grad_norm": 15.882668495178223, + "learning_rate": 9.980559928811007e-05, + "loss": 1.2651, + "step": 893 + }, + { + "epoch": 0.06057320956704384, + "grad_norm": 13.605412483215332, + "learning_rate": 9.980423026901225e-05, + "loss": 1.5186, + "step": 894 + }, + { + "epoch": 0.06064096483501592, + "grad_norm": 10.654335021972656, + "learning_rate": 9.980286124991443e-05, + "loss": 1.1177, + "step": 895 + }, + { + "epoch": 0.06070872010298801, + "grad_norm": 12.37457275390625, + "learning_rate": 9.980149223081663e-05, + "loss": 1.2362, + "step": 896 + }, + { + "epoch": 0.06077647537096009, + "grad_norm": 12.591222763061523, + "learning_rate": 9.980012321171881e-05, + "loss": 1.2509, + "step": 897 + }, + { + "epoch": 0.06084423063893218, + "grad_norm": 14.337310791015625, + "learning_rate": 9.979875419262099e-05, + "loss": 1.2987, + "step": 898 + }, + { + "epoch": 0.06091198590690426, + "grad_norm": 15.496018409729004, + "learning_rate": 9.979738517352317e-05, + "loss": 1.5307, + "step": 899 + }, + { + "epoch": 0.06097974117487635, + "grad_norm": 13.730890274047852, + "learning_rate": 9.979601615442535e-05, + "loss": 1.2975, + "step": 900 + }, + { + "epoch": 0.06104749644284843, + "grad_norm": 12.314823150634766, + "learning_rate": 9.979464713532754e-05, + "loss": 1.1916, + "step": 901 + }, + { + "epoch": 0.061115251710820516, + "grad_norm": 13.761808395385742, + "learning_rate": 9.979327811622972e-05, + "loss": 1.3037, + "step": 902 + }, + { + "epoch": 0.0611830069787926, + "grad_norm": 13.308722496032715, + "learning_rate": 9.97919090971319e-05, + "loss": 1.0598, + "step": 903 + }, + { + "epoch": 0.061250762246764685, + "grad_norm": 13.121098518371582, + "learning_rate": 9.979054007803408e-05, + "loss": 1.3378, + "step": 904 + }, + { + "epoch": 0.06131851751473677, + "grad_norm": 16.975666046142578, + "learning_rate": 9.978917105893628e-05, + "loss": 1.2826, + "step": 905 + }, + { + "epoch": 0.061386272782708855, + "grad_norm": 14.529984474182129, + "learning_rate": 9.978780203983846e-05, + "loss": 1.3467, + "step": 906 + }, + { + "epoch": 0.06145402805068094, + "grad_norm": 11.081110000610352, + "learning_rate": 9.978643302074064e-05, + "loss": 1.1906, + "step": 907 + }, + { + "epoch": 0.061521783318653024, + "grad_norm": 12.871200561523438, + "learning_rate": 9.978506400164282e-05, + "loss": 1.1057, + "step": 908 + }, + { + "epoch": 0.06158953858662511, + "grad_norm": 13.982168197631836, + "learning_rate": 9.9783694982545e-05, + "loss": 1.3824, + "step": 909 + }, + { + "epoch": 0.061657293854597194, + "grad_norm": 13.076074600219727, + "learning_rate": 9.97823259634472e-05, + "loss": 1.2985, + "step": 910 + }, + { + "epoch": 0.06172504912256928, + "grad_norm": 11.015650749206543, + "learning_rate": 9.978095694434937e-05, + "loss": 1.2406, + "step": 911 + }, + { + "epoch": 0.06179280439054136, + "grad_norm": 13.6082763671875, + "learning_rate": 9.977958792525155e-05, + "loss": 1.3873, + "step": 912 + }, + { + "epoch": 0.06186055965851345, + "grad_norm": 15.930809020996094, + "learning_rate": 9.977821890615375e-05, + "loss": 1.3542, + "step": 913 + }, + { + "epoch": 0.06192831492648553, + "grad_norm": 10.710271835327148, + "learning_rate": 9.977684988705593e-05, + "loss": 1.1571, + "step": 914 + }, + { + "epoch": 0.06199607019445762, + "grad_norm": 11.110217094421387, + "learning_rate": 9.977548086795811e-05, + "loss": 1.3255, + "step": 915 + }, + { + "epoch": 0.0620638254624297, + "grad_norm": 11.451903343200684, + "learning_rate": 9.97741118488603e-05, + "loss": 1.3681, + "step": 916 + }, + { + "epoch": 0.062131580730401786, + "grad_norm": 10.884252548217773, + "learning_rate": 9.977274282976248e-05, + "loss": 1.1988, + "step": 917 + }, + { + "epoch": 0.06219933599837387, + "grad_norm": 11.031237602233887, + "learning_rate": 9.977137381066466e-05, + "loss": 1.26, + "step": 918 + }, + { + "epoch": 0.062267091266345956, + "grad_norm": 11.585648536682129, + "learning_rate": 9.977000479156686e-05, + "loss": 1.3099, + "step": 919 + }, + { + "epoch": 0.06233484653431804, + "grad_norm": 10.867992401123047, + "learning_rate": 9.976863577246904e-05, + "loss": 1.2624, + "step": 920 + }, + { + "epoch": 0.062402601802290125, + "grad_norm": 14.552916526794434, + "learning_rate": 9.976726675337122e-05, + "loss": 1.2244, + "step": 921 + }, + { + "epoch": 0.06247035707026221, + "grad_norm": 12.101760864257812, + "learning_rate": 9.97658977342734e-05, + "loss": 1.1925, + "step": 922 + }, + { + "epoch": 0.0625381123382343, + "grad_norm": 14.113842010498047, + "learning_rate": 9.976452871517559e-05, + "loss": 1.3395, + "step": 923 + }, + { + "epoch": 0.06260586760620639, + "grad_norm": 17.214614868164062, + "learning_rate": 9.976315969607777e-05, + "loss": 1.3943, + "step": 924 + }, + { + "epoch": 0.06267362287417846, + "grad_norm": 13.43308162689209, + "learning_rate": 9.976179067697995e-05, + "loss": 1.1596, + "step": 925 + }, + { + "epoch": 0.06274137814215056, + "grad_norm": 13.806952476501465, + "learning_rate": 9.976042165788213e-05, + "loss": 1.0354, + "step": 926 + }, + { + "epoch": 0.06280913341012263, + "grad_norm": 15.638693809509277, + "learning_rate": 9.975905263878431e-05, + "loss": 1.3807, + "step": 927 + }, + { + "epoch": 0.06287688867809473, + "grad_norm": 14.337742805480957, + "learning_rate": 9.975768361968651e-05, + "loss": 1.537, + "step": 928 + }, + { + "epoch": 0.0629446439460668, + "grad_norm": 14.540297508239746, + "learning_rate": 9.975631460058869e-05, + "loss": 1.2522, + "step": 929 + }, + { + "epoch": 0.0630123992140389, + "grad_norm": 15.991955757141113, + "learning_rate": 9.975494558149087e-05, + "loss": 1.3855, + "step": 930 + }, + { + "epoch": 0.06308015448201097, + "grad_norm": 13.957479476928711, + "learning_rate": 9.975357656239305e-05, + "loss": 1.43, + "step": 931 + }, + { + "epoch": 0.06314790974998306, + "grad_norm": 16.805377960205078, + "learning_rate": 9.975220754329523e-05, + "loss": 1.383, + "step": 932 + }, + { + "epoch": 0.06321566501795514, + "grad_norm": 12.41854476928711, + "learning_rate": 9.975083852419742e-05, + "loss": 1.068, + "step": 933 + }, + { + "epoch": 0.06328342028592723, + "grad_norm": 15.929006576538086, + "learning_rate": 9.97494695050996e-05, + "loss": 1.212, + "step": 934 + }, + { + "epoch": 0.06335117555389931, + "grad_norm": 13.205544471740723, + "learning_rate": 9.974810048600178e-05, + "loss": 1.3682, + "step": 935 + }, + { + "epoch": 0.0634189308218714, + "grad_norm": 12.105626106262207, + "learning_rate": 9.974673146690396e-05, + "loss": 1.3821, + "step": 936 + }, + { + "epoch": 0.06348668608984348, + "grad_norm": 13.776711463928223, + "learning_rate": 9.974536244780616e-05, + "loss": 1.1073, + "step": 937 + }, + { + "epoch": 0.06355444135781557, + "grad_norm": 12.227380752563477, + "learning_rate": 9.974399342870834e-05, + "loss": 1.2026, + "step": 938 + }, + { + "epoch": 0.06362219662578765, + "grad_norm": 12.723440170288086, + "learning_rate": 9.974262440961052e-05, + "loss": 1.1325, + "step": 939 + }, + { + "epoch": 0.06368995189375974, + "grad_norm": 13.943262100219727, + "learning_rate": 9.97412553905127e-05, + "loss": 1.1878, + "step": 940 + }, + { + "epoch": 0.06375770716173182, + "grad_norm": 12.644627571105957, + "learning_rate": 9.973988637141488e-05, + "loss": 1.3097, + "step": 941 + }, + { + "epoch": 0.06382546242970391, + "grad_norm": 12.108241081237793, + "learning_rate": 9.973851735231707e-05, + "loss": 1.1686, + "step": 942 + }, + { + "epoch": 0.06389321769767599, + "grad_norm": 14.375092506408691, + "learning_rate": 9.973714833321925e-05, + "loss": 1.2721, + "step": 943 + }, + { + "epoch": 0.06396097296564808, + "grad_norm": 13.439800262451172, + "learning_rate": 9.973577931412143e-05, + "loss": 1.3898, + "step": 944 + }, + { + "epoch": 0.06402872823362016, + "grad_norm": 13.717879295349121, + "learning_rate": 9.973441029502361e-05, + "loss": 1.3823, + "step": 945 + }, + { + "epoch": 0.06409648350159225, + "grad_norm": 12.745361328125, + "learning_rate": 9.973304127592581e-05, + "loss": 1.2396, + "step": 946 + }, + { + "epoch": 0.06416423876956433, + "grad_norm": 11.784343719482422, + "learning_rate": 9.973167225682799e-05, + "loss": 1.214, + "step": 947 + }, + { + "epoch": 0.06423199403753642, + "grad_norm": 14.205467224121094, + "learning_rate": 9.973030323773017e-05, + "loss": 1.2803, + "step": 948 + }, + { + "epoch": 0.0642997493055085, + "grad_norm": 13.257532119750977, + "learning_rate": 9.972893421863235e-05, + "loss": 1.378, + "step": 949 + }, + { + "epoch": 0.06436750457348059, + "grad_norm": 15.153338432312012, + "learning_rate": 9.972756519953453e-05, + "loss": 1.2854, + "step": 950 + }, + { + "epoch": 0.06443525984145267, + "grad_norm": 16.765771865844727, + "learning_rate": 9.972619618043672e-05, + "loss": 1.3016, + "step": 951 + }, + { + "epoch": 0.06450301510942476, + "grad_norm": 14.636106491088867, + "learning_rate": 9.97248271613389e-05, + "loss": 1.3803, + "step": 952 + }, + { + "epoch": 0.06457077037739685, + "grad_norm": 13.87410831451416, + "learning_rate": 9.972345814224108e-05, + "loss": 1.4126, + "step": 953 + }, + { + "epoch": 0.06463852564536893, + "grad_norm": 14.328899383544922, + "learning_rate": 9.972208912314326e-05, + "loss": 1.061, + "step": 954 + }, + { + "epoch": 0.06470628091334102, + "grad_norm": 12.485203742980957, + "learning_rate": 9.972072010404544e-05, + "loss": 1.0985, + "step": 955 + }, + { + "epoch": 0.0647740361813131, + "grad_norm": 13.77907943725586, + "learning_rate": 9.971935108494764e-05, + "loss": 1.3534, + "step": 956 + }, + { + "epoch": 0.06484179144928519, + "grad_norm": 10.579590797424316, + "learning_rate": 9.971798206584982e-05, + "loss": 1.166, + "step": 957 + }, + { + "epoch": 0.06490954671725727, + "grad_norm": 14.690185546875, + "learning_rate": 9.9716613046752e-05, + "loss": 1.3666, + "step": 958 + }, + { + "epoch": 0.06497730198522936, + "grad_norm": 12.904786109924316, + "learning_rate": 9.97152440276542e-05, + "loss": 1.14, + "step": 959 + }, + { + "epoch": 0.06504505725320144, + "grad_norm": 12.126219749450684, + "learning_rate": 9.971387500855637e-05, + "loss": 1.4157, + "step": 960 + }, + { + "epoch": 0.06511281252117353, + "grad_norm": 13.747931480407715, + "learning_rate": 9.971250598945855e-05, + "loss": 1.4557, + "step": 961 + }, + { + "epoch": 0.0651805677891456, + "grad_norm": 13.232327461242676, + "learning_rate": 9.971113697036075e-05, + "loss": 1.3471, + "step": 962 + }, + { + "epoch": 0.0652483230571177, + "grad_norm": 14.886791229248047, + "learning_rate": 9.970976795126293e-05, + "loss": 1.1866, + "step": 963 + }, + { + "epoch": 0.06531607832508977, + "grad_norm": 11.747659683227539, + "learning_rate": 9.970839893216511e-05, + "loss": 1.0443, + "step": 964 + }, + { + "epoch": 0.06538383359306187, + "grad_norm": 11.181273460388184, + "learning_rate": 9.97070299130673e-05, + "loss": 1.1391, + "step": 965 + }, + { + "epoch": 0.06545158886103394, + "grad_norm": 11.9672269821167, + "learning_rate": 9.970566089396948e-05, + "loss": 1.2847, + "step": 966 + }, + { + "epoch": 0.06551934412900604, + "grad_norm": 15.825364112854004, + "learning_rate": 9.970429187487166e-05, + "loss": 1.3301, + "step": 967 + }, + { + "epoch": 0.06558709939697811, + "grad_norm": 12.26963996887207, + "learning_rate": 9.970292285577384e-05, + "loss": 1.2524, + "step": 968 + }, + { + "epoch": 0.0656548546649502, + "grad_norm": 11.440977096557617, + "learning_rate": 9.970155383667604e-05, + "loss": 1.229, + "step": 969 + }, + { + "epoch": 0.06572260993292228, + "grad_norm": 10.704546928405762, + "learning_rate": 9.970018481757822e-05, + "loss": 0.9936, + "step": 970 + }, + { + "epoch": 0.06579036520089437, + "grad_norm": 13.20880126953125, + "learning_rate": 9.96988157984804e-05, + "loss": 1.2353, + "step": 971 + }, + { + "epoch": 0.06585812046886645, + "grad_norm": 13.101622581481934, + "learning_rate": 9.969744677938258e-05, + "loss": 1.2559, + "step": 972 + }, + { + "epoch": 0.06592587573683854, + "grad_norm": 11.725826263427734, + "learning_rate": 9.969607776028476e-05, + "loss": 1.2051, + "step": 973 + }, + { + "epoch": 0.06599363100481062, + "grad_norm": 11.890633583068848, + "learning_rate": 9.969470874118695e-05, + "loss": 1.1607, + "step": 974 + }, + { + "epoch": 0.06606138627278271, + "grad_norm": 11.066970825195312, + "learning_rate": 9.969333972208913e-05, + "loss": 1.1031, + "step": 975 + }, + { + "epoch": 0.06612914154075479, + "grad_norm": 12.26187515258789, + "learning_rate": 9.969197070299131e-05, + "loss": 1.309, + "step": 976 + }, + { + "epoch": 0.06619689680872688, + "grad_norm": 13.490363121032715, + "learning_rate": 9.96906016838935e-05, + "loss": 1.3062, + "step": 977 + }, + { + "epoch": 0.06626465207669896, + "grad_norm": 12.306289672851562, + "learning_rate": 9.968923266479567e-05, + "loss": 0.9784, + "step": 978 + }, + { + "epoch": 0.06633240734467105, + "grad_norm": 10.699983596801758, + "learning_rate": 9.968786364569787e-05, + "loss": 1.0547, + "step": 979 + }, + { + "epoch": 0.06640016261264313, + "grad_norm": 12.298179626464844, + "learning_rate": 9.968649462660005e-05, + "loss": 1.2496, + "step": 980 + }, + { + "epoch": 0.06646791788061522, + "grad_norm": 15.239167213439941, + "learning_rate": 9.968512560750223e-05, + "loss": 1.2545, + "step": 981 + }, + { + "epoch": 0.0665356731485873, + "grad_norm": 11.61802864074707, + "learning_rate": 9.968375658840441e-05, + "loss": 1.1668, + "step": 982 + }, + { + "epoch": 0.06660342841655939, + "grad_norm": 12.804032325744629, + "learning_rate": 9.96823875693066e-05, + "loss": 1.3203, + "step": 983 + }, + { + "epoch": 0.06667118368453147, + "grad_norm": 14.511723518371582, + "learning_rate": 9.968101855020878e-05, + "loss": 1.7519, + "step": 984 + }, + { + "epoch": 0.06673893895250356, + "grad_norm": 12.442008018493652, + "learning_rate": 9.967964953111096e-05, + "loss": 1.3758, + "step": 985 + }, + { + "epoch": 0.06680669422047564, + "grad_norm": 14.486754417419434, + "learning_rate": 9.967828051201314e-05, + "loss": 1.1924, + "step": 986 + }, + { + "epoch": 0.06687444948844773, + "grad_norm": 13.529693603515625, + "learning_rate": 9.967691149291532e-05, + "loss": 1.2698, + "step": 987 + }, + { + "epoch": 0.06694220475641981, + "grad_norm": 12.980225563049316, + "learning_rate": 9.967554247381752e-05, + "loss": 1.1896, + "step": 988 + }, + { + "epoch": 0.0670099600243919, + "grad_norm": 15.495257377624512, + "learning_rate": 9.96741734547197e-05, + "loss": 1.286, + "step": 989 + }, + { + "epoch": 0.06707771529236398, + "grad_norm": 12.67573070526123, + "learning_rate": 9.967280443562188e-05, + "loss": 1.1751, + "step": 990 + }, + { + "epoch": 0.06714547056033607, + "grad_norm": 11.141845703125, + "learning_rate": 9.967143541652406e-05, + "loss": 1.5109, + "step": 991 + }, + { + "epoch": 0.06721322582830815, + "grad_norm": 11.975769996643066, + "learning_rate": 9.967006639742625e-05, + "loss": 1.1579, + "step": 992 + }, + { + "epoch": 0.06728098109628024, + "grad_norm": 13.872209548950195, + "learning_rate": 9.966869737832843e-05, + "loss": 1.1852, + "step": 993 + }, + { + "epoch": 0.06734873636425232, + "grad_norm": 11.52573299407959, + "learning_rate": 9.966732835923061e-05, + "loss": 1.2319, + "step": 994 + }, + { + "epoch": 0.0674164916322244, + "grad_norm": 12.90494155883789, + "learning_rate": 9.96659593401328e-05, + "loss": 1.2415, + "step": 995 + }, + { + "epoch": 0.06748424690019648, + "grad_norm": 14.210317611694336, + "learning_rate": 9.966459032103497e-05, + "loss": 1.2901, + "step": 996 + }, + { + "epoch": 0.06755200216816858, + "grad_norm": 12.600135803222656, + "learning_rate": 9.966322130193717e-05, + "loss": 1.4516, + "step": 997 + }, + { + "epoch": 0.06761975743614065, + "grad_norm": 14.462118148803711, + "learning_rate": 9.966185228283935e-05, + "loss": 1.2205, + "step": 998 + }, + { + "epoch": 0.06768751270411275, + "grad_norm": 12.870843887329102, + "learning_rate": 9.966048326374153e-05, + "loss": 1.179, + "step": 999 + }, + { + "epoch": 0.06775526797208482, + "grad_norm": 16.4424991607666, + "learning_rate": 9.965911424464371e-05, + "loss": 1.5714, + "step": 1000 + }, + { + "epoch": 0.06782302324005691, + "grad_norm": 12.902230262756348, + "learning_rate": 9.96577452255459e-05, + "loss": 1.2803, + "step": 1001 + }, + { + "epoch": 0.06789077850802899, + "grad_norm": 11.469466209411621, + "learning_rate": 9.965637620644808e-05, + "loss": 1.0771, + "step": 1002 + }, + { + "epoch": 0.06795853377600108, + "grad_norm": 13.96650505065918, + "learning_rate": 9.965500718735026e-05, + "loss": 1.2427, + "step": 1003 + }, + { + "epoch": 0.06802628904397316, + "grad_norm": 11.55516242980957, + "learning_rate": 9.965363816825244e-05, + "loss": 1.0396, + "step": 1004 + }, + { + "epoch": 0.06809404431194525, + "grad_norm": 13.34827709197998, + "learning_rate": 9.965226914915462e-05, + "loss": 1.181, + "step": 1005 + }, + { + "epoch": 0.06816179957991735, + "grad_norm": 11.243910789489746, + "learning_rate": 9.965090013005682e-05, + "loss": 1.2875, + "step": 1006 + }, + { + "epoch": 0.06822955484788942, + "grad_norm": 14.152894020080566, + "learning_rate": 9.9649531110959e-05, + "loss": 1.3125, + "step": 1007 + }, + { + "epoch": 0.06829731011586151, + "grad_norm": 13.010010719299316, + "learning_rate": 9.964816209186118e-05, + "loss": 1.3213, + "step": 1008 + }, + { + "epoch": 0.06836506538383359, + "grad_norm": 15.990034103393555, + "learning_rate": 9.964679307276337e-05, + "loss": 1.1878, + "step": 1009 + }, + { + "epoch": 0.06843282065180568, + "grad_norm": 12.943589210510254, + "learning_rate": 9.964542405366555e-05, + "loss": 1.2443, + "step": 1010 + }, + { + "epoch": 0.06850057591977776, + "grad_norm": 12.108896255493164, + "learning_rate": 9.964405503456775e-05, + "loss": 1.1719, + "step": 1011 + }, + { + "epoch": 0.06856833118774985, + "grad_norm": 12.097951889038086, + "learning_rate": 9.964268601546993e-05, + "loss": 1.0653, + "step": 1012 + }, + { + "epoch": 0.06863608645572193, + "grad_norm": 14.222228050231934, + "learning_rate": 9.964131699637211e-05, + "loss": 1.3379, + "step": 1013 + }, + { + "epoch": 0.06870384172369402, + "grad_norm": 12.636894226074219, + "learning_rate": 9.963994797727429e-05, + "loss": 1.5559, + "step": 1014 + }, + { + "epoch": 0.0687715969916661, + "grad_norm": 15.458481788635254, + "learning_rate": 9.963857895817648e-05, + "loss": 1.2542, + "step": 1015 + }, + { + "epoch": 0.06883935225963819, + "grad_norm": 11.246847152709961, + "learning_rate": 9.963720993907866e-05, + "loss": 1.1944, + "step": 1016 + }, + { + "epoch": 0.06890710752761027, + "grad_norm": 11.699065208435059, + "learning_rate": 9.963584091998084e-05, + "loss": 1.0739, + "step": 1017 + }, + { + "epoch": 0.06897486279558236, + "grad_norm": 12.259678840637207, + "learning_rate": 9.963447190088302e-05, + "loss": 1.1365, + "step": 1018 + }, + { + "epoch": 0.06904261806355444, + "grad_norm": 13.594696998596191, + "learning_rate": 9.96331028817852e-05, + "loss": 1.1418, + "step": 1019 + }, + { + "epoch": 0.06911037333152653, + "grad_norm": 12.90888786315918, + "learning_rate": 9.96317338626874e-05, + "loss": 1.1987, + "step": 1020 + }, + { + "epoch": 0.06917812859949861, + "grad_norm": 13.04245662689209, + "learning_rate": 9.963036484358958e-05, + "loss": 1.4288, + "step": 1021 + }, + { + "epoch": 0.0692458838674707, + "grad_norm": 12.706077575683594, + "learning_rate": 9.962899582449176e-05, + "loss": 1.222, + "step": 1022 + }, + { + "epoch": 0.06931363913544278, + "grad_norm": 14.205679893493652, + "learning_rate": 9.962762680539394e-05, + "loss": 1.3305, + "step": 1023 + }, + { + "epoch": 0.06938139440341487, + "grad_norm": 17.09891128540039, + "learning_rate": 9.962625778629613e-05, + "loss": 1.47, + "step": 1024 + }, + { + "epoch": 0.06944914967138695, + "grad_norm": 14.603500366210938, + "learning_rate": 9.962488876719831e-05, + "loss": 1.3887, + "step": 1025 + }, + { + "epoch": 0.06951690493935904, + "grad_norm": 10.820066452026367, + "learning_rate": 9.962351974810049e-05, + "loss": 1.1543, + "step": 1026 + }, + { + "epoch": 0.06958466020733112, + "grad_norm": 10.99889850616455, + "learning_rate": 9.962215072900267e-05, + "loss": 1.2908, + "step": 1027 + }, + { + "epoch": 0.06965241547530321, + "grad_norm": 13.470711708068848, + "learning_rate": 9.962078170990485e-05, + "loss": 1.1422, + "step": 1028 + }, + { + "epoch": 0.06972017074327529, + "grad_norm": 11.086441040039062, + "learning_rate": 9.961941269080705e-05, + "loss": 0.9835, + "step": 1029 + }, + { + "epoch": 0.06978792601124738, + "grad_norm": 16.768535614013672, + "learning_rate": 9.961804367170923e-05, + "loss": 1.404, + "step": 1030 + }, + { + "epoch": 0.06985568127921946, + "grad_norm": 15.851200103759766, + "learning_rate": 9.961667465261141e-05, + "loss": 1.4874, + "step": 1031 + }, + { + "epoch": 0.06992343654719155, + "grad_norm": 11.995482444763184, + "learning_rate": 9.961530563351359e-05, + "loss": 1.1497, + "step": 1032 + }, + { + "epoch": 0.06999119181516363, + "grad_norm": 13.591619491577148, + "learning_rate": 9.961393661441577e-05, + "loss": 1.4773, + "step": 1033 + }, + { + "epoch": 0.07005894708313572, + "grad_norm": 16.878938674926758, + "learning_rate": 9.961256759531796e-05, + "loss": 1.2522, + "step": 1034 + }, + { + "epoch": 0.0701267023511078, + "grad_norm": 11.901616096496582, + "learning_rate": 9.961119857622014e-05, + "loss": 1.2164, + "step": 1035 + }, + { + "epoch": 0.07019445761907989, + "grad_norm": 14.935117721557617, + "learning_rate": 9.960982955712232e-05, + "loss": 1.1251, + "step": 1036 + }, + { + "epoch": 0.07026221288705196, + "grad_norm": 12.380253791809082, + "learning_rate": 9.96084605380245e-05, + "loss": 1.3965, + "step": 1037 + }, + { + "epoch": 0.07032996815502406, + "grad_norm": 11.645035743713379, + "learning_rate": 9.96070915189267e-05, + "loss": 1.266, + "step": 1038 + }, + { + "epoch": 0.07039772342299613, + "grad_norm": 14.525420188903809, + "learning_rate": 9.960572249982888e-05, + "loss": 1.3991, + "step": 1039 + }, + { + "epoch": 0.07046547869096823, + "grad_norm": 14.775094985961914, + "learning_rate": 9.960435348073106e-05, + "loss": 1.5958, + "step": 1040 + }, + { + "epoch": 0.0705332339589403, + "grad_norm": 10.2192964553833, + "learning_rate": 9.960298446163324e-05, + "loss": 1.1793, + "step": 1041 + }, + { + "epoch": 0.0706009892269124, + "grad_norm": 13.074480056762695, + "learning_rate": 9.960161544253542e-05, + "loss": 1.4243, + "step": 1042 + }, + { + "epoch": 0.07066874449488447, + "grad_norm": 12.679484367370605, + "learning_rate": 9.960024642343761e-05, + "loss": 1.3398, + "step": 1043 + }, + { + "epoch": 0.07073649976285656, + "grad_norm": 9.061332702636719, + "learning_rate": 9.95988774043398e-05, + "loss": 1.0036, + "step": 1044 + }, + { + "epoch": 0.07080425503082864, + "grad_norm": 13.423661231994629, + "learning_rate": 9.959750838524197e-05, + "loss": 1.2767, + "step": 1045 + }, + { + "epoch": 0.07087201029880073, + "grad_norm": 13.955148696899414, + "learning_rate": 9.959613936614415e-05, + "loss": 1.311, + "step": 1046 + }, + { + "epoch": 0.07093976556677281, + "grad_norm": 12.746015548706055, + "learning_rate": 9.959477034704635e-05, + "loss": 1.42, + "step": 1047 + }, + { + "epoch": 0.0710075208347449, + "grad_norm": 11.409982681274414, + "learning_rate": 9.959340132794853e-05, + "loss": 1.3051, + "step": 1048 + }, + { + "epoch": 0.07107527610271698, + "grad_norm": 11.801681518554688, + "learning_rate": 9.959203230885071e-05, + "loss": 1.2315, + "step": 1049 + }, + { + "epoch": 0.07114303137068907, + "grad_norm": 13.041158676147461, + "learning_rate": 9.959066328975289e-05, + "loss": 1.3487, + "step": 1050 + }, + { + "epoch": 0.07121078663866115, + "grad_norm": 13.474900245666504, + "learning_rate": 9.958929427065507e-05, + "loss": 1.4542, + "step": 1051 + }, + { + "epoch": 0.07127854190663324, + "grad_norm": 12.335237503051758, + "learning_rate": 9.958792525155726e-05, + "loss": 1.1841, + "step": 1052 + }, + { + "epoch": 0.07134629717460532, + "grad_norm": 14.909475326538086, + "learning_rate": 9.958655623245944e-05, + "loss": 1.272, + "step": 1053 + }, + { + "epoch": 0.07141405244257741, + "grad_norm": 13.449742317199707, + "learning_rate": 9.958518721336162e-05, + "loss": 1.4881, + "step": 1054 + }, + { + "epoch": 0.07148180771054949, + "grad_norm": 12.2557954788208, + "learning_rate": 9.958381819426382e-05, + "loss": 1.2059, + "step": 1055 + }, + { + "epoch": 0.07154956297852158, + "grad_norm": 13.71298885345459, + "learning_rate": 9.9582449175166e-05, + "loss": 1.2627, + "step": 1056 + }, + { + "epoch": 0.07161731824649366, + "grad_norm": 15.4293212890625, + "learning_rate": 9.958108015606818e-05, + "loss": 1.4663, + "step": 1057 + }, + { + "epoch": 0.07168507351446575, + "grad_norm": 13.665759086608887, + "learning_rate": 9.957971113697037e-05, + "loss": 1.4634, + "step": 1058 + }, + { + "epoch": 0.07175282878243784, + "grad_norm": 13.064310073852539, + "learning_rate": 9.957834211787255e-05, + "loss": 1.2686, + "step": 1059 + }, + { + "epoch": 0.07182058405040992, + "grad_norm": 15.21746826171875, + "learning_rate": 9.957697309877473e-05, + "loss": 1.4861, + "step": 1060 + }, + { + "epoch": 0.07188833931838201, + "grad_norm": 12.499883651733398, + "learning_rate": 9.957560407967693e-05, + "loss": 1.5275, + "step": 1061 + }, + { + "epoch": 0.07195609458635409, + "grad_norm": 9.355907440185547, + "learning_rate": 9.957423506057911e-05, + "loss": 1.1509, + "step": 1062 + }, + { + "epoch": 0.07202384985432618, + "grad_norm": 10.428252220153809, + "learning_rate": 9.957286604148129e-05, + "loss": 1.0874, + "step": 1063 + }, + { + "epoch": 0.07209160512229826, + "grad_norm": 11.351346015930176, + "learning_rate": 9.957149702238347e-05, + "loss": 1.0887, + "step": 1064 + }, + { + "epoch": 0.07215936039027035, + "grad_norm": 15.271830558776855, + "learning_rate": 9.957012800328565e-05, + "loss": 1.2466, + "step": 1065 + }, + { + "epoch": 0.07222711565824243, + "grad_norm": 11.172418594360352, + "learning_rate": 9.956875898418784e-05, + "loss": 1.0958, + "step": 1066 + }, + { + "epoch": 0.07229487092621452, + "grad_norm": 14.221702575683594, + "learning_rate": 9.956738996509002e-05, + "loss": 1.2288, + "step": 1067 + }, + { + "epoch": 0.0723626261941866, + "grad_norm": 12.167356491088867, + "learning_rate": 9.95660209459922e-05, + "loss": 1.1159, + "step": 1068 + }, + { + "epoch": 0.07243038146215869, + "grad_norm": 12.607329368591309, + "learning_rate": 9.956465192689438e-05, + "loss": 1.2475, + "step": 1069 + }, + { + "epoch": 0.07249813673013077, + "grad_norm": 10.59451675415039, + "learning_rate": 9.956328290779658e-05, + "loss": 1.1493, + "step": 1070 + }, + { + "epoch": 0.07256589199810286, + "grad_norm": 11.190742492675781, + "learning_rate": 9.956191388869876e-05, + "loss": 1.4549, + "step": 1071 + }, + { + "epoch": 0.07263364726607494, + "grad_norm": 11.225564956665039, + "learning_rate": 9.956054486960094e-05, + "loss": 1.0741, + "step": 1072 + }, + { + "epoch": 0.07270140253404703, + "grad_norm": 13.648139953613281, + "learning_rate": 9.955917585050312e-05, + "loss": 1.2942, + "step": 1073 + }, + { + "epoch": 0.0727691578020191, + "grad_norm": 14.606941223144531, + "learning_rate": 9.95578068314053e-05, + "loss": 1.253, + "step": 1074 + }, + { + "epoch": 0.0728369130699912, + "grad_norm": 11.043729782104492, + "learning_rate": 9.955643781230749e-05, + "loss": 1.2279, + "step": 1075 + }, + { + "epoch": 0.07290466833796327, + "grad_norm": 12.463634490966797, + "learning_rate": 9.955506879320967e-05, + "loss": 1.4399, + "step": 1076 + }, + { + "epoch": 0.07297242360593537, + "grad_norm": 13.74101734161377, + "learning_rate": 9.955369977411185e-05, + "loss": 1.4323, + "step": 1077 + }, + { + "epoch": 0.07304017887390744, + "grad_norm": 10.1694974899292, + "learning_rate": 9.955233075501403e-05, + "loss": 1.0113, + "step": 1078 + }, + { + "epoch": 0.07310793414187954, + "grad_norm": 14.407991409301758, + "learning_rate": 9.955096173591623e-05, + "loss": 1.7102, + "step": 1079 + }, + { + "epoch": 0.07317568940985161, + "grad_norm": 13.84760570526123, + "learning_rate": 9.954959271681841e-05, + "loss": 1.4491, + "step": 1080 + }, + { + "epoch": 0.0732434446778237, + "grad_norm": 12.220841407775879, + "learning_rate": 9.954822369772059e-05, + "loss": 1.1826, + "step": 1081 + }, + { + "epoch": 0.07331119994579578, + "grad_norm": 11.380377769470215, + "learning_rate": 9.954685467862277e-05, + "loss": 1.3537, + "step": 1082 + }, + { + "epoch": 0.07337895521376787, + "grad_norm": 12.916484832763672, + "learning_rate": 9.954548565952495e-05, + "loss": 1.3406, + "step": 1083 + }, + { + "epoch": 0.07344671048173995, + "grad_norm": 14.107590675354004, + "learning_rate": 9.954411664042714e-05, + "loss": 1.0531, + "step": 1084 + }, + { + "epoch": 0.07351446574971204, + "grad_norm": 13.498798370361328, + "learning_rate": 9.954274762132932e-05, + "loss": 1.1546, + "step": 1085 + }, + { + "epoch": 0.07358222101768412, + "grad_norm": 13.155747413635254, + "learning_rate": 9.95413786022315e-05, + "loss": 1.212, + "step": 1086 + }, + { + "epoch": 0.07364997628565621, + "grad_norm": 14.135687828063965, + "learning_rate": 9.954000958313368e-05, + "loss": 1.2001, + "step": 1087 + }, + { + "epoch": 0.07371773155362829, + "grad_norm": 13.123790740966797, + "learning_rate": 9.953864056403586e-05, + "loss": 1.2735, + "step": 1088 + }, + { + "epoch": 0.07378548682160038, + "grad_norm": 14.591660499572754, + "learning_rate": 9.953727154493806e-05, + "loss": 1.2977, + "step": 1089 + }, + { + "epoch": 0.07385324208957246, + "grad_norm": 13.30614948272705, + "learning_rate": 9.953590252584024e-05, + "loss": 1.2445, + "step": 1090 + }, + { + "epoch": 0.07392099735754455, + "grad_norm": 14.542524337768555, + "learning_rate": 9.953453350674242e-05, + "loss": 1.2327, + "step": 1091 + }, + { + "epoch": 0.07398875262551663, + "grad_norm": 12.926522254943848, + "learning_rate": 9.95331644876446e-05, + "loss": 1.2139, + "step": 1092 + }, + { + "epoch": 0.07405650789348872, + "grad_norm": 11.622479438781738, + "learning_rate": 9.953179546854679e-05, + "loss": 1.2887, + "step": 1093 + }, + { + "epoch": 0.0741242631614608, + "grad_norm": 14.87485122680664, + "learning_rate": 9.953042644944897e-05, + "loss": 1.3132, + "step": 1094 + }, + { + "epoch": 0.07419201842943289, + "grad_norm": 13.096991539001465, + "learning_rate": 9.952905743035115e-05, + "loss": 1.0783, + "step": 1095 + }, + { + "epoch": 0.07425977369740497, + "grad_norm": 14.676551818847656, + "learning_rate": 9.952768841125333e-05, + "loss": 1.3171, + "step": 1096 + }, + { + "epoch": 0.07432752896537706, + "grad_norm": 10.399755477905273, + "learning_rate": 9.952631939215551e-05, + "loss": 1.0747, + "step": 1097 + }, + { + "epoch": 0.07439528423334914, + "grad_norm": 11.052745819091797, + "learning_rate": 9.952495037305771e-05, + "loss": 1.1653, + "step": 1098 + }, + { + "epoch": 0.07446303950132123, + "grad_norm": 13.014352798461914, + "learning_rate": 9.952358135395989e-05, + "loss": 1.3727, + "step": 1099 + }, + { + "epoch": 0.07453079476929331, + "grad_norm": 13.202262878417969, + "learning_rate": 9.952221233486207e-05, + "loss": 1.5063, + "step": 1100 + }, + { + "epoch": 0.0745985500372654, + "grad_norm": 14.272111892700195, + "learning_rate": 9.952084331576426e-05, + "loss": 1.0183, + "step": 1101 + }, + { + "epoch": 0.07466630530523748, + "grad_norm": 14.23975658416748, + "learning_rate": 9.951947429666644e-05, + "loss": 1.3518, + "step": 1102 + }, + { + "epoch": 0.07473406057320957, + "grad_norm": 12.293742179870605, + "learning_rate": 9.951810527756862e-05, + "loss": 1.4021, + "step": 1103 + }, + { + "epoch": 0.07480181584118165, + "grad_norm": 16.026020050048828, + "learning_rate": 9.951673625847082e-05, + "loss": 1.3412, + "step": 1104 + }, + { + "epoch": 0.07486957110915374, + "grad_norm": 11.120819091796875, + "learning_rate": 9.9515367239373e-05, + "loss": 1.331, + "step": 1105 + }, + { + "epoch": 0.07493732637712582, + "grad_norm": 12.363526344299316, + "learning_rate": 9.951399822027518e-05, + "loss": 1.4284, + "step": 1106 + }, + { + "epoch": 0.0750050816450979, + "grad_norm": 14.377492904663086, + "learning_rate": 9.951262920117737e-05, + "loss": 1.3686, + "step": 1107 + }, + { + "epoch": 0.07507283691306998, + "grad_norm": 11.366288185119629, + "learning_rate": 9.951126018207955e-05, + "loss": 1.2329, + "step": 1108 + }, + { + "epoch": 0.07514059218104208, + "grad_norm": 10.26131820678711, + "learning_rate": 9.950989116298173e-05, + "loss": 1.1091, + "step": 1109 + }, + { + "epoch": 0.07520834744901415, + "grad_norm": 12.64631462097168, + "learning_rate": 9.950852214388391e-05, + "loss": 1.1231, + "step": 1110 + }, + { + "epoch": 0.07527610271698625, + "grad_norm": 13.042781829833984, + "learning_rate": 9.95071531247861e-05, + "loss": 1.2338, + "step": 1111 + }, + { + "epoch": 0.07534385798495834, + "grad_norm": 11.577115058898926, + "learning_rate": 9.950578410568829e-05, + "loss": 1.2143, + "step": 1112 + }, + { + "epoch": 0.07541161325293042, + "grad_norm": 13.640811920166016, + "learning_rate": 9.950441508659047e-05, + "loss": 1.224, + "step": 1113 + }, + { + "epoch": 0.0754793685209025, + "grad_norm": 10.271018981933594, + "learning_rate": 9.950304606749265e-05, + "loss": 1.0459, + "step": 1114 + }, + { + "epoch": 0.07554712378887458, + "grad_norm": 12.053836822509766, + "learning_rate": 9.950167704839483e-05, + "loss": 1.2137, + "step": 1115 + }, + { + "epoch": 0.07561487905684668, + "grad_norm": 10.022509574890137, + "learning_rate": 9.950030802929702e-05, + "loss": 1.092, + "step": 1116 + }, + { + "epoch": 0.07568263432481875, + "grad_norm": 12.28339672088623, + "learning_rate": 9.94989390101992e-05, + "loss": 1.1616, + "step": 1117 + }, + { + "epoch": 0.07575038959279085, + "grad_norm": 14.182686805725098, + "learning_rate": 9.949756999110138e-05, + "loss": 1.5135, + "step": 1118 + }, + { + "epoch": 0.07581814486076292, + "grad_norm": 10.668661117553711, + "learning_rate": 9.949620097200356e-05, + "loss": 1.2412, + "step": 1119 + }, + { + "epoch": 0.07588590012873501, + "grad_norm": 14.443583488464355, + "learning_rate": 9.949483195290574e-05, + "loss": 1.207, + "step": 1120 + }, + { + "epoch": 0.07595365539670709, + "grad_norm": 12.418794631958008, + "learning_rate": 9.949346293380794e-05, + "loss": 1.1907, + "step": 1121 + }, + { + "epoch": 0.07602141066467918, + "grad_norm": 12.429618835449219, + "learning_rate": 9.949209391471012e-05, + "loss": 1.4403, + "step": 1122 + }, + { + "epoch": 0.07608916593265126, + "grad_norm": 9.524622917175293, + "learning_rate": 9.94907248956123e-05, + "loss": 1.139, + "step": 1123 + }, + { + "epoch": 0.07615692120062335, + "grad_norm": 10.974812507629395, + "learning_rate": 9.948935587651448e-05, + "loss": 1.4055, + "step": 1124 + }, + { + "epoch": 0.07622467646859543, + "grad_norm": 16.152681350708008, + "learning_rate": 9.948798685741667e-05, + "loss": 1.0874, + "step": 1125 + }, + { + "epoch": 0.07629243173656752, + "grad_norm": 12.023541450500488, + "learning_rate": 9.948661783831885e-05, + "loss": 1.2378, + "step": 1126 + }, + { + "epoch": 0.0763601870045396, + "grad_norm": 11.597234725952148, + "learning_rate": 9.948524881922103e-05, + "loss": 1.0104, + "step": 1127 + }, + { + "epoch": 0.07642794227251169, + "grad_norm": 11.374302864074707, + "learning_rate": 9.948387980012321e-05, + "loss": 1.1099, + "step": 1128 + }, + { + "epoch": 0.07649569754048377, + "grad_norm": 14.398423194885254, + "learning_rate": 9.94825107810254e-05, + "loss": 1.5533, + "step": 1129 + }, + { + "epoch": 0.07656345280845586, + "grad_norm": 13.1026611328125, + "learning_rate": 9.948114176192759e-05, + "loss": 1.3948, + "step": 1130 + }, + { + "epoch": 0.07663120807642794, + "grad_norm": 12.012560844421387, + "learning_rate": 9.947977274282977e-05, + "loss": 1.2156, + "step": 1131 + }, + { + "epoch": 0.07669896334440003, + "grad_norm": 12.900229454040527, + "learning_rate": 9.947840372373195e-05, + "loss": 1.2658, + "step": 1132 + }, + { + "epoch": 0.07676671861237211, + "grad_norm": 15.250492095947266, + "learning_rate": 9.947703470463413e-05, + "loss": 1.6102, + "step": 1133 + }, + { + "epoch": 0.0768344738803442, + "grad_norm": 15.08134937286377, + "learning_rate": 9.947566568553632e-05, + "loss": 1.1947, + "step": 1134 + }, + { + "epoch": 0.07690222914831628, + "grad_norm": 13.353601455688477, + "learning_rate": 9.94742966664385e-05, + "loss": 1.3211, + "step": 1135 + }, + { + "epoch": 0.07696998441628837, + "grad_norm": 11.310175895690918, + "learning_rate": 9.947292764734068e-05, + "loss": 1.2223, + "step": 1136 + }, + { + "epoch": 0.07703773968426045, + "grad_norm": 11.800848960876465, + "learning_rate": 9.947155862824286e-05, + "loss": 1.2131, + "step": 1137 + }, + { + "epoch": 0.07710549495223254, + "grad_norm": 11.748014450073242, + "learning_rate": 9.947018960914504e-05, + "loss": 1.0734, + "step": 1138 + }, + { + "epoch": 0.07717325022020462, + "grad_norm": 12.282258033752441, + "learning_rate": 9.946882059004724e-05, + "loss": 1.3257, + "step": 1139 + }, + { + "epoch": 0.07724100548817671, + "grad_norm": 11.93818473815918, + "learning_rate": 9.946745157094942e-05, + "loss": 1.1452, + "step": 1140 + }, + { + "epoch": 0.07730876075614879, + "grad_norm": 13.397029876708984, + "learning_rate": 9.94660825518516e-05, + "loss": 1.2625, + "step": 1141 + }, + { + "epoch": 0.07737651602412088, + "grad_norm": 12.135769844055176, + "learning_rate": 9.946471353275378e-05, + "loss": 1.3624, + "step": 1142 + }, + { + "epoch": 0.07744427129209296, + "grad_norm": 11.304028511047363, + "learning_rate": 9.946334451365596e-05, + "loss": 1.3019, + "step": 1143 + }, + { + "epoch": 0.07751202656006505, + "grad_norm": 10.978137016296387, + "learning_rate": 9.946197549455815e-05, + "loss": 1.3137, + "step": 1144 + }, + { + "epoch": 0.07757978182803713, + "grad_norm": 10.997323989868164, + "learning_rate": 9.946060647546033e-05, + "loss": 1.3573, + "step": 1145 + }, + { + "epoch": 0.07764753709600922, + "grad_norm": 11.883647918701172, + "learning_rate": 9.945923745636251e-05, + "loss": 1.2587, + "step": 1146 + }, + { + "epoch": 0.0777152923639813, + "grad_norm": 10.70753288269043, + "learning_rate": 9.945786843726471e-05, + "loss": 1.0866, + "step": 1147 + }, + { + "epoch": 0.07778304763195339, + "grad_norm": 13.318743705749512, + "learning_rate": 9.945649941816689e-05, + "loss": 1.3356, + "step": 1148 + }, + { + "epoch": 0.07785080289992546, + "grad_norm": 10.656171798706055, + "learning_rate": 9.945513039906907e-05, + "loss": 1.2155, + "step": 1149 + }, + { + "epoch": 0.07791855816789756, + "grad_norm": 14.73982048034668, + "learning_rate": 9.945376137997126e-05, + "loss": 1.2276, + "step": 1150 + }, + { + "epoch": 0.07798631343586963, + "grad_norm": 12.461714744567871, + "learning_rate": 9.945239236087344e-05, + "loss": 1.2999, + "step": 1151 + }, + { + "epoch": 0.07805406870384173, + "grad_norm": 12.277376174926758, + "learning_rate": 9.945102334177562e-05, + "loss": 1.1131, + "step": 1152 + }, + { + "epoch": 0.0781218239718138, + "grad_norm": 12.726540565490723, + "learning_rate": 9.944965432267782e-05, + "loss": 1.4181, + "step": 1153 + }, + { + "epoch": 0.0781895792397859, + "grad_norm": 11.086180686950684, + "learning_rate": 9.944828530358e-05, + "loss": 1.3852, + "step": 1154 + }, + { + "epoch": 0.07825733450775797, + "grad_norm": 12.80537223815918, + "learning_rate": 9.944691628448218e-05, + "loss": 1.1101, + "step": 1155 + }, + { + "epoch": 0.07832508977573006, + "grad_norm": 11.960269927978516, + "learning_rate": 9.944554726538436e-05, + "loss": 1.2186, + "step": 1156 + }, + { + "epoch": 0.07839284504370214, + "grad_norm": 11.732439041137695, + "learning_rate": 9.944417824628655e-05, + "loss": 0.9854, + "step": 1157 + }, + { + "epoch": 0.07846060031167423, + "grad_norm": 12.578715324401855, + "learning_rate": 9.944280922718873e-05, + "loss": 1.0792, + "step": 1158 + }, + { + "epoch": 0.07852835557964631, + "grad_norm": 13.779712677001953, + "learning_rate": 9.944144020809091e-05, + "loss": 1.2468, + "step": 1159 + }, + { + "epoch": 0.0785961108476184, + "grad_norm": 13.95693588256836, + "learning_rate": 9.944007118899309e-05, + "loss": 1.234, + "step": 1160 + }, + { + "epoch": 0.07866386611559048, + "grad_norm": 12.056897163391113, + "learning_rate": 9.943870216989527e-05, + "loss": 1.3034, + "step": 1161 + }, + { + "epoch": 0.07873162138356257, + "grad_norm": 11.633442878723145, + "learning_rate": 9.943733315079747e-05, + "loss": 1.2744, + "step": 1162 + }, + { + "epoch": 0.07879937665153465, + "grad_norm": 14.062381744384766, + "learning_rate": 9.943596413169965e-05, + "loss": 1.2535, + "step": 1163 + }, + { + "epoch": 0.07886713191950674, + "grad_norm": 11.60498332977295, + "learning_rate": 9.943459511260183e-05, + "loss": 1.1578, + "step": 1164 + }, + { + "epoch": 0.07893488718747883, + "grad_norm": 9.667806625366211, + "learning_rate": 9.943322609350401e-05, + "loss": 1.0804, + "step": 1165 + }, + { + "epoch": 0.07900264245545091, + "grad_norm": 12.30827808380127, + "learning_rate": 9.943185707440619e-05, + "loss": 1.4167, + "step": 1166 + }, + { + "epoch": 0.079070397723423, + "grad_norm": 10.196819305419922, + "learning_rate": 9.943048805530838e-05, + "loss": 1.0547, + "step": 1167 + }, + { + "epoch": 0.07913815299139508, + "grad_norm": 10.029928207397461, + "learning_rate": 9.942911903621056e-05, + "loss": 1.1798, + "step": 1168 + }, + { + "epoch": 0.07920590825936717, + "grad_norm": 11.782978057861328, + "learning_rate": 9.942775001711274e-05, + "loss": 1.1454, + "step": 1169 + }, + { + "epoch": 0.07927366352733925, + "grad_norm": 12.25143814086914, + "learning_rate": 9.942638099801492e-05, + "loss": 1.2854, + "step": 1170 + }, + { + "epoch": 0.07934141879531134, + "grad_norm": 8.47904109954834, + "learning_rate": 9.942501197891712e-05, + "loss": 0.9381, + "step": 1171 + }, + { + "epoch": 0.07940917406328342, + "grad_norm": 13.698802947998047, + "learning_rate": 9.94236429598193e-05, + "loss": 1.6261, + "step": 1172 + }, + { + "epoch": 0.07947692933125551, + "grad_norm": 10.885397911071777, + "learning_rate": 9.942227394072148e-05, + "loss": 1.0645, + "step": 1173 + }, + { + "epoch": 0.07954468459922759, + "grad_norm": 13.274818420410156, + "learning_rate": 9.942090492162366e-05, + "loss": 1.3411, + "step": 1174 + }, + { + "epoch": 0.07961243986719968, + "grad_norm": 14.061238288879395, + "learning_rate": 9.941953590252584e-05, + "loss": 1.1916, + "step": 1175 + }, + { + "epoch": 0.07968019513517176, + "grad_norm": 10.074264526367188, + "learning_rate": 9.941816688342803e-05, + "loss": 1.092, + "step": 1176 + }, + { + "epoch": 0.07974795040314385, + "grad_norm": 14.741287231445312, + "learning_rate": 9.941679786433021e-05, + "loss": 1.3774, + "step": 1177 + }, + { + "epoch": 0.07981570567111593, + "grad_norm": 11.308422088623047, + "learning_rate": 9.941542884523239e-05, + "loss": 0.9298, + "step": 1178 + }, + { + "epoch": 0.07988346093908802, + "grad_norm": 14.375280380249023, + "learning_rate": 9.941405982613457e-05, + "loss": 1.4525, + "step": 1179 + }, + { + "epoch": 0.0799512162070601, + "grad_norm": 11.724523544311523, + "learning_rate": 9.941269080703677e-05, + "loss": 1.3616, + "step": 1180 + }, + { + "epoch": 0.08001897147503219, + "grad_norm": 12.578176498413086, + "learning_rate": 9.941132178793895e-05, + "loss": 1.1513, + "step": 1181 + }, + { + "epoch": 0.08008672674300427, + "grad_norm": 12.100804328918457, + "learning_rate": 9.940995276884113e-05, + "loss": 1.0656, + "step": 1182 + }, + { + "epoch": 0.08015448201097636, + "grad_norm": 11.680248260498047, + "learning_rate": 9.940858374974331e-05, + "loss": 1.2358, + "step": 1183 + }, + { + "epoch": 0.08022223727894844, + "grad_norm": 10.066198348999023, + "learning_rate": 9.940721473064549e-05, + "loss": 0.9219, + "step": 1184 + }, + { + "epoch": 0.08028999254692053, + "grad_norm": 10.813334465026855, + "learning_rate": 9.940584571154768e-05, + "loss": 1.1456, + "step": 1185 + }, + { + "epoch": 0.0803577478148926, + "grad_norm": 14.004862785339355, + "learning_rate": 9.940447669244986e-05, + "loss": 1.2375, + "step": 1186 + }, + { + "epoch": 0.0804255030828647, + "grad_norm": 11.868766784667969, + "learning_rate": 9.940310767335204e-05, + "loss": 1.3027, + "step": 1187 + }, + { + "epoch": 0.08049325835083677, + "grad_norm": 12.48153018951416, + "learning_rate": 9.940173865425422e-05, + "loss": 1.1192, + "step": 1188 + }, + { + "epoch": 0.08056101361880887, + "grad_norm": 12.340612411499023, + "learning_rate": 9.940036963515642e-05, + "loss": 1.246, + "step": 1189 + }, + { + "epoch": 0.08062876888678094, + "grad_norm": 12.205392837524414, + "learning_rate": 9.93990006160586e-05, + "loss": 1.1233, + "step": 1190 + }, + { + "epoch": 0.08069652415475304, + "grad_norm": 12.69509220123291, + "learning_rate": 9.939763159696078e-05, + "loss": 1.2202, + "step": 1191 + }, + { + "epoch": 0.08076427942272511, + "grad_norm": 12.40784740447998, + "learning_rate": 9.939626257786296e-05, + "loss": 1.1345, + "step": 1192 + }, + { + "epoch": 0.0808320346906972, + "grad_norm": 11.678507804870605, + "learning_rate": 9.939489355876515e-05, + "loss": 1.4513, + "step": 1193 + }, + { + "epoch": 0.08089978995866928, + "grad_norm": 11.649873733520508, + "learning_rate": 9.939352453966733e-05, + "loss": 1.3827, + "step": 1194 + }, + { + "epoch": 0.08096754522664137, + "grad_norm": 12.378853797912598, + "learning_rate": 9.939215552056951e-05, + "loss": 1.3632, + "step": 1195 + }, + { + "epoch": 0.08103530049461345, + "grad_norm": 11.023188591003418, + "learning_rate": 9.93907865014717e-05, + "loss": 1.3892, + "step": 1196 + }, + { + "epoch": 0.08110305576258554, + "grad_norm": 13.111897468566895, + "learning_rate": 9.938941748237389e-05, + "loss": 1.1973, + "step": 1197 + }, + { + "epoch": 0.08117081103055762, + "grad_norm": 10.171613693237305, + "learning_rate": 9.938804846327607e-05, + "loss": 1.3862, + "step": 1198 + }, + { + "epoch": 0.08123856629852971, + "grad_norm": 13.327658653259277, + "learning_rate": 9.938667944417826e-05, + "loss": 1.3173, + "step": 1199 + }, + { + "epoch": 0.08130632156650179, + "grad_norm": 11.715154647827148, + "learning_rate": 9.938531042508044e-05, + "loss": 1.0839, + "step": 1200 + }, + { + "epoch": 0.08137407683447388, + "grad_norm": 11.521212577819824, + "learning_rate": 9.938394140598262e-05, + "loss": 1.0285, + "step": 1201 + }, + { + "epoch": 0.08144183210244596, + "grad_norm": 12.024236679077148, + "learning_rate": 9.93825723868848e-05, + "loss": 1.3025, + "step": 1202 + }, + { + "epoch": 0.08150958737041805, + "grad_norm": 10.245376586914062, + "learning_rate": 9.9381203367787e-05, + "loss": 1.1658, + "step": 1203 + }, + { + "epoch": 0.08157734263839013, + "grad_norm": 10.731759071350098, + "learning_rate": 9.937983434868918e-05, + "loss": 1.0565, + "step": 1204 + }, + { + "epoch": 0.08164509790636222, + "grad_norm": 16.714153289794922, + "learning_rate": 9.937846532959136e-05, + "loss": 1.2761, + "step": 1205 + }, + { + "epoch": 0.0817128531743343, + "grad_norm": 11.172699928283691, + "learning_rate": 9.937709631049354e-05, + "loss": 1.1887, + "step": 1206 + }, + { + "epoch": 0.08178060844230639, + "grad_norm": 11.384743690490723, + "learning_rate": 9.937572729139572e-05, + "loss": 0.9546, + "step": 1207 + }, + { + "epoch": 0.08184836371027847, + "grad_norm": 11.163822174072266, + "learning_rate": 9.937435827229791e-05, + "loss": 1.1498, + "step": 1208 + }, + { + "epoch": 0.08191611897825056, + "grad_norm": 10.657593727111816, + "learning_rate": 9.937298925320009e-05, + "loss": 0.9506, + "step": 1209 + }, + { + "epoch": 0.08198387424622264, + "grad_norm": 10.6862211227417, + "learning_rate": 9.937162023410227e-05, + "loss": 1.2308, + "step": 1210 + }, + { + "epoch": 0.08205162951419473, + "grad_norm": 10.649473190307617, + "learning_rate": 9.937025121500445e-05, + "loss": 1.2679, + "step": 1211 + }, + { + "epoch": 0.08211938478216681, + "grad_norm": 9.298782348632812, + "learning_rate": 9.936888219590665e-05, + "loss": 1.1327, + "step": 1212 + }, + { + "epoch": 0.0821871400501389, + "grad_norm": 11.448348045349121, + "learning_rate": 9.936751317680883e-05, + "loss": 1.2796, + "step": 1213 + }, + { + "epoch": 0.08225489531811098, + "grad_norm": 12.141517639160156, + "learning_rate": 9.936614415771101e-05, + "loss": 1.1128, + "step": 1214 + }, + { + "epoch": 0.08232265058608307, + "grad_norm": 11.799830436706543, + "learning_rate": 9.936477513861319e-05, + "loss": 1.0743, + "step": 1215 + }, + { + "epoch": 0.08239040585405515, + "grad_norm": 11.952958106994629, + "learning_rate": 9.936340611951537e-05, + "loss": 1.1659, + "step": 1216 + }, + { + "epoch": 0.08245816112202724, + "grad_norm": 11.870144844055176, + "learning_rate": 9.936203710041756e-05, + "loss": 1.1195, + "step": 1217 + }, + { + "epoch": 0.08252591638999933, + "grad_norm": 11.970368385314941, + "learning_rate": 9.936066808131974e-05, + "loss": 1.2139, + "step": 1218 + }, + { + "epoch": 0.08259367165797141, + "grad_norm": 11.211687088012695, + "learning_rate": 9.935929906222192e-05, + "loss": 1.2107, + "step": 1219 + }, + { + "epoch": 0.0826614269259435, + "grad_norm": 12.987563133239746, + "learning_rate": 9.93579300431241e-05, + "loss": 1.2804, + "step": 1220 + }, + { + "epoch": 0.08272918219391558, + "grad_norm": 12.337888717651367, + "learning_rate": 9.935656102402628e-05, + "loss": 0.8713, + "step": 1221 + }, + { + "epoch": 0.08279693746188767, + "grad_norm": 11.717150688171387, + "learning_rate": 9.935519200492848e-05, + "loss": 1.2684, + "step": 1222 + }, + { + "epoch": 0.08286469272985975, + "grad_norm": 12.658769607543945, + "learning_rate": 9.935382298583066e-05, + "loss": 1.0231, + "step": 1223 + }, + { + "epoch": 0.08293244799783184, + "grad_norm": 10.526476860046387, + "learning_rate": 9.935245396673284e-05, + "loss": 1.0605, + "step": 1224 + }, + { + "epoch": 0.08300020326580392, + "grad_norm": 10.575004577636719, + "learning_rate": 9.935108494763502e-05, + "loss": 1.2169, + "step": 1225 + }, + { + "epoch": 0.083067958533776, + "grad_norm": 10.499407768249512, + "learning_rate": 9.934971592853721e-05, + "loss": 1.3064, + "step": 1226 + }, + { + "epoch": 0.08313571380174808, + "grad_norm": 12.25387191772461, + "learning_rate": 9.934834690943939e-05, + "loss": 1.1554, + "step": 1227 + }, + { + "epoch": 0.08320346906972018, + "grad_norm": 12.173775672912598, + "learning_rate": 9.934697789034157e-05, + "loss": 1.0917, + "step": 1228 + }, + { + "epoch": 0.08327122433769225, + "grad_norm": 12.424721717834473, + "learning_rate": 9.934560887124375e-05, + "loss": 1.3679, + "step": 1229 + }, + { + "epoch": 0.08333897960566435, + "grad_norm": 11.291987419128418, + "learning_rate": 9.934423985214593e-05, + "loss": 1.1439, + "step": 1230 + }, + { + "epoch": 0.08340673487363642, + "grad_norm": 10.908637046813965, + "learning_rate": 9.934287083304813e-05, + "loss": 1.0487, + "step": 1231 + }, + { + "epoch": 0.08347449014160851, + "grad_norm": 14.767544746398926, + "learning_rate": 9.934150181395031e-05, + "loss": 1.2951, + "step": 1232 + }, + { + "epoch": 0.08354224540958059, + "grad_norm": 11.959871292114258, + "learning_rate": 9.934013279485249e-05, + "loss": 1.1244, + "step": 1233 + }, + { + "epoch": 0.08361000067755268, + "grad_norm": 11.19450569152832, + "learning_rate": 9.933876377575467e-05, + "loss": 1.2802, + "step": 1234 + }, + { + "epoch": 0.08367775594552476, + "grad_norm": 10.71377182006836, + "learning_rate": 9.933739475665686e-05, + "loss": 1.0376, + "step": 1235 + }, + { + "epoch": 0.08374551121349685, + "grad_norm": 12.174454689025879, + "learning_rate": 9.933602573755904e-05, + "loss": 1.5779, + "step": 1236 + }, + { + "epoch": 0.08381326648146893, + "grad_norm": 9.863836288452148, + "learning_rate": 9.933465671846122e-05, + "loss": 1.0443, + "step": 1237 + }, + { + "epoch": 0.08388102174944102, + "grad_norm": 11.964838027954102, + "learning_rate": 9.93332876993634e-05, + "loss": 1.0613, + "step": 1238 + }, + { + "epoch": 0.0839487770174131, + "grad_norm": 11.49203109741211, + "learning_rate": 9.93319186802656e-05, + "loss": 1.162, + "step": 1239 + }, + { + "epoch": 0.08401653228538519, + "grad_norm": 12.548815727233887, + "learning_rate": 9.933054966116778e-05, + "loss": 1.0652, + "step": 1240 + }, + { + "epoch": 0.08408428755335727, + "grad_norm": 13.37637996673584, + "learning_rate": 9.932918064206996e-05, + "loss": 1.2281, + "step": 1241 + }, + { + "epoch": 0.08415204282132936, + "grad_norm": 12.59211254119873, + "learning_rate": 9.932781162297215e-05, + "loss": 1.0624, + "step": 1242 + }, + { + "epoch": 0.08421979808930144, + "grad_norm": 13.386221885681152, + "learning_rate": 9.932644260387433e-05, + "loss": 1.4393, + "step": 1243 + }, + { + "epoch": 0.08428755335727353, + "grad_norm": 12.647525787353516, + "learning_rate": 9.932507358477651e-05, + "loss": 1.2228, + "step": 1244 + }, + { + "epoch": 0.08435530862524561, + "grad_norm": 12.039474487304688, + "learning_rate": 9.93237045656787e-05, + "loss": 1.1354, + "step": 1245 + }, + { + "epoch": 0.0844230638932177, + "grad_norm": 11.373556137084961, + "learning_rate": 9.932233554658089e-05, + "loss": 1.2418, + "step": 1246 + }, + { + "epoch": 0.08449081916118978, + "grad_norm": 10.944781303405762, + "learning_rate": 9.932096652748307e-05, + "loss": 1.1515, + "step": 1247 + }, + { + "epoch": 0.08455857442916187, + "grad_norm": 12.174854278564453, + "learning_rate": 9.931959750838525e-05, + "loss": 1.2063, + "step": 1248 + }, + { + "epoch": 0.08462632969713395, + "grad_norm": 8.846879005432129, + "learning_rate": 9.931822848928744e-05, + "loss": 0.9639, + "step": 1249 + }, + { + "epoch": 0.08469408496510604, + "grad_norm": 13.793547630310059, + "learning_rate": 9.931685947018962e-05, + "loss": 1.1236, + "step": 1250 + }, + { + "epoch": 0.08476184023307812, + "grad_norm": 14.486831665039062, + "learning_rate": 9.93154904510918e-05, + "loss": 1.2047, + "step": 1251 + }, + { + "epoch": 0.08482959550105021, + "grad_norm": 13.262588500976562, + "learning_rate": 9.931412143199398e-05, + "loss": 1.4091, + "step": 1252 + }, + { + "epoch": 0.08489735076902229, + "grad_norm": 13.289068222045898, + "learning_rate": 9.931275241289616e-05, + "loss": 1.5065, + "step": 1253 + }, + { + "epoch": 0.08496510603699438, + "grad_norm": 10.22205638885498, + "learning_rate": 9.931138339379836e-05, + "loss": 1.1116, + "step": 1254 + }, + { + "epoch": 0.08503286130496646, + "grad_norm": 13.141668319702148, + "learning_rate": 9.931001437470054e-05, + "loss": 1.4006, + "step": 1255 + }, + { + "epoch": 0.08510061657293855, + "grad_norm": 11.817032814025879, + "learning_rate": 9.930864535560272e-05, + "loss": 1.2062, + "step": 1256 + }, + { + "epoch": 0.08516837184091063, + "grad_norm": 10.814498901367188, + "learning_rate": 9.93072763365049e-05, + "loss": 1.138, + "step": 1257 + }, + { + "epoch": 0.08523612710888272, + "grad_norm": 12.598155975341797, + "learning_rate": 9.930590731740709e-05, + "loss": 1.1959, + "step": 1258 + }, + { + "epoch": 0.0853038823768548, + "grad_norm": 11.909974098205566, + "learning_rate": 9.930453829830927e-05, + "loss": 1.2891, + "step": 1259 + }, + { + "epoch": 0.08537163764482689, + "grad_norm": 12.663064956665039, + "learning_rate": 9.930316927921145e-05, + "loss": 1.3017, + "step": 1260 + }, + { + "epoch": 0.08543939291279896, + "grad_norm": 10.810627937316895, + "learning_rate": 9.930180026011363e-05, + "loss": 1.16, + "step": 1261 + }, + { + "epoch": 0.08550714818077106, + "grad_norm": 9.182926177978516, + "learning_rate": 9.930043124101581e-05, + "loss": 1.2625, + "step": 1262 + }, + { + "epoch": 0.08557490344874313, + "grad_norm": 12.73978042602539, + "learning_rate": 9.9299062221918e-05, + "loss": 0.9904, + "step": 1263 + }, + { + "epoch": 0.08564265871671523, + "grad_norm": 10.346587181091309, + "learning_rate": 9.929769320282019e-05, + "loss": 1.0167, + "step": 1264 + }, + { + "epoch": 0.0857104139846873, + "grad_norm": 12.145682334899902, + "learning_rate": 9.929632418372237e-05, + "loss": 1.4282, + "step": 1265 + }, + { + "epoch": 0.0857781692526594, + "grad_norm": 11.515445709228516, + "learning_rate": 9.929495516462455e-05, + "loss": 1.0527, + "step": 1266 + }, + { + "epoch": 0.08584592452063147, + "grad_norm": 10.979050636291504, + "learning_rate": 9.929358614552674e-05, + "loss": 1.2888, + "step": 1267 + }, + { + "epoch": 0.08591367978860356, + "grad_norm": 10.248215675354004, + "learning_rate": 9.929221712642892e-05, + "loss": 1.4316, + "step": 1268 + }, + { + "epoch": 0.08598143505657564, + "grad_norm": 13.743851661682129, + "learning_rate": 9.92908481073311e-05, + "loss": 1.5047, + "step": 1269 + }, + { + "epoch": 0.08604919032454773, + "grad_norm": 9.919225692749023, + "learning_rate": 9.928947908823328e-05, + "loss": 0.9602, + "step": 1270 + }, + { + "epoch": 0.08611694559251983, + "grad_norm": 9.795915603637695, + "learning_rate": 9.928811006913546e-05, + "loss": 1.3736, + "step": 1271 + }, + { + "epoch": 0.0861847008604919, + "grad_norm": 13.108200073242188, + "learning_rate": 9.928674105003766e-05, + "loss": 1.2591, + "step": 1272 + }, + { + "epoch": 0.086252456128464, + "grad_norm": 13.108073234558105, + "learning_rate": 9.928537203093984e-05, + "loss": 1.102, + "step": 1273 + }, + { + "epoch": 0.08632021139643607, + "grad_norm": 15.177817344665527, + "learning_rate": 9.928400301184202e-05, + "loss": 1.3164, + "step": 1274 + }, + { + "epoch": 0.08638796666440816, + "grad_norm": 13.048440933227539, + "learning_rate": 9.92826339927442e-05, + "loss": 1.3029, + "step": 1275 + }, + { + "epoch": 0.08645572193238024, + "grad_norm": 10.982895851135254, + "learning_rate": 9.928126497364638e-05, + "loss": 1.1829, + "step": 1276 + }, + { + "epoch": 0.08652347720035233, + "grad_norm": 10.901629447937012, + "learning_rate": 9.927989595454857e-05, + "loss": 1.2116, + "step": 1277 + }, + { + "epoch": 0.08659123246832441, + "grad_norm": 12.924722671508789, + "learning_rate": 9.927852693545075e-05, + "loss": 1.6292, + "step": 1278 + }, + { + "epoch": 0.0866589877362965, + "grad_norm": 12.572770118713379, + "learning_rate": 9.927715791635293e-05, + "loss": 1.1012, + "step": 1279 + }, + { + "epoch": 0.08672674300426858, + "grad_norm": 12.803020477294922, + "learning_rate": 9.927578889725511e-05, + "loss": 1.276, + "step": 1280 + }, + { + "epoch": 0.08679449827224067, + "grad_norm": 10.92810344696045, + "learning_rate": 9.92744198781573e-05, + "loss": 1.1876, + "step": 1281 + }, + { + "epoch": 0.08686225354021275, + "grad_norm": 12.253180503845215, + "learning_rate": 9.927305085905949e-05, + "loss": 1.1807, + "step": 1282 + }, + { + "epoch": 0.08693000880818484, + "grad_norm": 11.163126945495605, + "learning_rate": 9.927168183996167e-05, + "loss": 1.1671, + "step": 1283 + }, + { + "epoch": 0.08699776407615692, + "grad_norm": 10.726607322692871, + "learning_rate": 9.927031282086385e-05, + "loss": 1.185, + "step": 1284 + }, + { + "epoch": 0.08706551934412901, + "grad_norm": 13.265491485595703, + "learning_rate": 9.926894380176603e-05, + "loss": 1.2916, + "step": 1285 + }, + { + "epoch": 0.08713327461210109, + "grad_norm": 14.559592247009277, + "learning_rate": 9.926757478266822e-05, + "loss": 1.288, + "step": 1286 + }, + { + "epoch": 0.08720102988007318, + "grad_norm": 14.816813468933105, + "learning_rate": 9.92662057635704e-05, + "loss": 1.4397, + "step": 1287 + }, + { + "epoch": 0.08726878514804526, + "grad_norm": 11.811420440673828, + "learning_rate": 9.926483674447258e-05, + "loss": 1.348, + "step": 1288 + }, + { + "epoch": 0.08733654041601735, + "grad_norm": 10.920133590698242, + "learning_rate": 9.926346772537478e-05, + "loss": 1.4169, + "step": 1289 + }, + { + "epoch": 0.08740429568398943, + "grad_norm": 11.690089225769043, + "learning_rate": 9.926209870627696e-05, + "loss": 1.1199, + "step": 1290 + }, + { + "epoch": 0.08747205095196152, + "grad_norm": 9.411031723022461, + "learning_rate": 9.926072968717914e-05, + "loss": 1.031, + "step": 1291 + }, + { + "epoch": 0.0875398062199336, + "grad_norm": 12.174457550048828, + "learning_rate": 9.925936066808133e-05, + "loss": 1.0622, + "step": 1292 + }, + { + "epoch": 0.08760756148790569, + "grad_norm": 10.346089363098145, + "learning_rate": 9.925799164898351e-05, + "loss": 1.2777, + "step": 1293 + }, + { + "epoch": 0.08767531675587777, + "grad_norm": 12.534863471984863, + "learning_rate": 9.925662262988569e-05, + "loss": 1.0093, + "step": 1294 + }, + { + "epoch": 0.08774307202384986, + "grad_norm": 12.050302505493164, + "learning_rate": 9.925525361078789e-05, + "loss": 1.0591, + "step": 1295 + }, + { + "epoch": 0.08781082729182194, + "grad_norm": 11.556166648864746, + "learning_rate": 9.925388459169007e-05, + "loss": 1.1452, + "step": 1296 + }, + { + "epoch": 0.08787858255979403, + "grad_norm": 9.693270683288574, + "learning_rate": 9.925251557259225e-05, + "loss": 1.0332, + "step": 1297 + }, + { + "epoch": 0.0879463378277661, + "grad_norm": 12.646526336669922, + "learning_rate": 9.925114655349443e-05, + "loss": 1.1894, + "step": 1298 + }, + { + "epoch": 0.0880140930957382, + "grad_norm": 10.676809310913086, + "learning_rate": 9.924977753439661e-05, + "loss": 1.1727, + "step": 1299 + }, + { + "epoch": 0.08808184836371027, + "grad_norm": 11.182327270507812, + "learning_rate": 9.92484085152988e-05, + "loss": 1.0145, + "step": 1300 + }, + { + "epoch": 0.08814960363168237, + "grad_norm": 11.55026626586914, + "learning_rate": 9.924703949620098e-05, + "loss": 1.2187, + "step": 1301 + }, + { + "epoch": 0.08821735889965444, + "grad_norm": 11.502679824829102, + "learning_rate": 9.924567047710316e-05, + "loss": 1.3116, + "step": 1302 + }, + { + "epoch": 0.08828511416762654, + "grad_norm": 9.676247596740723, + "learning_rate": 9.924430145800534e-05, + "loss": 1.2869, + "step": 1303 + }, + { + "epoch": 0.08835286943559861, + "grad_norm": 15.123950004577637, + "learning_rate": 9.924293243890754e-05, + "loss": 1.0225, + "step": 1304 + }, + { + "epoch": 0.0884206247035707, + "grad_norm": 14.030994415283203, + "learning_rate": 9.924156341980972e-05, + "loss": 0.9829, + "step": 1305 + }, + { + "epoch": 0.08848837997154278, + "grad_norm": 10.00402545928955, + "learning_rate": 9.92401944007119e-05, + "loss": 1.0946, + "step": 1306 + }, + { + "epoch": 0.08855613523951487, + "grad_norm": 9.077853202819824, + "learning_rate": 9.923882538161408e-05, + "loss": 0.852, + "step": 1307 + }, + { + "epoch": 0.08862389050748695, + "grad_norm": 12.777885437011719, + "learning_rate": 9.923745636251626e-05, + "loss": 1.0513, + "step": 1308 + }, + { + "epoch": 0.08869164577545904, + "grad_norm": 10.686469078063965, + "learning_rate": 9.923608734341845e-05, + "loss": 1.0873, + "step": 1309 + }, + { + "epoch": 0.08875940104343112, + "grad_norm": 11.51689338684082, + "learning_rate": 9.923471832432063e-05, + "loss": 1.0037, + "step": 1310 + }, + { + "epoch": 0.08882715631140321, + "grad_norm": 13.259784698486328, + "learning_rate": 9.923334930522281e-05, + "loss": 1.2004, + "step": 1311 + }, + { + "epoch": 0.08889491157937529, + "grad_norm": 10.63463306427002, + "learning_rate": 9.923198028612499e-05, + "loss": 1.2067, + "step": 1312 + }, + { + "epoch": 0.08896266684734738, + "grad_norm": 10.838210105895996, + "learning_rate": 9.923061126702719e-05, + "loss": 1.2939, + "step": 1313 + }, + { + "epoch": 0.08903042211531946, + "grad_norm": 12.058418273925781, + "learning_rate": 9.922924224792937e-05, + "loss": 1.3091, + "step": 1314 + }, + { + "epoch": 0.08909817738329155, + "grad_norm": 10.619451522827148, + "learning_rate": 9.922787322883155e-05, + "loss": 1.1635, + "step": 1315 + }, + { + "epoch": 0.08916593265126363, + "grad_norm": 9.37607192993164, + "learning_rate": 9.922650420973373e-05, + "loss": 1.2206, + "step": 1316 + }, + { + "epoch": 0.08923368791923572, + "grad_norm": 12.119776725769043, + "learning_rate": 9.922513519063591e-05, + "loss": 1.2159, + "step": 1317 + }, + { + "epoch": 0.0893014431872078, + "grad_norm": 14.013461112976074, + "learning_rate": 9.92237661715381e-05, + "loss": 1.1445, + "step": 1318 + }, + { + "epoch": 0.08936919845517989, + "grad_norm": 11.560707092285156, + "learning_rate": 9.922239715244028e-05, + "loss": 1.3617, + "step": 1319 + }, + { + "epoch": 0.08943695372315197, + "grad_norm": 11.817791938781738, + "learning_rate": 9.922102813334246e-05, + "loss": 1.1931, + "step": 1320 + }, + { + "epoch": 0.08950470899112406, + "grad_norm": 11.544127464294434, + "learning_rate": 9.921965911424464e-05, + "loss": 1.0766, + "step": 1321 + }, + { + "epoch": 0.08957246425909614, + "grad_norm": 10.67740249633789, + "learning_rate": 9.921829009514684e-05, + "loss": 1.2103, + "step": 1322 + }, + { + "epoch": 0.08964021952706823, + "grad_norm": 11.774645805358887, + "learning_rate": 9.921692107604902e-05, + "loss": 1.1921, + "step": 1323 + }, + { + "epoch": 0.08970797479504032, + "grad_norm": 12.594759941101074, + "learning_rate": 9.92155520569512e-05, + "loss": 0.906, + "step": 1324 + }, + { + "epoch": 0.0897757300630124, + "grad_norm": 10.988224983215332, + "learning_rate": 9.921418303785338e-05, + "loss": 1.4195, + "step": 1325 + }, + { + "epoch": 0.08984348533098449, + "grad_norm": 12.22718620300293, + "learning_rate": 9.921281401875556e-05, + "loss": 1.2461, + "step": 1326 + }, + { + "epoch": 0.08991124059895657, + "grad_norm": 10.964727401733398, + "learning_rate": 9.921144499965775e-05, + "loss": 1.1254, + "step": 1327 + }, + { + "epoch": 0.08997899586692866, + "grad_norm": 11.327523231506348, + "learning_rate": 9.921007598055993e-05, + "loss": 1.145, + "step": 1328 + }, + { + "epoch": 0.09004675113490074, + "grad_norm": 9.870691299438477, + "learning_rate": 9.920870696146211e-05, + "loss": 1.2408, + "step": 1329 + }, + { + "epoch": 0.09011450640287283, + "grad_norm": 11.02373218536377, + "learning_rate": 9.920733794236429e-05, + "loss": 1.1084, + "step": 1330 + }, + { + "epoch": 0.09018226167084491, + "grad_norm": 12.300410270690918, + "learning_rate": 9.920596892326647e-05, + "loss": 1.1542, + "step": 1331 + }, + { + "epoch": 0.090250016938817, + "grad_norm": 9.832919120788574, + "learning_rate": 9.920459990416867e-05, + "loss": 1.1195, + "step": 1332 + }, + { + "epoch": 0.09031777220678908, + "grad_norm": 10.432522773742676, + "learning_rate": 9.920323088507085e-05, + "loss": 1.1825, + "step": 1333 + }, + { + "epoch": 0.09038552747476117, + "grad_norm": 11.878792762756348, + "learning_rate": 9.920186186597303e-05, + "loss": 0.9647, + "step": 1334 + }, + { + "epoch": 0.09045328274273325, + "grad_norm": 11.866320610046387, + "learning_rate": 9.920049284687522e-05, + "loss": 1.3598, + "step": 1335 + }, + { + "epoch": 0.09052103801070534, + "grad_norm": 14.11543083190918, + "learning_rate": 9.91991238277774e-05, + "loss": 1.4172, + "step": 1336 + }, + { + "epoch": 0.09058879327867742, + "grad_norm": 13.841622352600098, + "learning_rate": 9.919775480867958e-05, + "loss": 1.4119, + "step": 1337 + }, + { + "epoch": 0.0906565485466495, + "grad_norm": 11.077167510986328, + "learning_rate": 9.919638578958178e-05, + "loss": 0.984, + "step": 1338 + }, + { + "epoch": 0.09072430381462158, + "grad_norm": 10.904266357421875, + "learning_rate": 9.919501677048396e-05, + "loss": 1.0439, + "step": 1339 + }, + { + "epoch": 0.09079205908259368, + "grad_norm": 11.623948097229004, + "learning_rate": 9.919364775138614e-05, + "loss": 1.0318, + "step": 1340 + }, + { + "epoch": 0.09085981435056575, + "grad_norm": 10.893725395202637, + "learning_rate": 9.919227873228833e-05, + "loss": 1.2626, + "step": 1341 + }, + { + "epoch": 0.09092756961853785, + "grad_norm": 10.064491271972656, + "learning_rate": 9.919090971319051e-05, + "loss": 1.1483, + "step": 1342 + }, + { + "epoch": 0.09099532488650992, + "grad_norm": 9.854101181030273, + "learning_rate": 9.918954069409269e-05, + "loss": 1.1595, + "step": 1343 + }, + { + "epoch": 0.09106308015448202, + "grad_norm": 11.682498931884766, + "learning_rate": 9.918817167499487e-05, + "loss": 1.2887, + "step": 1344 + }, + { + "epoch": 0.09113083542245409, + "grad_norm": 10.484097480773926, + "learning_rate": 9.918680265589707e-05, + "loss": 1.2, + "step": 1345 + }, + { + "epoch": 0.09119859069042618, + "grad_norm": 12.332358360290527, + "learning_rate": 9.918543363679925e-05, + "loss": 1.262, + "step": 1346 + }, + { + "epoch": 0.09126634595839826, + "grad_norm": 13.706925392150879, + "learning_rate": 9.918406461770143e-05, + "loss": 1.2151, + "step": 1347 + }, + { + "epoch": 0.09133410122637035, + "grad_norm": 13.918478965759277, + "learning_rate": 9.91826955986036e-05, + "loss": 1.4653, + "step": 1348 + }, + { + "epoch": 0.09140185649434243, + "grad_norm": 11.970015525817871, + "learning_rate": 9.918132657950579e-05, + "loss": 1.2452, + "step": 1349 + }, + { + "epoch": 0.09146961176231452, + "grad_norm": 9.698074340820312, + "learning_rate": 9.917995756040798e-05, + "loss": 0.9142, + "step": 1350 + }, + { + "epoch": 0.0915373670302866, + "grad_norm": 9.225728988647461, + "learning_rate": 9.917858854131016e-05, + "loss": 1.0293, + "step": 1351 + }, + { + "epoch": 0.09160512229825869, + "grad_norm": 10.77661418914795, + "learning_rate": 9.917721952221234e-05, + "loss": 1.1865, + "step": 1352 + }, + { + "epoch": 0.09167287756623077, + "grad_norm": 10.45409870147705, + "learning_rate": 9.917585050311452e-05, + "loss": 1.1553, + "step": 1353 + }, + { + "epoch": 0.09174063283420286, + "grad_norm": 10.44918441772461, + "learning_rate": 9.91744814840167e-05, + "loss": 1.1621, + "step": 1354 + }, + { + "epoch": 0.09180838810217494, + "grad_norm": 14.769590377807617, + "learning_rate": 9.91731124649189e-05, + "loss": 1.3502, + "step": 1355 + }, + { + "epoch": 0.09187614337014703, + "grad_norm": 12.733844757080078, + "learning_rate": 9.917174344582108e-05, + "loss": 1.1973, + "step": 1356 + }, + { + "epoch": 0.09194389863811911, + "grad_norm": 13.365818977355957, + "learning_rate": 9.917037442672326e-05, + "loss": 1.5412, + "step": 1357 + }, + { + "epoch": 0.0920116539060912, + "grad_norm": 11.163050651550293, + "learning_rate": 9.916900540762544e-05, + "loss": 1.3142, + "step": 1358 + }, + { + "epoch": 0.09207940917406328, + "grad_norm": 11.420190811157227, + "learning_rate": 9.916763638852763e-05, + "loss": 1.2257, + "step": 1359 + }, + { + "epoch": 0.09214716444203537, + "grad_norm": 9.66398811340332, + "learning_rate": 9.916626736942981e-05, + "loss": 1.0584, + "step": 1360 + }, + { + "epoch": 0.09221491971000745, + "grad_norm": 13.643363952636719, + "learning_rate": 9.916489835033199e-05, + "loss": 1.3783, + "step": 1361 + }, + { + "epoch": 0.09228267497797954, + "grad_norm": 11.658889770507812, + "learning_rate": 9.916352933123417e-05, + "loss": 1.2126, + "step": 1362 + }, + { + "epoch": 0.09235043024595162, + "grad_norm": 11.2728271484375, + "learning_rate": 9.916216031213635e-05, + "loss": 1.2117, + "step": 1363 + }, + { + "epoch": 0.09241818551392371, + "grad_norm": 13.576864242553711, + "learning_rate": 9.916079129303855e-05, + "loss": 1.2922, + "step": 1364 + }, + { + "epoch": 0.09248594078189579, + "grad_norm": 10.950700759887695, + "learning_rate": 9.915942227394073e-05, + "loss": 1.1394, + "step": 1365 + }, + { + "epoch": 0.09255369604986788, + "grad_norm": 11.638351440429688, + "learning_rate": 9.91580532548429e-05, + "loss": 1.3927, + "step": 1366 + }, + { + "epoch": 0.09262145131783996, + "grad_norm": 12.355545043945312, + "learning_rate": 9.915668423574509e-05, + "loss": 1.1861, + "step": 1367 + }, + { + "epoch": 0.09268920658581205, + "grad_norm": 11.543237686157227, + "learning_rate": 9.915531521664728e-05, + "loss": 0.9868, + "step": 1368 + }, + { + "epoch": 0.09275696185378413, + "grad_norm": 11.684252738952637, + "learning_rate": 9.915394619754946e-05, + "loss": 1.3332, + "step": 1369 + }, + { + "epoch": 0.09282471712175622, + "grad_norm": 10.775650024414062, + "learning_rate": 9.915257717845164e-05, + "loss": 1.1164, + "step": 1370 + }, + { + "epoch": 0.0928924723897283, + "grad_norm": 11.649751663208008, + "learning_rate": 9.915120815935382e-05, + "loss": 1.0369, + "step": 1371 + }, + { + "epoch": 0.09296022765770039, + "grad_norm": 9.741403579711914, + "learning_rate": 9.9149839140256e-05, + "loss": 1.0509, + "step": 1372 + }, + { + "epoch": 0.09302798292567246, + "grad_norm": 13.804118156433105, + "learning_rate": 9.91484701211582e-05, + "loss": 1.4318, + "step": 1373 + }, + { + "epoch": 0.09309573819364456, + "grad_norm": 10.939459800720215, + "learning_rate": 9.914710110206038e-05, + "loss": 1.218, + "step": 1374 + }, + { + "epoch": 0.09316349346161663, + "grad_norm": 14.076252937316895, + "learning_rate": 9.914573208296256e-05, + "loss": 1.4699, + "step": 1375 + }, + { + "epoch": 0.09323124872958873, + "grad_norm": 12.90072250366211, + "learning_rate": 9.914436306386474e-05, + "loss": 1.3418, + "step": 1376 + }, + { + "epoch": 0.09329900399756082, + "grad_norm": 12.7711820602417, + "learning_rate": 9.914299404476693e-05, + "loss": 0.9866, + "step": 1377 + }, + { + "epoch": 0.0933667592655329, + "grad_norm": 9.205671310424805, + "learning_rate": 9.914162502566911e-05, + "loss": 1.1345, + "step": 1378 + }, + { + "epoch": 0.09343451453350499, + "grad_norm": 11.027194023132324, + "learning_rate": 9.914025600657129e-05, + "loss": 1.2637, + "step": 1379 + }, + { + "epoch": 0.09350226980147706, + "grad_norm": 12.861044883728027, + "learning_rate": 9.913888698747347e-05, + "loss": 1.2348, + "step": 1380 + }, + { + "epoch": 0.09357002506944916, + "grad_norm": 11.266969680786133, + "learning_rate": 9.913751796837567e-05, + "loss": 1.0773, + "step": 1381 + }, + { + "epoch": 0.09363778033742123, + "grad_norm": 13.137110710144043, + "learning_rate": 9.913614894927785e-05, + "loss": 1.0537, + "step": 1382 + }, + { + "epoch": 0.09370553560539333, + "grad_norm": 11.343362808227539, + "learning_rate": 9.913477993018003e-05, + "loss": 1.335, + "step": 1383 + }, + { + "epoch": 0.0937732908733654, + "grad_norm": 11.472663879394531, + "learning_rate": 9.913341091108222e-05, + "loss": 1.1362, + "step": 1384 + }, + { + "epoch": 0.0938410461413375, + "grad_norm": 8.441573143005371, + "learning_rate": 9.91320418919844e-05, + "loss": 1.1646, + "step": 1385 + }, + { + "epoch": 0.09390880140930957, + "grad_norm": 12.570130348205566, + "learning_rate": 9.913067287288658e-05, + "loss": 1.3535, + "step": 1386 + }, + { + "epoch": 0.09397655667728166, + "grad_norm": 11.671664237976074, + "learning_rate": 9.912930385378877e-05, + "loss": 1.1707, + "step": 1387 + }, + { + "epoch": 0.09404431194525374, + "grad_norm": 12.638328552246094, + "learning_rate": 9.912793483469096e-05, + "loss": 1.057, + "step": 1388 + }, + { + "epoch": 0.09411206721322583, + "grad_norm": 10.506028175354004, + "learning_rate": 9.912656581559314e-05, + "loss": 0.9279, + "step": 1389 + }, + { + "epoch": 0.09417982248119791, + "grad_norm": 11.536858558654785, + "learning_rate": 9.912519679649532e-05, + "loss": 1.3351, + "step": 1390 + }, + { + "epoch": 0.09424757774917, + "grad_norm": 12.692436218261719, + "learning_rate": 9.912382777739751e-05, + "loss": 1.164, + "step": 1391 + }, + { + "epoch": 0.09431533301714208, + "grad_norm": 12.088066101074219, + "learning_rate": 9.912245875829969e-05, + "loss": 1.2971, + "step": 1392 + }, + { + "epoch": 0.09438308828511417, + "grad_norm": 12.133123397827148, + "learning_rate": 9.912108973920187e-05, + "loss": 1.2742, + "step": 1393 + }, + { + "epoch": 0.09445084355308625, + "grad_norm": 10.168001174926758, + "learning_rate": 9.911972072010405e-05, + "loss": 1.0964, + "step": 1394 + }, + { + "epoch": 0.09451859882105834, + "grad_norm": 10.561311721801758, + "learning_rate": 9.911835170100623e-05, + "loss": 1.1828, + "step": 1395 + }, + { + "epoch": 0.09458635408903042, + "grad_norm": 11.497330665588379, + "learning_rate": 9.911698268190843e-05, + "loss": 1.4699, + "step": 1396 + }, + { + "epoch": 0.09465410935700251, + "grad_norm": 12.190573692321777, + "learning_rate": 9.91156136628106e-05, + "loss": 1.1601, + "step": 1397 + }, + { + "epoch": 0.09472186462497459, + "grad_norm": 10.633028030395508, + "learning_rate": 9.911424464371279e-05, + "loss": 1.333, + "step": 1398 + }, + { + "epoch": 0.09478961989294668, + "grad_norm": 12.262279510498047, + "learning_rate": 9.911287562461497e-05, + "loss": 1.2214, + "step": 1399 + }, + { + "epoch": 0.09485737516091876, + "grad_norm": 11.506840705871582, + "learning_rate": 9.911150660551716e-05, + "loss": 1.1172, + "step": 1400 + }, + { + "epoch": 0.09492513042889085, + "grad_norm": 11.453936576843262, + "learning_rate": 9.911013758641934e-05, + "loss": 1.2205, + "step": 1401 + }, + { + "epoch": 0.09499288569686293, + "grad_norm": 9.980772972106934, + "learning_rate": 9.910876856732152e-05, + "loss": 1.1004, + "step": 1402 + }, + { + "epoch": 0.09506064096483502, + "grad_norm": 11.775416374206543, + "learning_rate": 9.91073995482237e-05, + "loss": 1.1587, + "step": 1403 + }, + { + "epoch": 0.0951283962328071, + "grad_norm": 8.840147972106934, + "learning_rate": 9.910603052912588e-05, + "loss": 0.8702, + "step": 1404 + }, + { + "epoch": 0.09519615150077919, + "grad_norm": 10.938506126403809, + "learning_rate": 9.910466151002808e-05, + "loss": 1.085, + "step": 1405 + }, + { + "epoch": 0.09526390676875127, + "grad_norm": 11.733402252197266, + "learning_rate": 9.910329249093026e-05, + "loss": 1.1202, + "step": 1406 + }, + { + "epoch": 0.09533166203672336, + "grad_norm": 11.616521835327148, + "learning_rate": 9.910192347183244e-05, + "loss": 1.3483, + "step": 1407 + }, + { + "epoch": 0.09539941730469544, + "grad_norm": 12.477338790893555, + "learning_rate": 9.910055445273462e-05, + "loss": 1.4798, + "step": 1408 + }, + { + "epoch": 0.09546717257266753, + "grad_norm": 11.233193397521973, + "learning_rate": 9.90991854336368e-05, + "loss": 1.0304, + "step": 1409 + }, + { + "epoch": 0.0955349278406396, + "grad_norm": 12.586124420166016, + "learning_rate": 9.909781641453899e-05, + "loss": 1.2216, + "step": 1410 + }, + { + "epoch": 0.0956026831086117, + "grad_norm": 12.974738121032715, + "learning_rate": 9.909644739544117e-05, + "loss": 1.1495, + "step": 1411 + }, + { + "epoch": 0.09567043837658377, + "grad_norm": 9.613628387451172, + "learning_rate": 9.909507837634335e-05, + "loss": 0.8326, + "step": 1412 + }, + { + "epoch": 0.09573819364455587, + "grad_norm": 10.644312858581543, + "learning_rate": 9.909370935724553e-05, + "loss": 1.0182, + "step": 1413 + }, + { + "epoch": 0.09580594891252794, + "grad_norm": 11.155874252319336, + "learning_rate": 9.909234033814773e-05, + "loss": 1.2887, + "step": 1414 + }, + { + "epoch": 0.09587370418050004, + "grad_norm": 12.068909645080566, + "learning_rate": 9.90909713190499e-05, + "loss": 1.1697, + "step": 1415 + }, + { + "epoch": 0.09594145944847211, + "grad_norm": 10.66831111907959, + "learning_rate": 9.908960229995209e-05, + "loss": 1.1275, + "step": 1416 + }, + { + "epoch": 0.0960092147164442, + "grad_norm": 11.80036449432373, + "learning_rate": 9.908823328085427e-05, + "loss": 1.38, + "step": 1417 + }, + { + "epoch": 0.09607696998441628, + "grad_norm": 11.677534103393555, + "learning_rate": 9.908686426175645e-05, + "loss": 1.075, + "step": 1418 + }, + { + "epoch": 0.09614472525238837, + "grad_norm": 10.54027271270752, + "learning_rate": 9.908549524265864e-05, + "loss": 0.9617, + "step": 1419 + }, + { + "epoch": 0.09621248052036045, + "grad_norm": 9.70718002319336, + "learning_rate": 9.908412622356082e-05, + "loss": 1.0395, + "step": 1420 + }, + { + "epoch": 0.09628023578833254, + "grad_norm": 10.439559936523438, + "learning_rate": 9.9082757204463e-05, + "loss": 1.4112, + "step": 1421 + }, + { + "epoch": 0.09634799105630462, + "grad_norm": 9.328675270080566, + "learning_rate": 9.908138818536518e-05, + "loss": 1.0481, + "step": 1422 + }, + { + "epoch": 0.09641574632427671, + "grad_norm": 12.834508895874023, + "learning_rate": 9.908001916626738e-05, + "loss": 1.0863, + "step": 1423 + }, + { + "epoch": 0.09648350159224879, + "grad_norm": 11.885201454162598, + "learning_rate": 9.907865014716956e-05, + "loss": 1.3509, + "step": 1424 + }, + { + "epoch": 0.09655125686022088, + "grad_norm": 11.299174308776855, + "learning_rate": 9.907728112807174e-05, + "loss": 1.1328, + "step": 1425 + }, + { + "epoch": 0.09661901212819296, + "grad_norm": 13.024226188659668, + "learning_rate": 9.907591210897392e-05, + "loss": 1.1495, + "step": 1426 + }, + { + "epoch": 0.09668676739616505, + "grad_norm": 13.418682098388672, + "learning_rate": 9.907454308987611e-05, + "loss": 1.3827, + "step": 1427 + }, + { + "epoch": 0.09675452266413713, + "grad_norm": 11.28375244140625, + "learning_rate": 9.907317407077829e-05, + "loss": 1.3658, + "step": 1428 + }, + { + "epoch": 0.09682227793210922, + "grad_norm": 9.711199760437012, + "learning_rate": 9.907180505168047e-05, + "loss": 0.8743, + "step": 1429 + }, + { + "epoch": 0.09689003320008131, + "grad_norm": 12.292948722839355, + "learning_rate": 9.907043603258267e-05, + "loss": 1.203, + "step": 1430 + }, + { + "epoch": 0.09695778846805339, + "grad_norm": 13.195072174072266, + "learning_rate": 9.906906701348485e-05, + "loss": 1.0403, + "step": 1431 + }, + { + "epoch": 0.09702554373602548, + "grad_norm": 11.45721435546875, + "learning_rate": 9.906769799438703e-05, + "loss": 1.2075, + "step": 1432 + }, + { + "epoch": 0.09709329900399756, + "grad_norm": 10.477989196777344, + "learning_rate": 9.906632897528922e-05, + "loss": 1.1646, + "step": 1433 + }, + { + "epoch": 0.09716105427196965, + "grad_norm": 12.572269439697266, + "learning_rate": 9.90649599561914e-05, + "loss": 1.3944, + "step": 1434 + }, + { + "epoch": 0.09722880953994173, + "grad_norm": 9.37205982208252, + "learning_rate": 9.906359093709358e-05, + "loss": 0.9902, + "step": 1435 + }, + { + "epoch": 0.09729656480791382, + "grad_norm": 11.590779304504395, + "learning_rate": 9.906222191799576e-05, + "loss": 1.2658, + "step": 1436 + }, + { + "epoch": 0.0973643200758859, + "grad_norm": 10.35207748413086, + "learning_rate": 9.906085289889795e-05, + "loss": 0.7905, + "step": 1437 + }, + { + "epoch": 0.09743207534385799, + "grad_norm": 9.993937492370605, + "learning_rate": 9.905948387980013e-05, + "loss": 1.1522, + "step": 1438 + }, + { + "epoch": 0.09749983061183007, + "grad_norm": 9.865569114685059, + "learning_rate": 9.905811486070232e-05, + "loss": 1.2536, + "step": 1439 + }, + { + "epoch": 0.09756758587980216, + "grad_norm": 12.836588859558105, + "learning_rate": 9.90567458416045e-05, + "loss": 1.2216, + "step": 1440 + }, + { + "epoch": 0.09763534114777424, + "grad_norm": 10.062298774719238, + "learning_rate": 9.905537682250668e-05, + "loss": 1.1576, + "step": 1441 + }, + { + "epoch": 0.09770309641574633, + "grad_norm": 10.897071838378906, + "learning_rate": 9.905400780340887e-05, + "loss": 1.1315, + "step": 1442 + }, + { + "epoch": 0.09777085168371841, + "grad_norm": 10.366122245788574, + "learning_rate": 9.905263878431105e-05, + "loss": 1.274, + "step": 1443 + }, + { + "epoch": 0.0978386069516905, + "grad_norm": 11.632966995239258, + "learning_rate": 9.905126976521323e-05, + "loss": 1.1427, + "step": 1444 + }, + { + "epoch": 0.09790636221966258, + "grad_norm": 10.537737846374512, + "learning_rate": 9.904990074611541e-05, + "loss": 1.0913, + "step": 1445 + }, + { + "epoch": 0.09797411748763467, + "grad_norm": 9.52363109588623, + "learning_rate": 9.90485317270176e-05, + "loss": 0.8677, + "step": 1446 + }, + { + "epoch": 0.09804187275560675, + "grad_norm": 11.511491775512695, + "learning_rate": 9.904716270791979e-05, + "loss": 1.0381, + "step": 1447 + }, + { + "epoch": 0.09810962802357884, + "grad_norm": 12.085793495178223, + "learning_rate": 9.904579368882197e-05, + "loss": 1.1415, + "step": 1448 + }, + { + "epoch": 0.09817738329155092, + "grad_norm": 8.665430068969727, + "learning_rate": 9.904442466972415e-05, + "loss": 1.0367, + "step": 1449 + }, + { + "epoch": 0.09824513855952301, + "grad_norm": 10.900618553161621, + "learning_rate": 9.904305565062633e-05, + "loss": 0.9835, + "step": 1450 + }, + { + "epoch": 0.09831289382749508, + "grad_norm": 10.3113431930542, + "learning_rate": 9.904168663152852e-05, + "loss": 0.924, + "step": 1451 + }, + { + "epoch": 0.09838064909546718, + "grad_norm": 10.001591682434082, + "learning_rate": 9.90403176124307e-05, + "loss": 1.0835, + "step": 1452 + }, + { + "epoch": 0.09844840436343925, + "grad_norm": 11.333273887634277, + "learning_rate": 9.903894859333288e-05, + "loss": 1.072, + "step": 1453 + }, + { + "epoch": 0.09851615963141135, + "grad_norm": 10.107904434204102, + "learning_rate": 9.903757957423506e-05, + "loss": 1.0848, + "step": 1454 + }, + { + "epoch": 0.09858391489938342, + "grad_norm": 12.578730583190918, + "learning_rate": 9.903621055513725e-05, + "loss": 1.2735, + "step": 1455 + }, + { + "epoch": 0.09865167016735552, + "grad_norm": 10.453478813171387, + "learning_rate": 9.903484153603944e-05, + "loss": 1.3141, + "step": 1456 + }, + { + "epoch": 0.09871942543532759, + "grad_norm": 10.383566856384277, + "learning_rate": 9.903347251694162e-05, + "loss": 1.0992, + "step": 1457 + }, + { + "epoch": 0.09878718070329968, + "grad_norm": 9.612902641296387, + "learning_rate": 9.90321034978438e-05, + "loss": 1.3103, + "step": 1458 + }, + { + "epoch": 0.09885493597127176, + "grad_norm": 12.111359596252441, + "learning_rate": 9.903073447874598e-05, + "loss": 1.3102, + "step": 1459 + }, + { + "epoch": 0.09892269123924385, + "grad_norm": 9.987195014953613, + "learning_rate": 9.902936545964817e-05, + "loss": 1.1961, + "step": 1460 + }, + { + "epoch": 0.09899044650721593, + "grad_norm": 10.900408744812012, + "learning_rate": 9.902799644055035e-05, + "loss": 1.0208, + "step": 1461 + }, + { + "epoch": 0.09905820177518802, + "grad_norm": 9.94915771484375, + "learning_rate": 9.902662742145253e-05, + "loss": 1.3347, + "step": 1462 + }, + { + "epoch": 0.0991259570431601, + "grad_norm": 13.393661499023438, + "learning_rate": 9.902525840235471e-05, + "loss": 1.2903, + "step": 1463 + }, + { + "epoch": 0.09919371231113219, + "grad_norm": 10.122967720031738, + "learning_rate": 9.902388938325689e-05, + "loss": 1.0229, + "step": 1464 + }, + { + "epoch": 0.09926146757910427, + "grad_norm": 10.775031089782715, + "learning_rate": 9.902252036415909e-05, + "loss": 1.1669, + "step": 1465 + }, + { + "epoch": 0.09932922284707636, + "grad_norm": 9.733497619628906, + "learning_rate": 9.902115134506127e-05, + "loss": 1.0903, + "step": 1466 + }, + { + "epoch": 0.09939697811504844, + "grad_norm": 9.230277061462402, + "learning_rate": 9.901978232596345e-05, + "loss": 1.1738, + "step": 1467 + }, + { + "epoch": 0.09946473338302053, + "grad_norm": 10.822884559631348, + "learning_rate": 9.901841330686563e-05, + "loss": 1.3392, + "step": 1468 + }, + { + "epoch": 0.09953248865099261, + "grad_norm": 10.64195442199707, + "learning_rate": 9.901704428776782e-05, + "loss": 1.0823, + "step": 1469 + }, + { + "epoch": 0.0996002439189647, + "grad_norm": 13.73645305633545, + "learning_rate": 9.901567526867e-05, + "loss": 1.1128, + "step": 1470 + }, + { + "epoch": 0.09966799918693678, + "grad_norm": 11.361958503723145, + "learning_rate": 9.901430624957218e-05, + "loss": 1.083, + "step": 1471 + }, + { + "epoch": 0.09973575445490887, + "grad_norm": 10.839045524597168, + "learning_rate": 9.901293723047436e-05, + "loss": 1.0288, + "step": 1472 + }, + { + "epoch": 0.09980350972288095, + "grad_norm": 10.41995906829834, + "learning_rate": 9.901156821137656e-05, + "loss": 1.0631, + "step": 1473 + }, + { + "epoch": 0.09987126499085304, + "grad_norm": 11.87709903717041, + "learning_rate": 9.901019919227874e-05, + "loss": 1.0715, + "step": 1474 + }, + { + "epoch": 0.09993902025882512, + "grad_norm": 10.46670913696289, + "learning_rate": 9.900883017318092e-05, + "loss": 1.1684, + "step": 1475 + }, + { + "epoch": 0.10000677552679721, + "grad_norm": 12.163457870483398, + "learning_rate": 9.900746115408311e-05, + "loss": 1.4416, + "step": 1476 + }, + { + "epoch": 0.10007453079476929, + "grad_norm": 13.417581558227539, + "learning_rate": 9.900609213498529e-05, + "loss": 1.1876, + "step": 1477 + }, + { + "epoch": 0.10014228606274138, + "grad_norm": 11.35722541809082, + "learning_rate": 9.900472311588747e-05, + "loss": 1.1389, + "step": 1478 + }, + { + "epoch": 0.10021004133071346, + "grad_norm": 10.042820930480957, + "learning_rate": 9.900335409678966e-05, + "loss": 1.2156, + "step": 1479 + }, + { + "epoch": 0.10027779659868555, + "grad_norm": 10.823782920837402, + "learning_rate": 9.900198507769184e-05, + "loss": 1.1915, + "step": 1480 + }, + { + "epoch": 0.10034555186665763, + "grad_norm": 13.6808443069458, + "learning_rate": 9.900061605859403e-05, + "loss": 1.1724, + "step": 1481 + }, + { + "epoch": 0.10041330713462972, + "grad_norm": 11.16846752166748, + "learning_rate": 9.89992470394962e-05, + "loss": 1.2093, + "step": 1482 + }, + { + "epoch": 0.10048106240260181, + "grad_norm": 10.391450881958008, + "learning_rate": 9.89978780203984e-05, + "loss": 1.142, + "step": 1483 + }, + { + "epoch": 0.10054881767057389, + "grad_norm": 9.324288368225098, + "learning_rate": 9.899650900130058e-05, + "loss": 1.0579, + "step": 1484 + }, + { + "epoch": 0.10061657293854598, + "grad_norm": 12.601625442504883, + "learning_rate": 9.899513998220276e-05, + "loss": 1.2643, + "step": 1485 + }, + { + "epoch": 0.10068432820651806, + "grad_norm": 14.270779609680176, + "learning_rate": 9.899377096310494e-05, + "loss": 1.2903, + "step": 1486 + }, + { + "epoch": 0.10075208347449015, + "grad_norm": 11.521232604980469, + "learning_rate": 9.899240194400712e-05, + "loss": 1.2385, + "step": 1487 + }, + { + "epoch": 0.10081983874246223, + "grad_norm": 10.76693344116211, + "learning_rate": 9.899103292490931e-05, + "loss": 1.1745, + "step": 1488 + }, + { + "epoch": 0.10088759401043432, + "grad_norm": 9.091184616088867, + "learning_rate": 9.89896639058115e-05, + "loss": 0.9571, + "step": 1489 + }, + { + "epoch": 0.1009553492784064, + "grad_norm": 11.930106163024902, + "learning_rate": 9.898829488671368e-05, + "loss": 1.1226, + "step": 1490 + }, + { + "epoch": 0.10102310454637849, + "grad_norm": 10.90937614440918, + "learning_rate": 9.898692586761586e-05, + "loss": 1.0776, + "step": 1491 + }, + { + "epoch": 0.10109085981435056, + "grad_norm": 10.618545532226562, + "learning_rate": 9.898555684851805e-05, + "loss": 1.1251, + "step": 1492 + }, + { + "epoch": 0.10115861508232266, + "grad_norm": 10.228861808776855, + "learning_rate": 9.898418782942023e-05, + "loss": 1.1987, + "step": 1493 + }, + { + "epoch": 0.10122637035029473, + "grad_norm": 8.807862281799316, + "learning_rate": 9.898281881032241e-05, + "loss": 0.9339, + "step": 1494 + }, + { + "epoch": 0.10129412561826683, + "grad_norm": 11.24593448638916, + "learning_rate": 9.898144979122459e-05, + "loss": 1.1495, + "step": 1495 + }, + { + "epoch": 0.1013618808862389, + "grad_norm": 11.192438125610352, + "learning_rate": 9.898008077212677e-05, + "loss": 1.1361, + "step": 1496 + }, + { + "epoch": 0.101429636154211, + "grad_norm": 10.440075874328613, + "learning_rate": 9.897871175302896e-05, + "loss": 1.2273, + "step": 1497 + }, + { + "epoch": 0.10149739142218307, + "grad_norm": 11.103675842285156, + "learning_rate": 9.897734273393115e-05, + "loss": 1.1308, + "step": 1498 + }, + { + "epoch": 0.10156514669015516, + "grad_norm": 9.78297233581543, + "learning_rate": 9.897597371483333e-05, + "loss": 1.0907, + "step": 1499 + }, + { + "epoch": 0.10163290195812724, + "grad_norm": 10.98086166381836, + "learning_rate": 9.89746046957355e-05, + "loss": 0.9743, + "step": 1500 + }, + { + "epoch": 0.10170065722609933, + "grad_norm": 9.268783569335938, + "learning_rate": 9.89732356766377e-05, + "loss": 0.8917, + "step": 1501 + }, + { + "epoch": 0.10176841249407141, + "grad_norm": 12.674605369567871, + "learning_rate": 9.897186665753988e-05, + "loss": 1.2247, + "step": 1502 + }, + { + "epoch": 0.1018361677620435, + "grad_norm": 10.987565040588379, + "learning_rate": 9.897049763844206e-05, + "loss": 1.0688, + "step": 1503 + }, + { + "epoch": 0.10190392303001558, + "grad_norm": 16.014053344726562, + "learning_rate": 9.896912861934424e-05, + "loss": 0.8334, + "step": 1504 + }, + { + "epoch": 0.10197167829798767, + "grad_norm": 11.119991302490234, + "learning_rate": 9.896775960024642e-05, + "loss": 1.3176, + "step": 1505 + }, + { + "epoch": 0.10203943356595975, + "grad_norm": 13.23279094696045, + "learning_rate": 9.896639058114861e-05, + "loss": 1.1625, + "step": 1506 + }, + { + "epoch": 0.10210718883393184, + "grad_norm": 9.3678560256958, + "learning_rate": 9.89650215620508e-05, + "loss": 1.2774, + "step": 1507 + }, + { + "epoch": 0.10217494410190392, + "grad_norm": 10.829100608825684, + "learning_rate": 9.896365254295298e-05, + "loss": 1.2423, + "step": 1508 + }, + { + "epoch": 0.10224269936987601, + "grad_norm": 12.12694263458252, + "learning_rate": 9.896228352385516e-05, + "loss": 1.2276, + "step": 1509 + }, + { + "epoch": 0.10231045463784809, + "grad_norm": 11.626548767089844, + "learning_rate": 9.896091450475735e-05, + "loss": 1.0871, + "step": 1510 + }, + { + "epoch": 0.10237820990582018, + "grad_norm": 11.388608932495117, + "learning_rate": 9.895954548565953e-05, + "loss": 1.5582, + "step": 1511 + }, + { + "epoch": 0.10244596517379226, + "grad_norm": 9.463730812072754, + "learning_rate": 9.895817646656171e-05, + "loss": 1.1066, + "step": 1512 + }, + { + "epoch": 0.10251372044176435, + "grad_norm": 10.291573524475098, + "learning_rate": 9.895680744746389e-05, + "loss": 1.4026, + "step": 1513 + }, + { + "epoch": 0.10258147570973643, + "grad_norm": 9.778963088989258, + "learning_rate": 9.895543842836607e-05, + "loss": 1.1109, + "step": 1514 + }, + { + "epoch": 0.10264923097770852, + "grad_norm": 9.685966491699219, + "learning_rate": 9.895406940926827e-05, + "loss": 1.1633, + "step": 1515 + }, + { + "epoch": 0.1027169862456806, + "grad_norm": 10.76310920715332, + "learning_rate": 9.895270039017045e-05, + "loss": 1.0813, + "step": 1516 + }, + { + "epoch": 0.10278474151365269, + "grad_norm": 9.795347213745117, + "learning_rate": 9.895133137107263e-05, + "loss": 1.2079, + "step": 1517 + }, + { + "epoch": 0.10285249678162477, + "grad_norm": 9.980990409851074, + "learning_rate": 9.89499623519748e-05, + "loss": 1.0701, + "step": 1518 + }, + { + "epoch": 0.10292025204959686, + "grad_norm": 9.682209014892578, + "learning_rate": 9.894859333287699e-05, + "loss": 1.4133, + "step": 1519 + }, + { + "epoch": 0.10298800731756894, + "grad_norm": 10.632065773010254, + "learning_rate": 9.894722431377918e-05, + "loss": 1.0631, + "step": 1520 + }, + { + "epoch": 0.10305576258554103, + "grad_norm": 10.099474906921387, + "learning_rate": 9.894585529468136e-05, + "loss": 1.2015, + "step": 1521 + }, + { + "epoch": 0.1031235178535131, + "grad_norm": 8.289199829101562, + "learning_rate": 9.894448627558354e-05, + "loss": 1.3009, + "step": 1522 + }, + { + "epoch": 0.1031912731214852, + "grad_norm": 9.403796195983887, + "learning_rate": 9.894311725648573e-05, + "loss": 1.113, + "step": 1523 + }, + { + "epoch": 0.10325902838945727, + "grad_norm": 12.6613130569458, + "learning_rate": 9.894174823738792e-05, + "loss": 0.9394, + "step": 1524 + }, + { + "epoch": 0.10332678365742937, + "grad_norm": 9.85255241394043, + "learning_rate": 9.894037921829011e-05, + "loss": 1.1007, + "step": 1525 + }, + { + "epoch": 0.10339453892540144, + "grad_norm": 11.918173789978027, + "learning_rate": 9.893901019919229e-05, + "loss": 1.1817, + "step": 1526 + }, + { + "epoch": 0.10346229419337354, + "grad_norm": 9.994447708129883, + "learning_rate": 9.893764118009447e-05, + "loss": 1.2522, + "step": 1527 + }, + { + "epoch": 0.10353004946134561, + "grad_norm": 9.879289627075195, + "learning_rate": 9.893627216099665e-05, + "loss": 1.152, + "step": 1528 + }, + { + "epoch": 0.1035978047293177, + "grad_norm": 10.103482246398926, + "learning_rate": 9.893490314189884e-05, + "loss": 1.2227, + "step": 1529 + }, + { + "epoch": 0.10366555999728978, + "grad_norm": 11.173476219177246, + "learning_rate": 9.893353412280102e-05, + "loss": 1.2188, + "step": 1530 + }, + { + "epoch": 0.10373331526526187, + "grad_norm": 11.540877342224121, + "learning_rate": 9.89321651037032e-05, + "loss": 1.1669, + "step": 1531 + }, + { + "epoch": 0.10380107053323395, + "grad_norm": 10.706154823303223, + "learning_rate": 9.893079608460539e-05, + "loss": 1.2771, + "step": 1532 + }, + { + "epoch": 0.10386882580120604, + "grad_norm": 11.781739234924316, + "learning_rate": 9.892942706550758e-05, + "loss": 1.0501, + "step": 1533 + }, + { + "epoch": 0.10393658106917812, + "grad_norm": 10.482099533081055, + "learning_rate": 9.892805804640976e-05, + "loss": 1.2327, + "step": 1534 + }, + { + "epoch": 0.10400433633715021, + "grad_norm": 12.450867652893066, + "learning_rate": 9.892668902731194e-05, + "loss": 1.3575, + "step": 1535 + }, + { + "epoch": 0.1040720916051223, + "grad_norm": 10.236811637878418, + "learning_rate": 9.892532000821412e-05, + "loss": 1.2559, + "step": 1536 + }, + { + "epoch": 0.10413984687309438, + "grad_norm": 9.311124801635742, + "learning_rate": 9.89239509891163e-05, + "loss": 1.1143, + "step": 1537 + }, + { + "epoch": 0.10420760214106647, + "grad_norm": 9.182706832885742, + "learning_rate": 9.89225819700185e-05, + "loss": 0.8875, + "step": 1538 + }, + { + "epoch": 0.10427535740903855, + "grad_norm": 12.762700080871582, + "learning_rate": 9.892121295092067e-05, + "loss": 1.0552, + "step": 1539 + }, + { + "epoch": 0.10434311267701064, + "grad_norm": 12.222203254699707, + "learning_rate": 9.891984393182285e-05, + "loss": 1.3185, + "step": 1540 + }, + { + "epoch": 0.10441086794498272, + "grad_norm": 11.45807933807373, + "learning_rate": 9.891847491272504e-05, + "loss": 1.2952, + "step": 1541 + }, + { + "epoch": 0.10447862321295481, + "grad_norm": 10.445068359375, + "learning_rate": 9.891710589362722e-05, + "loss": 1.1637, + "step": 1542 + }, + { + "epoch": 0.10454637848092689, + "grad_norm": 11.758063316345215, + "learning_rate": 9.891573687452941e-05, + "loss": 1.3202, + "step": 1543 + }, + { + "epoch": 0.10461413374889898, + "grad_norm": 10.176533699035645, + "learning_rate": 9.891436785543159e-05, + "loss": 1.1762, + "step": 1544 + }, + { + "epoch": 0.10468188901687106, + "grad_norm": 10.133155822753906, + "learning_rate": 9.891299883633377e-05, + "loss": 1.2226, + "step": 1545 + }, + { + "epoch": 0.10474964428484315, + "grad_norm": 9.883895874023438, + "learning_rate": 9.891162981723595e-05, + "loss": 0.9687, + "step": 1546 + }, + { + "epoch": 0.10481739955281523, + "grad_norm": 13.175050735473633, + "learning_rate": 9.891026079813814e-05, + "loss": 1.2075, + "step": 1547 + }, + { + "epoch": 0.10488515482078732, + "grad_norm": 11.183597564697266, + "learning_rate": 9.890889177904032e-05, + "loss": 1.0917, + "step": 1548 + }, + { + "epoch": 0.1049529100887594, + "grad_norm": 10.135035514831543, + "learning_rate": 9.89075227599425e-05, + "loss": 1.0087, + "step": 1549 + }, + { + "epoch": 0.10502066535673149, + "grad_norm": 16.02760887145996, + "learning_rate": 9.890615374084469e-05, + "loss": 1.2015, + "step": 1550 + }, + { + "epoch": 0.10508842062470357, + "grad_norm": 11.255363464355469, + "learning_rate": 9.890478472174687e-05, + "loss": 1.1106, + "step": 1551 + }, + { + "epoch": 0.10515617589267566, + "grad_norm": 10.740998268127441, + "learning_rate": 9.890341570264906e-05, + "loss": 1.1835, + "step": 1552 + }, + { + "epoch": 0.10522393116064774, + "grad_norm": 11.84919548034668, + "learning_rate": 9.890204668355124e-05, + "loss": 1.1655, + "step": 1553 + }, + { + "epoch": 0.10529168642861983, + "grad_norm": 12.041108131408691, + "learning_rate": 9.890067766445342e-05, + "loss": 1.2544, + "step": 1554 + }, + { + "epoch": 0.10535944169659191, + "grad_norm": 8.966646194458008, + "learning_rate": 9.88993086453556e-05, + "loss": 1.098, + "step": 1555 + }, + { + "epoch": 0.105427196964564, + "grad_norm": 11.838338851928711, + "learning_rate": 9.88979396262578e-05, + "loss": 1.1774, + "step": 1556 + }, + { + "epoch": 0.10549495223253608, + "grad_norm": 11.186326026916504, + "learning_rate": 9.889657060715997e-05, + "loss": 1.1778, + "step": 1557 + }, + { + "epoch": 0.10556270750050817, + "grad_norm": 9.448702812194824, + "learning_rate": 9.889520158806216e-05, + "loss": 1.0171, + "step": 1558 + }, + { + "epoch": 0.10563046276848025, + "grad_norm": 10.931096076965332, + "learning_rate": 9.889383256896434e-05, + "loss": 1.1797, + "step": 1559 + }, + { + "epoch": 0.10569821803645234, + "grad_norm": 10.293981552124023, + "learning_rate": 9.889246354986652e-05, + "loss": 1.0723, + "step": 1560 + }, + { + "epoch": 0.10576597330442442, + "grad_norm": 10.052331924438477, + "learning_rate": 9.889109453076871e-05, + "loss": 0.831, + "step": 1561 + }, + { + "epoch": 0.10583372857239651, + "grad_norm": 10.593210220336914, + "learning_rate": 9.888972551167089e-05, + "loss": 1.3415, + "step": 1562 + }, + { + "epoch": 0.10590148384036858, + "grad_norm": 11.580954551696777, + "learning_rate": 9.888835649257307e-05, + "loss": 1.1522, + "step": 1563 + }, + { + "epoch": 0.10596923910834068, + "grad_norm": 11.495551109313965, + "learning_rate": 9.888698747347525e-05, + "loss": 1.2515, + "step": 1564 + }, + { + "epoch": 0.10603699437631275, + "grad_norm": 10.543874740600586, + "learning_rate": 9.888561845437743e-05, + "loss": 1.1579, + "step": 1565 + }, + { + "epoch": 0.10610474964428485, + "grad_norm": 10.588164329528809, + "learning_rate": 9.888424943527963e-05, + "loss": 1.3825, + "step": 1566 + }, + { + "epoch": 0.10617250491225692, + "grad_norm": 13.591666221618652, + "learning_rate": 9.88828804161818e-05, + "loss": 1.0941, + "step": 1567 + }, + { + "epoch": 0.10624026018022902, + "grad_norm": 10.866951942443848, + "learning_rate": 9.888151139708399e-05, + "loss": 1.478, + "step": 1568 + }, + { + "epoch": 0.10630801544820109, + "grad_norm": 9.627554893493652, + "learning_rate": 9.888014237798618e-05, + "loss": 1.004, + "step": 1569 + }, + { + "epoch": 0.10637577071617318, + "grad_norm": 10.871118545532227, + "learning_rate": 9.887877335888836e-05, + "loss": 1.1997, + "step": 1570 + }, + { + "epoch": 0.10644352598414526, + "grad_norm": 10.74503231048584, + "learning_rate": 9.887740433979054e-05, + "loss": 1.005, + "step": 1571 + }, + { + "epoch": 0.10651128125211735, + "grad_norm": 11.119452476501465, + "learning_rate": 9.887603532069273e-05, + "loss": 1.1168, + "step": 1572 + }, + { + "epoch": 0.10657903652008943, + "grad_norm": 10.601544380187988, + "learning_rate": 9.887466630159491e-05, + "loss": 1.0256, + "step": 1573 + }, + { + "epoch": 0.10664679178806152, + "grad_norm": 10.329113006591797, + "learning_rate": 9.88732972824971e-05, + "loss": 1.3432, + "step": 1574 + }, + { + "epoch": 0.1067145470560336, + "grad_norm": 9.973999977111816, + "learning_rate": 9.887192826339929e-05, + "loss": 0.9961, + "step": 1575 + }, + { + "epoch": 0.10678230232400569, + "grad_norm": 10.981974601745605, + "learning_rate": 9.887055924430147e-05, + "loss": 1.3936, + "step": 1576 + }, + { + "epoch": 0.10685005759197777, + "grad_norm": 10.953417778015137, + "learning_rate": 9.886919022520365e-05, + "loss": 0.9669, + "step": 1577 + }, + { + "epoch": 0.10691781285994986, + "grad_norm": 10.656323432922363, + "learning_rate": 9.886782120610583e-05, + "loss": 1.1393, + "step": 1578 + }, + { + "epoch": 0.10698556812792194, + "grad_norm": 9.434617042541504, + "learning_rate": 9.886645218700802e-05, + "loss": 1.1813, + "step": 1579 + }, + { + "epoch": 0.10705332339589403, + "grad_norm": 12.334831237792969, + "learning_rate": 9.88650831679102e-05, + "loss": 1.453, + "step": 1580 + }, + { + "epoch": 0.10712107866386611, + "grad_norm": 9.892403602600098, + "learning_rate": 9.886371414881238e-05, + "loss": 1.1373, + "step": 1581 + }, + { + "epoch": 0.1071888339318382, + "grad_norm": 9.248678207397461, + "learning_rate": 9.886234512971456e-05, + "loss": 1.0734, + "step": 1582 + }, + { + "epoch": 0.10725658919981028, + "grad_norm": 10.317010879516602, + "learning_rate": 9.886097611061675e-05, + "loss": 1.1252, + "step": 1583 + }, + { + "epoch": 0.10732434446778237, + "grad_norm": 9.586435317993164, + "learning_rate": 9.885960709151894e-05, + "loss": 1.1705, + "step": 1584 + }, + { + "epoch": 0.10739209973575445, + "grad_norm": 8.883166313171387, + "learning_rate": 9.885823807242112e-05, + "loss": 1.3164, + "step": 1585 + }, + { + "epoch": 0.10745985500372654, + "grad_norm": 12.870014190673828, + "learning_rate": 9.88568690533233e-05, + "loss": 1.2187, + "step": 1586 + }, + { + "epoch": 0.10752761027169863, + "grad_norm": 10.858057975769043, + "learning_rate": 9.885550003422548e-05, + "loss": 1.2334, + "step": 1587 + }, + { + "epoch": 0.10759536553967071, + "grad_norm": 9.855050086975098, + "learning_rate": 9.885413101512767e-05, + "loss": 1.068, + "step": 1588 + }, + { + "epoch": 0.1076631208076428, + "grad_norm": 12.256099700927734, + "learning_rate": 9.885276199602985e-05, + "loss": 1.2481, + "step": 1589 + }, + { + "epoch": 0.10773087607561488, + "grad_norm": 9.756118774414062, + "learning_rate": 9.885139297693203e-05, + "loss": 1.2365, + "step": 1590 + }, + { + "epoch": 0.10779863134358697, + "grad_norm": 11.727996826171875, + "learning_rate": 9.885002395783421e-05, + "loss": 1.3839, + "step": 1591 + }, + { + "epoch": 0.10786638661155905, + "grad_norm": 11.58463191986084, + "learning_rate": 9.88486549387364e-05, + "loss": 0.937, + "step": 1592 + }, + { + "epoch": 0.10793414187953114, + "grad_norm": 12.663015365600586, + "learning_rate": 9.884728591963859e-05, + "loss": 1.3453, + "step": 1593 + }, + { + "epoch": 0.10800189714750322, + "grad_norm": 10.950531005859375, + "learning_rate": 9.884591690054077e-05, + "loss": 1.1345, + "step": 1594 + }, + { + "epoch": 0.10806965241547531, + "grad_norm": 10.361424446105957, + "learning_rate": 9.884454788144295e-05, + "loss": 1.2576, + "step": 1595 + }, + { + "epoch": 0.10813740768344739, + "grad_norm": 11.155044555664062, + "learning_rate": 9.884317886234513e-05, + "loss": 1.1253, + "step": 1596 + }, + { + "epoch": 0.10820516295141948, + "grad_norm": 12.457600593566895, + "learning_rate": 9.884180984324731e-05, + "loss": 1.0959, + "step": 1597 + }, + { + "epoch": 0.10827291821939156, + "grad_norm": 13.494983673095703, + "learning_rate": 9.88404408241495e-05, + "loss": 1.1671, + "step": 1598 + }, + { + "epoch": 0.10834067348736365, + "grad_norm": 11.870648384094238, + "learning_rate": 9.883907180505168e-05, + "loss": 1.2121, + "step": 1599 + }, + { + "epoch": 0.10840842875533573, + "grad_norm": 10.327920913696289, + "learning_rate": 9.883770278595387e-05, + "loss": 1.1319, + "step": 1600 + }, + { + "epoch": 0.10847618402330782, + "grad_norm": 9.578879356384277, + "learning_rate": 9.883633376685605e-05, + "loss": 0.9064, + "step": 1601 + }, + { + "epoch": 0.1085439392912799, + "grad_norm": 9.598779678344727, + "learning_rate": 9.883496474775824e-05, + "loss": 1.1119, + "step": 1602 + }, + { + "epoch": 0.10861169455925199, + "grad_norm": 13.522628784179688, + "learning_rate": 9.883359572866042e-05, + "loss": 1.2549, + "step": 1603 + }, + { + "epoch": 0.10867944982722406, + "grad_norm": 8.123127937316895, + "learning_rate": 9.88322267095626e-05, + "loss": 0.8778, + "step": 1604 + }, + { + "epoch": 0.10874720509519616, + "grad_norm": 13.100455284118652, + "learning_rate": 9.883085769046478e-05, + "loss": 1.2023, + "step": 1605 + }, + { + "epoch": 0.10881496036316823, + "grad_norm": 9.897802352905273, + "learning_rate": 9.882948867136696e-05, + "loss": 1.2094, + "step": 1606 + }, + { + "epoch": 0.10888271563114033, + "grad_norm": 8.55170726776123, + "learning_rate": 9.882811965226915e-05, + "loss": 1.0407, + "step": 1607 + }, + { + "epoch": 0.1089504708991124, + "grad_norm": 10.867438316345215, + "learning_rate": 9.882675063317133e-05, + "loss": 1.3021, + "step": 1608 + }, + { + "epoch": 0.1090182261670845, + "grad_norm": 11.062238693237305, + "learning_rate": 9.882538161407352e-05, + "loss": 1.2627, + "step": 1609 + }, + { + "epoch": 0.10908598143505657, + "grad_norm": 12.710458755493164, + "learning_rate": 9.88240125949757e-05, + "loss": 1.6196, + "step": 1610 + }, + { + "epoch": 0.10915373670302866, + "grad_norm": 10.986543655395508, + "learning_rate": 9.882264357587789e-05, + "loss": 1.2021, + "step": 1611 + }, + { + "epoch": 0.10922149197100074, + "grad_norm": 12.792850494384766, + "learning_rate": 9.882127455678007e-05, + "loss": 1.1246, + "step": 1612 + }, + { + "epoch": 0.10928924723897283, + "grad_norm": 12.120078086853027, + "learning_rate": 9.881990553768225e-05, + "loss": 1.2522, + "step": 1613 + }, + { + "epoch": 0.10935700250694491, + "grad_norm": 10.562193870544434, + "learning_rate": 9.881853651858443e-05, + "loss": 1.1791, + "step": 1614 + }, + { + "epoch": 0.109424757774917, + "grad_norm": 13.017698287963867, + "learning_rate": 9.881716749948662e-05, + "loss": 1.0733, + "step": 1615 + }, + { + "epoch": 0.10949251304288908, + "grad_norm": 10.904980659484863, + "learning_rate": 9.88157984803888e-05, + "loss": 1.0166, + "step": 1616 + }, + { + "epoch": 0.10956026831086117, + "grad_norm": 12.332829475402832, + "learning_rate": 9.881442946129099e-05, + "loss": 1.0688, + "step": 1617 + }, + { + "epoch": 0.10962802357883325, + "grad_norm": 12.238106727600098, + "learning_rate": 9.881306044219318e-05, + "loss": 0.9869, + "step": 1618 + }, + { + "epoch": 0.10969577884680534, + "grad_norm": 12.75059986114502, + "learning_rate": 9.881169142309536e-05, + "loss": 1.5365, + "step": 1619 + }, + { + "epoch": 0.10976353411477742, + "grad_norm": 11.019241333007812, + "learning_rate": 9.881032240399754e-05, + "loss": 1.2332, + "step": 1620 + }, + { + "epoch": 0.10983128938274951, + "grad_norm": 11.867498397827148, + "learning_rate": 9.880895338489973e-05, + "loss": 1.1955, + "step": 1621 + }, + { + "epoch": 0.10989904465072159, + "grad_norm": 9.685249328613281, + "learning_rate": 9.880758436580191e-05, + "loss": 1.1972, + "step": 1622 + }, + { + "epoch": 0.10996679991869368, + "grad_norm": 11.80540657043457, + "learning_rate": 9.88062153467041e-05, + "loss": 0.9501, + "step": 1623 + }, + { + "epoch": 0.11003455518666576, + "grad_norm": 9.719080924987793, + "learning_rate": 9.880484632760627e-05, + "loss": 0.9904, + "step": 1624 + }, + { + "epoch": 0.11010231045463785, + "grad_norm": 10.906637191772461, + "learning_rate": 9.880347730850847e-05, + "loss": 1.2807, + "step": 1625 + }, + { + "epoch": 0.11017006572260993, + "grad_norm": 10.40942668914795, + "learning_rate": 9.880210828941065e-05, + "loss": 1.2772, + "step": 1626 + }, + { + "epoch": 0.11023782099058202, + "grad_norm": 10.265239715576172, + "learning_rate": 9.880073927031283e-05, + "loss": 1.1583, + "step": 1627 + }, + { + "epoch": 0.1103055762585541, + "grad_norm": 12.158036231994629, + "learning_rate": 9.879937025121501e-05, + "loss": 1.3907, + "step": 1628 + }, + { + "epoch": 0.11037333152652619, + "grad_norm": 10.128400802612305, + "learning_rate": 9.879800123211719e-05, + "loss": 0.8838, + "step": 1629 + }, + { + "epoch": 0.11044108679449827, + "grad_norm": 9.536952018737793, + "learning_rate": 9.879663221301938e-05, + "loss": 1.0525, + "step": 1630 + }, + { + "epoch": 0.11050884206247036, + "grad_norm": 10.826987266540527, + "learning_rate": 9.879526319392156e-05, + "loss": 1.2005, + "step": 1631 + }, + { + "epoch": 0.11057659733044244, + "grad_norm": 10.08153247833252, + "learning_rate": 9.879389417482374e-05, + "loss": 1.1521, + "step": 1632 + }, + { + "epoch": 0.11064435259841453, + "grad_norm": 10.642118453979492, + "learning_rate": 9.879252515572592e-05, + "loss": 1.2316, + "step": 1633 + }, + { + "epoch": 0.1107121078663866, + "grad_norm": 9.595836639404297, + "learning_rate": 9.879115613662812e-05, + "loss": 1.0969, + "step": 1634 + }, + { + "epoch": 0.1107798631343587, + "grad_norm": 8.954519271850586, + "learning_rate": 9.87897871175303e-05, + "loss": 0.9422, + "step": 1635 + }, + { + "epoch": 0.11084761840233077, + "grad_norm": 9.143898010253906, + "learning_rate": 9.878841809843248e-05, + "loss": 0.8949, + "step": 1636 + }, + { + "epoch": 0.11091537367030287, + "grad_norm": 11.656847953796387, + "learning_rate": 9.878704907933466e-05, + "loss": 0.9739, + "step": 1637 + }, + { + "epoch": 0.11098312893827494, + "grad_norm": 11.996994018554688, + "learning_rate": 9.878568006023684e-05, + "loss": 1.3361, + "step": 1638 + }, + { + "epoch": 0.11105088420624704, + "grad_norm": 11.478599548339844, + "learning_rate": 9.878431104113903e-05, + "loss": 1.0726, + "step": 1639 + }, + { + "epoch": 0.11111863947421913, + "grad_norm": 14.594179153442383, + "learning_rate": 9.878294202204121e-05, + "loss": 1.3725, + "step": 1640 + }, + { + "epoch": 0.1111863947421912, + "grad_norm": 12.15703296661377, + "learning_rate": 9.87815730029434e-05, + "loss": 1.1996, + "step": 1641 + }, + { + "epoch": 0.1112541500101633, + "grad_norm": 9.384215354919434, + "learning_rate": 9.878020398384557e-05, + "loss": 1.1294, + "step": 1642 + }, + { + "epoch": 0.11132190527813537, + "grad_norm": 11.131610870361328, + "learning_rate": 9.877883496474777e-05, + "loss": 1.0526, + "step": 1643 + }, + { + "epoch": 0.11138966054610747, + "grad_norm": 9.628592491149902, + "learning_rate": 9.877746594564995e-05, + "loss": 1.0715, + "step": 1644 + }, + { + "epoch": 0.11145741581407954, + "grad_norm": 10.876824378967285, + "learning_rate": 9.877609692655213e-05, + "loss": 1.3337, + "step": 1645 + }, + { + "epoch": 0.11152517108205164, + "grad_norm": 10.489126205444336, + "learning_rate": 9.877472790745431e-05, + "loss": 1.1727, + "step": 1646 + }, + { + "epoch": 0.11159292635002371, + "grad_norm": 8.737699508666992, + "learning_rate": 9.877335888835649e-05, + "loss": 1.1252, + "step": 1647 + }, + { + "epoch": 0.1116606816179958, + "grad_norm": 9.8843355178833, + "learning_rate": 9.877198986925868e-05, + "loss": 0.9222, + "step": 1648 + }, + { + "epoch": 0.11172843688596788, + "grad_norm": 7.930290699005127, + "learning_rate": 9.877062085016086e-05, + "loss": 1.0095, + "step": 1649 + }, + { + "epoch": 0.11179619215393997, + "grad_norm": 9.721529006958008, + "learning_rate": 9.876925183106304e-05, + "loss": 1.0812, + "step": 1650 + }, + { + "epoch": 0.11186394742191205, + "grad_norm": 10.844324111938477, + "learning_rate": 9.876788281196523e-05, + "loss": 1.2245, + "step": 1651 + }, + { + "epoch": 0.11193170268988414, + "grad_norm": 8.750361442565918, + "learning_rate": 9.87665137928674e-05, + "loss": 0.9983, + "step": 1652 + }, + { + "epoch": 0.11199945795785622, + "grad_norm": 8.716521263122559, + "learning_rate": 9.87651447737696e-05, + "loss": 0.8612, + "step": 1653 + }, + { + "epoch": 0.11206721322582831, + "grad_norm": 10.06696891784668, + "learning_rate": 9.876377575467178e-05, + "loss": 1.0806, + "step": 1654 + }, + { + "epoch": 0.11213496849380039, + "grad_norm": 10.526103019714355, + "learning_rate": 9.876240673557396e-05, + "loss": 1.0799, + "step": 1655 + }, + { + "epoch": 0.11220272376177248, + "grad_norm": 11.108748435974121, + "learning_rate": 9.876103771647614e-05, + "loss": 1.3871, + "step": 1656 + }, + { + "epoch": 0.11227047902974456, + "grad_norm": 10.120033264160156, + "learning_rate": 9.875966869737833e-05, + "loss": 1.0399, + "step": 1657 + }, + { + "epoch": 0.11233823429771665, + "grad_norm": 10.589370727539062, + "learning_rate": 9.875829967828051e-05, + "loss": 1.3249, + "step": 1658 + }, + { + "epoch": 0.11240598956568873, + "grad_norm": 8.751884460449219, + "learning_rate": 9.87569306591827e-05, + "loss": 0.9404, + "step": 1659 + }, + { + "epoch": 0.11247374483366082, + "grad_norm": 9.923823356628418, + "learning_rate": 9.875556164008488e-05, + "loss": 1.067, + "step": 1660 + }, + { + "epoch": 0.1125415001016329, + "grad_norm": 9.70035171508789, + "learning_rate": 9.875419262098707e-05, + "loss": 1.0079, + "step": 1661 + }, + { + "epoch": 0.11260925536960499, + "grad_norm": 12.337713241577148, + "learning_rate": 9.875282360188925e-05, + "loss": 1.0962, + "step": 1662 + }, + { + "epoch": 0.11267701063757707, + "grad_norm": 10.142355918884277, + "learning_rate": 9.875145458279143e-05, + "loss": 1.1916, + "step": 1663 + }, + { + "epoch": 0.11274476590554916, + "grad_norm": 11.82714557647705, + "learning_rate": 9.875008556369362e-05, + "loss": 1.4582, + "step": 1664 + }, + { + "epoch": 0.11281252117352124, + "grad_norm": 10.524566650390625, + "learning_rate": 9.87487165445958e-05, + "loss": 1.2156, + "step": 1665 + }, + { + "epoch": 0.11288027644149333, + "grad_norm": 12.232719421386719, + "learning_rate": 9.874734752549798e-05, + "loss": 1.2152, + "step": 1666 + }, + { + "epoch": 0.11294803170946541, + "grad_norm": 10.665773391723633, + "learning_rate": 9.874597850640018e-05, + "loss": 1.085, + "step": 1667 + }, + { + "epoch": 0.1130157869774375, + "grad_norm": 10.0956449508667, + "learning_rate": 9.874460948730236e-05, + "loss": 1.1142, + "step": 1668 + }, + { + "epoch": 0.11308354224540958, + "grad_norm": 9.27825927734375, + "learning_rate": 9.874324046820454e-05, + "loss": 1.0263, + "step": 1669 + }, + { + "epoch": 0.11315129751338167, + "grad_norm": 11.171673774719238, + "learning_rate": 9.874187144910672e-05, + "loss": 1.5049, + "step": 1670 + }, + { + "epoch": 0.11321905278135375, + "grad_norm": 10.442279815673828, + "learning_rate": 9.874050243000891e-05, + "loss": 1.0697, + "step": 1671 + }, + { + "epoch": 0.11328680804932584, + "grad_norm": 11.563700675964355, + "learning_rate": 9.87391334109111e-05, + "loss": 1.003, + "step": 1672 + }, + { + "epoch": 0.11335456331729792, + "grad_norm": 12.447733879089355, + "learning_rate": 9.873776439181327e-05, + "loss": 1.2279, + "step": 1673 + }, + { + "epoch": 0.11342231858527001, + "grad_norm": 9.70698070526123, + "learning_rate": 9.873639537271545e-05, + "loss": 1.2208, + "step": 1674 + }, + { + "epoch": 0.11349007385324208, + "grad_norm": 10.89858341217041, + "learning_rate": 9.873502635361763e-05, + "loss": 1.1676, + "step": 1675 + }, + { + "epoch": 0.11355782912121418, + "grad_norm": 11.106192588806152, + "learning_rate": 9.873365733451983e-05, + "loss": 1.0918, + "step": 1676 + }, + { + "epoch": 0.11362558438918625, + "grad_norm": 10.186528205871582, + "learning_rate": 9.873228831542201e-05, + "loss": 1.1651, + "step": 1677 + }, + { + "epoch": 0.11369333965715835, + "grad_norm": 10.69977855682373, + "learning_rate": 9.873091929632419e-05, + "loss": 1.2867, + "step": 1678 + }, + { + "epoch": 0.11376109492513042, + "grad_norm": 11.934000015258789, + "learning_rate": 9.872955027722637e-05, + "loss": 1.2051, + "step": 1679 + }, + { + "epoch": 0.11382885019310252, + "grad_norm": 9.277094841003418, + "learning_rate": 9.872818125812856e-05, + "loss": 1.17, + "step": 1680 + }, + { + "epoch": 0.1138966054610746, + "grad_norm": 11.35097885131836, + "learning_rate": 9.872681223903074e-05, + "loss": 1.2433, + "step": 1681 + }, + { + "epoch": 0.11396436072904668, + "grad_norm": 11.362833023071289, + "learning_rate": 9.872544321993292e-05, + "loss": 1.3577, + "step": 1682 + }, + { + "epoch": 0.11403211599701876, + "grad_norm": 10.944365501403809, + "learning_rate": 9.87240742008351e-05, + "loss": 1.0731, + "step": 1683 + }, + { + "epoch": 0.11409987126499085, + "grad_norm": 10.292986869812012, + "learning_rate": 9.872270518173728e-05, + "loss": 1.0639, + "step": 1684 + }, + { + "epoch": 0.11416762653296293, + "grad_norm": 10.8629150390625, + "learning_rate": 9.872133616263948e-05, + "loss": 1.3503, + "step": 1685 + }, + { + "epoch": 0.11423538180093502, + "grad_norm": 10.277920722961426, + "learning_rate": 9.871996714354166e-05, + "loss": 1.1544, + "step": 1686 + }, + { + "epoch": 0.1143031370689071, + "grad_norm": 8.98864459991455, + "learning_rate": 9.871859812444384e-05, + "loss": 1.3638, + "step": 1687 + }, + { + "epoch": 0.11437089233687919, + "grad_norm": 8.126718521118164, + "learning_rate": 9.871722910534602e-05, + "loss": 1.0343, + "step": 1688 + }, + { + "epoch": 0.11443864760485127, + "grad_norm": 9.229771614074707, + "learning_rate": 9.871586008624821e-05, + "loss": 1.0137, + "step": 1689 + }, + { + "epoch": 0.11450640287282336, + "grad_norm": 10.554340362548828, + "learning_rate": 9.87144910671504e-05, + "loss": 0.9908, + "step": 1690 + }, + { + "epoch": 0.11457415814079544, + "grad_norm": 11.857934951782227, + "learning_rate": 9.871312204805257e-05, + "loss": 1.29, + "step": 1691 + }, + { + "epoch": 0.11464191340876753, + "grad_norm": 9.334272384643555, + "learning_rate": 9.871175302895475e-05, + "loss": 1.0449, + "step": 1692 + }, + { + "epoch": 0.11470966867673962, + "grad_norm": 9.092185974121094, + "learning_rate": 9.871038400985693e-05, + "loss": 1.1386, + "step": 1693 + }, + { + "epoch": 0.1147774239447117, + "grad_norm": 9.379465103149414, + "learning_rate": 9.870901499075913e-05, + "loss": 1.0964, + "step": 1694 + }, + { + "epoch": 0.11484517921268379, + "grad_norm": 10.038492202758789, + "learning_rate": 9.870764597166131e-05, + "loss": 0.8512, + "step": 1695 + }, + { + "epoch": 0.11491293448065587, + "grad_norm": 8.2230863571167, + "learning_rate": 9.870627695256349e-05, + "loss": 1.0646, + "step": 1696 + }, + { + "epoch": 0.11498068974862796, + "grad_norm": 9.939510345458984, + "learning_rate": 9.870490793346567e-05, + "loss": 1.081, + "step": 1697 + }, + { + "epoch": 0.11504844501660004, + "grad_norm": 11.864813804626465, + "learning_rate": 9.870353891436786e-05, + "loss": 1.2447, + "step": 1698 + }, + { + "epoch": 0.11511620028457213, + "grad_norm": 11.517714500427246, + "learning_rate": 9.870216989527004e-05, + "loss": 1.1274, + "step": 1699 + }, + { + "epoch": 0.11518395555254421, + "grad_norm": 12.172054290771484, + "learning_rate": 9.870080087617222e-05, + "loss": 1.3278, + "step": 1700 + }, + { + "epoch": 0.1152517108205163, + "grad_norm": 9.915261268615723, + "learning_rate": 9.86994318570744e-05, + "loss": 1.0298, + "step": 1701 + }, + { + "epoch": 0.11531946608848838, + "grad_norm": 10.937396049499512, + "learning_rate": 9.869806283797659e-05, + "loss": 1.363, + "step": 1702 + }, + { + "epoch": 0.11538722135646047, + "grad_norm": 11.586840629577637, + "learning_rate": 9.869669381887878e-05, + "loss": 0.9152, + "step": 1703 + }, + { + "epoch": 0.11545497662443255, + "grad_norm": 10.035614967346191, + "learning_rate": 9.869532479978096e-05, + "loss": 1.2152, + "step": 1704 + }, + { + "epoch": 0.11552273189240464, + "grad_norm": 8.775728225708008, + "learning_rate": 9.869395578068314e-05, + "loss": 1.0552, + "step": 1705 + }, + { + "epoch": 0.11559048716037672, + "grad_norm": 10.85958480834961, + "learning_rate": 9.869258676158532e-05, + "loss": 1.0473, + "step": 1706 + }, + { + "epoch": 0.11565824242834881, + "grad_norm": 9.311616897583008, + "learning_rate": 9.869121774248751e-05, + "loss": 1.1704, + "step": 1707 + }, + { + "epoch": 0.11572599769632089, + "grad_norm": 9.503780364990234, + "learning_rate": 9.86898487233897e-05, + "loss": 1.1375, + "step": 1708 + }, + { + "epoch": 0.11579375296429298, + "grad_norm": 9.02602481842041, + "learning_rate": 9.868847970429187e-05, + "loss": 0.9955, + "step": 1709 + }, + { + "epoch": 0.11586150823226506, + "grad_norm": 11.473995208740234, + "learning_rate": 9.868711068519407e-05, + "loss": 0.8842, + "step": 1710 + }, + { + "epoch": 0.11592926350023715, + "grad_norm": 10.124822616577148, + "learning_rate": 9.868574166609625e-05, + "loss": 1.0458, + "step": 1711 + }, + { + "epoch": 0.11599701876820923, + "grad_norm": 11.004744529724121, + "learning_rate": 9.868437264699843e-05, + "loss": 1.2136, + "step": 1712 + }, + { + "epoch": 0.11606477403618132, + "grad_norm": 10.497210502624512, + "learning_rate": 9.868300362790062e-05, + "loss": 1.3457, + "step": 1713 + }, + { + "epoch": 0.1161325293041534, + "grad_norm": 10.933736801147461, + "learning_rate": 9.86816346088028e-05, + "loss": 1.0931, + "step": 1714 + }, + { + "epoch": 0.11620028457212549, + "grad_norm": 11.045526504516602, + "learning_rate": 9.868026558970498e-05, + "loss": 1.2338, + "step": 1715 + }, + { + "epoch": 0.11626803984009756, + "grad_norm": 10.763197898864746, + "learning_rate": 9.867889657060716e-05, + "loss": 1.2325, + "step": 1716 + }, + { + "epoch": 0.11633579510806966, + "grad_norm": 9.53070068359375, + "learning_rate": 9.867752755150936e-05, + "loss": 1.1268, + "step": 1717 + }, + { + "epoch": 0.11640355037604173, + "grad_norm": 10.572071075439453, + "learning_rate": 9.867615853241154e-05, + "loss": 1.0118, + "step": 1718 + }, + { + "epoch": 0.11647130564401383, + "grad_norm": 7.633776664733887, + "learning_rate": 9.867478951331372e-05, + "loss": 1.081, + "step": 1719 + }, + { + "epoch": 0.1165390609119859, + "grad_norm": 8.915566444396973, + "learning_rate": 9.86734204942159e-05, + "loss": 1.1526, + "step": 1720 + }, + { + "epoch": 0.116606816179958, + "grad_norm": 8.296719551086426, + "learning_rate": 9.867205147511809e-05, + "loss": 1.0571, + "step": 1721 + }, + { + "epoch": 0.11667457144793007, + "grad_norm": 9.387847900390625, + "learning_rate": 9.867068245602027e-05, + "loss": 1.0447, + "step": 1722 + }, + { + "epoch": 0.11674232671590216, + "grad_norm": 10.832168579101562, + "learning_rate": 9.866931343692245e-05, + "loss": 1.2601, + "step": 1723 + }, + { + "epoch": 0.11681008198387424, + "grad_norm": 11.631181716918945, + "learning_rate": 9.866794441782463e-05, + "loss": 1.4507, + "step": 1724 + }, + { + "epoch": 0.11687783725184633, + "grad_norm": 8.372645378112793, + "learning_rate": 9.866657539872681e-05, + "loss": 1.1191, + "step": 1725 + }, + { + "epoch": 0.11694559251981841, + "grad_norm": 9.577934265136719, + "learning_rate": 9.866520637962901e-05, + "loss": 1.0336, + "step": 1726 + }, + { + "epoch": 0.1170133477877905, + "grad_norm": 10.626317977905273, + "learning_rate": 9.866383736053119e-05, + "loss": 1.1595, + "step": 1727 + }, + { + "epoch": 0.11708110305576258, + "grad_norm": 9.758353233337402, + "learning_rate": 9.866246834143337e-05, + "loss": 0.8803, + "step": 1728 + }, + { + "epoch": 0.11714885832373467, + "grad_norm": 9.643457412719727, + "learning_rate": 9.866109932233555e-05, + "loss": 1.3213, + "step": 1729 + }, + { + "epoch": 0.11721661359170675, + "grad_norm": 8.910233497619629, + "learning_rate": 9.865973030323773e-05, + "loss": 1.1123, + "step": 1730 + }, + { + "epoch": 0.11728436885967884, + "grad_norm": 8.893741607666016, + "learning_rate": 9.865836128413992e-05, + "loss": 1.1059, + "step": 1731 + }, + { + "epoch": 0.11735212412765092, + "grad_norm": 12.937616348266602, + "learning_rate": 9.86569922650421e-05, + "loss": 1.2488, + "step": 1732 + }, + { + "epoch": 0.11741987939562301, + "grad_norm": 9.696173667907715, + "learning_rate": 9.865562324594428e-05, + "loss": 1.1076, + "step": 1733 + }, + { + "epoch": 0.11748763466359509, + "grad_norm": 11.568793296813965, + "learning_rate": 9.865425422684646e-05, + "loss": 1.1023, + "step": 1734 + }, + { + "epoch": 0.11755538993156718, + "grad_norm": 11.438271522521973, + "learning_rate": 9.865288520774866e-05, + "loss": 1.1014, + "step": 1735 + }, + { + "epoch": 0.11762314519953926, + "grad_norm": 8.217238426208496, + "learning_rate": 9.865151618865084e-05, + "loss": 0.9566, + "step": 1736 + }, + { + "epoch": 0.11769090046751135, + "grad_norm": 9.716591835021973, + "learning_rate": 9.865014716955302e-05, + "loss": 1.0478, + "step": 1737 + }, + { + "epoch": 0.11775865573548343, + "grad_norm": 11.152593612670898, + "learning_rate": 9.86487781504552e-05, + "loss": 1.3552, + "step": 1738 + }, + { + "epoch": 0.11782641100345552, + "grad_norm": 8.824272155761719, + "learning_rate": 9.864740913135738e-05, + "loss": 1.0463, + "step": 1739 + }, + { + "epoch": 0.1178941662714276, + "grad_norm": 8.941548347473145, + "learning_rate": 9.864604011225957e-05, + "loss": 0.9552, + "step": 1740 + }, + { + "epoch": 0.11796192153939969, + "grad_norm": 10.76177978515625, + "learning_rate": 9.864467109316175e-05, + "loss": 1.0076, + "step": 1741 + }, + { + "epoch": 0.11802967680737177, + "grad_norm": 9.87431812286377, + "learning_rate": 9.864330207406393e-05, + "loss": 0.9905, + "step": 1742 + }, + { + "epoch": 0.11809743207534386, + "grad_norm": 12.406320571899414, + "learning_rate": 9.864193305496611e-05, + "loss": 1.2839, + "step": 1743 + }, + { + "epoch": 0.11816518734331594, + "grad_norm": 12.649428367614746, + "learning_rate": 9.864056403586831e-05, + "loss": 1.0806, + "step": 1744 + }, + { + "epoch": 0.11823294261128803, + "grad_norm": 10.888890266418457, + "learning_rate": 9.863919501677049e-05, + "loss": 0.8805, + "step": 1745 + }, + { + "epoch": 0.11830069787926012, + "grad_norm": 10.59262752532959, + "learning_rate": 9.863782599767267e-05, + "loss": 0.9977, + "step": 1746 + }, + { + "epoch": 0.1183684531472322, + "grad_norm": 11.14206314086914, + "learning_rate": 9.863645697857485e-05, + "loss": 1.4569, + "step": 1747 + }, + { + "epoch": 0.11843620841520429, + "grad_norm": 12.148449897766113, + "learning_rate": 9.863508795947703e-05, + "loss": 1.5222, + "step": 1748 + }, + { + "epoch": 0.11850396368317637, + "grad_norm": 11.814643859863281, + "learning_rate": 9.863371894037922e-05, + "loss": 1.3888, + "step": 1749 + }, + { + "epoch": 0.11857171895114846, + "grad_norm": 12.437151908874512, + "learning_rate": 9.86323499212814e-05, + "loss": 1.024, + "step": 1750 + }, + { + "epoch": 0.11863947421912054, + "grad_norm": 9.314751625061035, + "learning_rate": 9.863098090218358e-05, + "loss": 1.064, + "step": 1751 + }, + { + "epoch": 0.11870722948709263, + "grad_norm": 12.978782653808594, + "learning_rate": 9.862961188308576e-05, + "loss": 1.4048, + "step": 1752 + }, + { + "epoch": 0.1187749847550647, + "grad_norm": 9.825428009033203, + "learning_rate": 9.862824286398796e-05, + "loss": 0.9631, + "step": 1753 + }, + { + "epoch": 0.1188427400230368, + "grad_norm": 9.127273559570312, + "learning_rate": 9.862687384489014e-05, + "loss": 1.0364, + "step": 1754 + }, + { + "epoch": 0.11891049529100887, + "grad_norm": 10.290020942687988, + "learning_rate": 9.862550482579232e-05, + "loss": 1.329, + "step": 1755 + }, + { + "epoch": 0.11897825055898097, + "grad_norm": 9.023946762084961, + "learning_rate": 9.862413580669451e-05, + "loss": 1.186, + "step": 1756 + }, + { + "epoch": 0.11904600582695304, + "grad_norm": 11.889911651611328, + "learning_rate": 9.86227667875967e-05, + "loss": 1.0338, + "step": 1757 + }, + { + "epoch": 0.11911376109492514, + "grad_norm": 8.938654899597168, + "learning_rate": 9.862139776849887e-05, + "loss": 1.1933, + "step": 1758 + }, + { + "epoch": 0.11918151636289721, + "grad_norm": 9.240127563476562, + "learning_rate": 9.862002874940107e-05, + "loss": 1.2336, + "step": 1759 + }, + { + "epoch": 0.1192492716308693, + "grad_norm": 10.340953826904297, + "learning_rate": 9.861865973030325e-05, + "loss": 1.1097, + "step": 1760 + }, + { + "epoch": 0.11931702689884138, + "grad_norm": 10.807507514953613, + "learning_rate": 9.861729071120543e-05, + "loss": 0.9127, + "step": 1761 + }, + { + "epoch": 0.11938478216681347, + "grad_norm": 9.670829772949219, + "learning_rate": 9.861592169210761e-05, + "loss": 1.2296, + "step": 1762 + }, + { + "epoch": 0.11945253743478555, + "grad_norm": 10.85981273651123, + "learning_rate": 9.86145526730098e-05, + "loss": 1.1341, + "step": 1763 + }, + { + "epoch": 0.11952029270275764, + "grad_norm": 9.198482513427734, + "learning_rate": 9.861318365391198e-05, + "loss": 1.4397, + "step": 1764 + }, + { + "epoch": 0.11958804797072972, + "grad_norm": 11.130392074584961, + "learning_rate": 9.861181463481416e-05, + "loss": 1.381, + "step": 1765 + }, + { + "epoch": 0.11965580323870181, + "grad_norm": 9.568144798278809, + "learning_rate": 9.861044561571634e-05, + "loss": 1.0284, + "step": 1766 + }, + { + "epoch": 0.11972355850667389, + "grad_norm": 9.049298286437988, + "learning_rate": 9.860907659661854e-05, + "loss": 1.0714, + "step": 1767 + }, + { + "epoch": 0.11979131377464598, + "grad_norm": 8.948347091674805, + "learning_rate": 9.860770757752072e-05, + "loss": 1.0248, + "step": 1768 + }, + { + "epoch": 0.11985906904261806, + "grad_norm": 10.836170196533203, + "learning_rate": 9.86063385584229e-05, + "loss": 1.0551, + "step": 1769 + }, + { + "epoch": 0.11992682431059015, + "grad_norm": 11.044917106628418, + "learning_rate": 9.860496953932508e-05, + "loss": 1.1374, + "step": 1770 + }, + { + "epoch": 0.11999457957856223, + "grad_norm": 10.975854873657227, + "learning_rate": 9.860360052022726e-05, + "loss": 1.2018, + "step": 1771 + }, + { + "epoch": 0.12006233484653432, + "grad_norm": 9.752070426940918, + "learning_rate": 9.860223150112945e-05, + "loss": 1.0843, + "step": 1772 + }, + { + "epoch": 0.1201300901145064, + "grad_norm": 9.662758827209473, + "learning_rate": 9.860086248203163e-05, + "loss": 0.8866, + "step": 1773 + }, + { + "epoch": 0.12019784538247849, + "grad_norm": 11.268694877624512, + "learning_rate": 9.859949346293381e-05, + "loss": 1.2653, + "step": 1774 + }, + { + "epoch": 0.12026560065045057, + "grad_norm": 9.201732635498047, + "learning_rate": 9.8598124443836e-05, + "loss": 1.1466, + "step": 1775 + }, + { + "epoch": 0.12033335591842266, + "grad_norm": 12.577048301696777, + "learning_rate": 9.859675542473819e-05, + "loss": 1.2606, + "step": 1776 + }, + { + "epoch": 0.12040111118639474, + "grad_norm": 10.051258087158203, + "learning_rate": 9.859538640564037e-05, + "loss": 1.1533, + "step": 1777 + }, + { + "epoch": 0.12046886645436683, + "grad_norm": 9.358968734741211, + "learning_rate": 9.859401738654255e-05, + "loss": 1.0501, + "step": 1778 + }, + { + "epoch": 0.12053662172233891, + "grad_norm": 11.277347564697266, + "learning_rate": 9.859264836744473e-05, + "loss": 0.9804, + "step": 1779 + }, + { + "epoch": 0.120604376990311, + "grad_norm": 12.41299057006836, + "learning_rate": 9.859127934834691e-05, + "loss": 1.427, + "step": 1780 + }, + { + "epoch": 0.12067213225828308, + "grad_norm": 10.081144332885742, + "learning_rate": 9.85899103292491e-05, + "loss": 1.1321, + "step": 1781 + }, + { + "epoch": 0.12073988752625517, + "grad_norm": 11.469686508178711, + "learning_rate": 9.858854131015128e-05, + "loss": 1.1972, + "step": 1782 + }, + { + "epoch": 0.12080764279422725, + "grad_norm": 8.608443260192871, + "learning_rate": 9.858717229105346e-05, + "loss": 0.9914, + "step": 1783 + }, + { + "epoch": 0.12087539806219934, + "grad_norm": 11.673405647277832, + "learning_rate": 9.858580327195564e-05, + "loss": 1.3406, + "step": 1784 + }, + { + "epoch": 0.12094315333017142, + "grad_norm": 9.533596992492676, + "learning_rate": 9.858443425285782e-05, + "loss": 1.2119, + "step": 1785 + }, + { + "epoch": 0.12101090859814351, + "grad_norm": 7.792461395263672, + "learning_rate": 9.858306523376002e-05, + "loss": 1.167, + "step": 1786 + }, + { + "epoch": 0.12107866386611559, + "grad_norm": 10.209978103637695, + "learning_rate": 9.85816962146622e-05, + "loss": 1.164, + "step": 1787 + }, + { + "epoch": 0.12114641913408768, + "grad_norm": 11.591109275817871, + "learning_rate": 9.858032719556438e-05, + "loss": 1.2598, + "step": 1788 + }, + { + "epoch": 0.12121417440205975, + "grad_norm": 10.562797546386719, + "learning_rate": 9.857895817646656e-05, + "loss": 1.3071, + "step": 1789 + }, + { + "epoch": 0.12128192967003185, + "grad_norm": 9.419530868530273, + "learning_rate": 9.857758915736875e-05, + "loss": 0.9951, + "step": 1790 + }, + { + "epoch": 0.12134968493800392, + "grad_norm": 9.525064468383789, + "learning_rate": 9.857622013827093e-05, + "loss": 1.1833, + "step": 1791 + }, + { + "epoch": 0.12141744020597602, + "grad_norm": 9.140012741088867, + "learning_rate": 9.857485111917311e-05, + "loss": 1.0573, + "step": 1792 + }, + { + "epoch": 0.1214851954739481, + "grad_norm": 10.88685417175293, + "learning_rate": 9.85734821000753e-05, + "loss": 1.1669, + "step": 1793 + }, + { + "epoch": 0.12155295074192018, + "grad_norm": 8.63829231262207, + "learning_rate": 9.857211308097747e-05, + "loss": 1.1733, + "step": 1794 + }, + { + "epoch": 0.12162070600989226, + "grad_norm": 9.90237808227539, + "learning_rate": 9.857074406187967e-05, + "loss": 0.9175, + "step": 1795 + }, + { + "epoch": 0.12168846127786435, + "grad_norm": 10.068902969360352, + "learning_rate": 9.856937504278185e-05, + "loss": 0.9422, + "step": 1796 + }, + { + "epoch": 0.12175621654583643, + "grad_norm": 7.482577800750732, + "learning_rate": 9.856800602368403e-05, + "loss": 1.0359, + "step": 1797 + }, + { + "epoch": 0.12182397181380852, + "grad_norm": 10.847206115722656, + "learning_rate": 9.856663700458621e-05, + "loss": 1.2258, + "step": 1798 + }, + { + "epoch": 0.12189172708178062, + "grad_norm": 11.122177124023438, + "learning_rate": 9.85652679854884e-05, + "loss": 1.1378, + "step": 1799 + }, + { + "epoch": 0.1219594823497527, + "grad_norm": 9.994139671325684, + "learning_rate": 9.856389896639058e-05, + "loss": 1.2663, + "step": 1800 + }, + { + "epoch": 0.12202723761772478, + "grad_norm": 10.687590599060059, + "learning_rate": 9.856252994729276e-05, + "loss": 1.1777, + "step": 1801 + }, + { + "epoch": 0.12209499288569686, + "grad_norm": 8.403971672058105, + "learning_rate": 9.856116092819494e-05, + "loss": 0.9812, + "step": 1802 + }, + { + "epoch": 0.12216274815366895, + "grad_norm": 9.77443790435791, + "learning_rate": 9.855979190909714e-05, + "loss": 1.0615, + "step": 1803 + }, + { + "epoch": 0.12223050342164103, + "grad_norm": 10.772642135620117, + "learning_rate": 9.855842288999932e-05, + "loss": 1.1835, + "step": 1804 + }, + { + "epoch": 0.12229825868961312, + "grad_norm": 10.509920120239258, + "learning_rate": 9.85570538709015e-05, + "loss": 1.0361, + "step": 1805 + }, + { + "epoch": 0.1223660139575852, + "grad_norm": 11.14194393157959, + "learning_rate": 9.855568485180369e-05, + "loss": 0.9947, + "step": 1806 + }, + { + "epoch": 0.12243376922555729, + "grad_norm": 11.018975257873535, + "learning_rate": 9.855431583270587e-05, + "loss": 1.2036, + "step": 1807 + }, + { + "epoch": 0.12250152449352937, + "grad_norm": 12.099370002746582, + "learning_rate": 9.855294681360805e-05, + "loss": 1.2977, + "step": 1808 + }, + { + "epoch": 0.12256927976150146, + "grad_norm": 8.841024398803711, + "learning_rate": 9.855157779451025e-05, + "loss": 1.2093, + "step": 1809 + }, + { + "epoch": 0.12263703502947354, + "grad_norm": 9.149311065673828, + "learning_rate": 9.855020877541243e-05, + "loss": 1.0088, + "step": 1810 + }, + { + "epoch": 0.12270479029744563, + "grad_norm": 10.53498649597168, + "learning_rate": 9.854883975631461e-05, + "loss": 0.9527, + "step": 1811 + }, + { + "epoch": 0.12277254556541771, + "grad_norm": 9.230859756469727, + "learning_rate": 9.854747073721679e-05, + "loss": 0.9584, + "step": 1812 + }, + { + "epoch": 0.1228403008333898, + "grad_norm": 11.702610969543457, + "learning_rate": 9.854610171811898e-05, + "loss": 1.2369, + "step": 1813 + }, + { + "epoch": 0.12290805610136188, + "grad_norm": 9.912004470825195, + "learning_rate": 9.854473269902116e-05, + "loss": 1.2571, + "step": 1814 + }, + { + "epoch": 0.12297581136933397, + "grad_norm": 10.071510314941406, + "learning_rate": 9.854336367992334e-05, + "loss": 1.1666, + "step": 1815 + }, + { + "epoch": 0.12304356663730605, + "grad_norm": 10.642035484313965, + "learning_rate": 9.854199466082552e-05, + "loss": 1.0309, + "step": 1816 + }, + { + "epoch": 0.12311132190527814, + "grad_norm": 11.135947227478027, + "learning_rate": 9.85406256417277e-05, + "loss": 1.1276, + "step": 1817 + }, + { + "epoch": 0.12317907717325022, + "grad_norm": 11.575469017028809, + "learning_rate": 9.85392566226299e-05, + "loss": 1.1507, + "step": 1818 + }, + { + "epoch": 0.12324683244122231, + "grad_norm": 10.470771789550781, + "learning_rate": 9.853788760353208e-05, + "loss": 1.3777, + "step": 1819 + }, + { + "epoch": 0.12331458770919439, + "grad_norm": 9.77783489227295, + "learning_rate": 9.853651858443426e-05, + "loss": 1.0492, + "step": 1820 + }, + { + "epoch": 0.12338234297716648, + "grad_norm": 10.132309913635254, + "learning_rate": 9.853514956533644e-05, + "loss": 1.1488, + "step": 1821 + }, + { + "epoch": 0.12345009824513856, + "grad_norm": 11.159482955932617, + "learning_rate": 9.853378054623863e-05, + "loss": 1.2571, + "step": 1822 + }, + { + "epoch": 0.12351785351311065, + "grad_norm": 11.599156379699707, + "learning_rate": 9.853241152714081e-05, + "loss": 1.1734, + "step": 1823 + }, + { + "epoch": 0.12358560878108273, + "grad_norm": 10.278475761413574, + "learning_rate": 9.853104250804299e-05, + "loss": 1.0637, + "step": 1824 + }, + { + "epoch": 0.12365336404905482, + "grad_norm": 12.126015663146973, + "learning_rate": 9.852967348894517e-05, + "loss": 1.0546, + "step": 1825 + }, + { + "epoch": 0.1237211193170269, + "grad_norm": 12.094749450683594, + "learning_rate": 9.852830446984735e-05, + "loss": 1.374, + "step": 1826 + }, + { + "epoch": 0.12378887458499899, + "grad_norm": 7.607821941375732, + "learning_rate": 9.852693545074955e-05, + "loss": 0.827, + "step": 1827 + }, + { + "epoch": 0.12385662985297106, + "grad_norm": 12.086756706237793, + "learning_rate": 9.852556643165173e-05, + "loss": 1.1219, + "step": 1828 + }, + { + "epoch": 0.12392438512094316, + "grad_norm": 10.215773582458496, + "learning_rate": 9.852419741255391e-05, + "loss": 1.0148, + "step": 1829 + }, + { + "epoch": 0.12399214038891523, + "grad_norm": 10.438709259033203, + "learning_rate": 9.852282839345609e-05, + "loss": 1.0649, + "step": 1830 + }, + { + "epoch": 0.12405989565688733, + "grad_norm": 10.564906120300293, + "learning_rate": 9.852145937435828e-05, + "loss": 1.4346, + "step": 1831 + }, + { + "epoch": 0.1241276509248594, + "grad_norm": 9.83198356628418, + "learning_rate": 9.852009035526046e-05, + "loss": 0.847, + "step": 1832 + }, + { + "epoch": 0.1241954061928315, + "grad_norm": 10.990026473999023, + "learning_rate": 9.851872133616264e-05, + "loss": 1.2358, + "step": 1833 + }, + { + "epoch": 0.12426316146080357, + "grad_norm": 10.108382225036621, + "learning_rate": 9.851735231706482e-05, + "loss": 0.9943, + "step": 1834 + }, + { + "epoch": 0.12433091672877566, + "grad_norm": 10.23820686340332, + "learning_rate": 9.8515983297967e-05, + "loss": 1.2464, + "step": 1835 + }, + { + "epoch": 0.12439867199674774, + "grad_norm": 11.552473068237305, + "learning_rate": 9.85146142788692e-05, + "loss": 1.2175, + "step": 1836 + }, + { + "epoch": 0.12446642726471983, + "grad_norm": 8.646978378295898, + "learning_rate": 9.851324525977138e-05, + "loss": 0.9975, + "step": 1837 + }, + { + "epoch": 0.12453418253269191, + "grad_norm": 9.344541549682617, + "learning_rate": 9.851187624067356e-05, + "loss": 0.9463, + "step": 1838 + }, + { + "epoch": 0.124601937800664, + "grad_norm": 12.645013809204102, + "learning_rate": 9.851050722157574e-05, + "loss": 1.0396, + "step": 1839 + }, + { + "epoch": 0.12466969306863608, + "grad_norm": 10.093807220458984, + "learning_rate": 9.850913820247792e-05, + "loss": 1.2554, + "step": 1840 + }, + { + "epoch": 0.12473744833660817, + "grad_norm": 8.792567253112793, + "learning_rate": 9.850776918338011e-05, + "loss": 0.9852, + "step": 1841 + }, + { + "epoch": 0.12480520360458025, + "grad_norm": 10.448987007141113, + "learning_rate": 9.85064001642823e-05, + "loss": 1.168, + "step": 1842 + }, + { + "epoch": 0.12487295887255234, + "grad_norm": 9.953516960144043, + "learning_rate": 9.850503114518447e-05, + "loss": 1.0116, + "step": 1843 + }, + { + "epoch": 0.12494071414052442, + "grad_norm": 10.187036514282227, + "learning_rate": 9.850366212608665e-05, + "loss": 1.203, + "step": 1844 + }, + { + "epoch": 0.1250084694084965, + "grad_norm": 8.890667915344238, + "learning_rate": 9.850229310698885e-05, + "loss": 1.0104, + "step": 1845 + }, + { + "epoch": 0.1250762246764686, + "grad_norm": 11.916625022888184, + "learning_rate": 9.850092408789103e-05, + "loss": 1.0254, + "step": 1846 + }, + { + "epoch": 0.12514397994444068, + "grad_norm": 10.934864044189453, + "learning_rate": 9.849955506879321e-05, + "loss": 0.975, + "step": 1847 + }, + { + "epoch": 0.12521173521241277, + "grad_norm": 11.214954376220703, + "learning_rate": 9.849818604969539e-05, + "loss": 1.45, + "step": 1848 + }, + { + "epoch": 0.12527949048038484, + "grad_norm": 8.801512718200684, + "learning_rate": 9.849681703059758e-05, + "loss": 1.0899, + "step": 1849 + }, + { + "epoch": 0.12534724574835693, + "grad_norm": 10.340089797973633, + "learning_rate": 9.849544801149976e-05, + "loss": 1.1704, + "step": 1850 + }, + { + "epoch": 0.12541500101632902, + "grad_norm": 9.204201698303223, + "learning_rate": 9.849407899240194e-05, + "loss": 1.0569, + "step": 1851 + }, + { + "epoch": 0.1254827562843011, + "grad_norm": 9.43604564666748, + "learning_rate": 9.849270997330414e-05, + "loss": 1.0721, + "step": 1852 + }, + { + "epoch": 0.1255505115522732, + "grad_norm": 10.110416412353516, + "learning_rate": 9.849134095420632e-05, + "loss": 1.1265, + "step": 1853 + }, + { + "epoch": 0.12561826682024527, + "grad_norm": 10.630755424499512, + "learning_rate": 9.84899719351085e-05, + "loss": 1.1103, + "step": 1854 + }, + { + "epoch": 0.12568602208821736, + "grad_norm": 10.888835906982422, + "learning_rate": 9.848860291601069e-05, + "loss": 1.347, + "step": 1855 + }, + { + "epoch": 0.12575377735618945, + "grad_norm": 8.84623908996582, + "learning_rate": 9.848723389691287e-05, + "loss": 0.8531, + "step": 1856 + }, + { + "epoch": 0.12582153262416154, + "grad_norm": 10.99928092956543, + "learning_rate": 9.848586487781505e-05, + "loss": 1.1747, + "step": 1857 + }, + { + "epoch": 0.1258892878921336, + "grad_norm": 8.556151390075684, + "learning_rate": 9.848449585871723e-05, + "loss": 1.0207, + "step": 1858 + }, + { + "epoch": 0.1259570431601057, + "grad_norm": 10.207547187805176, + "learning_rate": 9.848312683961943e-05, + "loss": 1.4782, + "step": 1859 + }, + { + "epoch": 0.1260247984280778, + "grad_norm": 9.39505386352539, + "learning_rate": 9.848175782052161e-05, + "loss": 1.1415, + "step": 1860 + }, + { + "epoch": 0.12609255369604988, + "grad_norm": 10.647768020629883, + "learning_rate": 9.848038880142379e-05, + "loss": 1.1439, + "step": 1861 + }, + { + "epoch": 0.12616030896402194, + "grad_norm": 11.517707824707031, + "learning_rate": 9.847901978232597e-05, + "loss": 1.3181, + "step": 1862 + }, + { + "epoch": 0.12622806423199404, + "grad_norm": 10.916093826293945, + "learning_rate": 9.847765076322815e-05, + "loss": 1.0708, + "step": 1863 + }, + { + "epoch": 0.12629581949996613, + "grad_norm": 9.893363952636719, + "learning_rate": 9.847628174413034e-05, + "loss": 1.1932, + "step": 1864 + }, + { + "epoch": 0.12636357476793822, + "grad_norm": 12.349174499511719, + "learning_rate": 9.847491272503252e-05, + "loss": 1.3161, + "step": 1865 + }, + { + "epoch": 0.12643133003591028, + "grad_norm": 10.157081604003906, + "learning_rate": 9.84735437059347e-05, + "loss": 1.1325, + "step": 1866 + }, + { + "epoch": 0.12649908530388237, + "grad_norm": 9.772073745727539, + "learning_rate": 9.847217468683688e-05, + "loss": 1.2739, + "step": 1867 + }, + { + "epoch": 0.12656684057185447, + "grad_norm": 10.468371391296387, + "learning_rate": 9.847080566773908e-05, + "loss": 1.0279, + "step": 1868 + }, + { + "epoch": 0.12663459583982656, + "grad_norm": 11.055033683776855, + "learning_rate": 9.846943664864126e-05, + "loss": 1.0584, + "step": 1869 + }, + { + "epoch": 0.12670235110779862, + "grad_norm": 10.47987174987793, + "learning_rate": 9.846806762954344e-05, + "loss": 1.0385, + "step": 1870 + }, + { + "epoch": 0.1267701063757707, + "grad_norm": 8.933801651000977, + "learning_rate": 9.846669861044562e-05, + "loss": 1.0259, + "step": 1871 + }, + { + "epoch": 0.1268378616437428, + "grad_norm": 10.32228946685791, + "learning_rate": 9.84653295913478e-05, + "loss": 1.0472, + "step": 1872 + }, + { + "epoch": 0.1269056169117149, + "grad_norm": 9.294051170349121, + "learning_rate": 9.846396057224999e-05, + "loss": 1.1638, + "step": 1873 + }, + { + "epoch": 0.12697337217968696, + "grad_norm": 8.32187271118164, + "learning_rate": 9.846259155315217e-05, + "loss": 0.9865, + "step": 1874 + }, + { + "epoch": 0.12704112744765905, + "grad_norm": 8.782264709472656, + "learning_rate": 9.846122253405435e-05, + "loss": 1.0868, + "step": 1875 + }, + { + "epoch": 0.12710888271563114, + "grad_norm": 10.444697380065918, + "learning_rate": 9.845985351495653e-05, + "loss": 1.2137, + "step": 1876 + }, + { + "epoch": 0.12717663798360324, + "grad_norm": 10.543922424316406, + "learning_rate": 9.845848449585873e-05, + "loss": 1.2337, + "step": 1877 + }, + { + "epoch": 0.1272443932515753, + "grad_norm": 8.504612922668457, + "learning_rate": 9.845711547676091e-05, + "loss": 1.0234, + "step": 1878 + }, + { + "epoch": 0.1273121485195474, + "grad_norm": 8.83178997039795, + "learning_rate": 9.845574645766309e-05, + "loss": 1.0611, + "step": 1879 + }, + { + "epoch": 0.12737990378751948, + "grad_norm": 14.37713623046875, + "learning_rate": 9.845437743856527e-05, + "loss": 1.1704, + "step": 1880 + }, + { + "epoch": 0.12744765905549157, + "grad_norm": 12.43575668334961, + "learning_rate": 9.845300841946745e-05, + "loss": 0.848, + "step": 1881 + }, + { + "epoch": 0.12751541432346364, + "grad_norm": 11.625554084777832, + "learning_rate": 9.845163940036964e-05, + "loss": 1.2395, + "step": 1882 + }, + { + "epoch": 0.12758316959143573, + "grad_norm": 7.8962082862854, + "learning_rate": 9.845027038127182e-05, + "loss": 0.9702, + "step": 1883 + }, + { + "epoch": 0.12765092485940782, + "grad_norm": 11.689674377441406, + "learning_rate": 9.8448901362174e-05, + "loss": 1.4885, + "step": 1884 + }, + { + "epoch": 0.1277186801273799, + "grad_norm": 11.037276268005371, + "learning_rate": 9.844753234307618e-05, + "loss": 1.1751, + "step": 1885 + }, + { + "epoch": 0.12778643539535198, + "grad_norm": 11.115680694580078, + "learning_rate": 9.844616332397836e-05, + "loss": 1.2912, + "step": 1886 + }, + { + "epoch": 0.12785419066332407, + "grad_norm": 10.553492546081543, + "learning_rate": 9.844479430488056e-05, + "loss": 1.1001, + "step": 1887 + }, + { + "epoch": 0.12792194593129616, + "grad_norm": 9.92845630645752, + "learning_rate": 9.844342528578274e-05, + "loss": 1.1697, + "step": 1888 + }, + { + "epoch": 0.12798970119926825, + "grad_norm": 9.33232307434082, + "learning_rate": 9.844205626668492e-05, + "loss": 1.0931, + "step": 1889 + }, + { + "epoch": 0.12805745646724032, + "grad_norm": 12.35257339477539, + "learning_rate": 9.84406872475871e-05, + "loss": 0.9819, + "step": 1890 + }, + { + "epoch": 0.1281252117352124, + "grad_norm": 9.913105010986328, + "learning_rate": 9.843931822848929e-05, + "loss": 1.2515, + "step": 1891 + }, + { + "epoch": 0.1281929670031845, + "grad_norm": 11.218729019165039, + "learning_rate": 9.843794920939147e-05, + "loss": 1.0073, + "step": 1892 + }, + { + "epoch": 0.1282607222711566, + "grad_norm": 9.97374153137207, + "learning_rate": 9.843658019029365e-05, + "loss": 1.0527, + "step": 1893 + }, + { + "epoch": 0.12832847753912865, + "grad_norm": 8.837554931640625, + "learning_rate": 9.843521117119583e-05, + "loss": 1.0175, + "step": 1894 + }, + { + "epoch": 0.12839623280710075, + "grad_norm": 9.207158088684082, + "learning_rate": 9.843384215209803e-05, + "loss": 1.0729, + "step": 1895 + }, + { + "epoch": 0.12846398807507284, + "grad_norm": 10.960060119628906, + "learning_rate": 9.843247313300021e-05, + "loss": 1.0485, + "step": 1896 + }, + { + "epoch": 0.12853174334304493, + "grad_norm": 9.724952697753906, + "learning_rate": 9.843110411390239e-05, + "loss": 1.2624, + "step": 1897 + }, + { + "epoch": 0.128599498611017, + "grad_norm": 9.02813720703125, + "learning_rate": 9.842973509480458e-05, + "loss": 1.011, + "step": 1898 + }, + { + "epoch": 0.12866725387898909, + "grad_norm": 8.507847785949707, + "learning_rate": 9.842836607570676e-05, + "loss": 0.9155, + "step": 1899 + }, + { + "epoch": 0.12873500914696118, + "grad_norm": 10.609807968139648, + "learning_rate": 9.842699705660894e-05, + "loss": 1.1984, + "step": 1900 + }, + { + "epoch": 0.12880276441493327, + "grad_norm": 10.144070625305176, + "learning_rate": 9.842562803751114e-05, + "loss": 1.2331, + "step": 1901 + }, + { + "epoch": 0.12887051968290533, + "grad_norm": 11.719651222229004, + "learning_rate": 9.842425901841332e-05, + "loss": 1.2086, + "step": 1902 + }, + { + "epoch": 0.12893827495087742, + "grad_norm": 10.689997673034668, + "learning_rate": 9.84228899993155e-05, + "loss": 1.3769, + "step": 1903 + }, + { + "epoch": 0.12900603021884952, + "grad_norm": 10.7424955368042, + "learning_rate": 9.842152098021768e-05, + "loss": 1.1844, + "step": 1904 + }, + { + "epoch": 0.1290737854868216, + "grad_norm": 11.207498550415039, + "learning_rate": 9.842015196111987e-05, + "loss": 1.2151, + "step": 1905 + }, + { + "epoch": 0.1291415407547937, + "grad_norm": 10.689212799072266, + "learning_rate": 9.841878294202205e-05, + "loss": 1.1182, + "step": 1906 + }, + { + "epoch": 0.12920929602276576, + "grad_norm": 9.708647727966309, + "learning_rate": 9.841741392292423e-05, + "loss": 1.2874, + "step": 1907 + }, + { + "epoch": 0.12927705129073785, + "grad_norm": 9.809847831726074, + "learning_rate": 9.841604490382641e-05, + "loss": 1.1716, + "step": 1908 + }, + { + "epoch": 0.12934480655870995, + "grad_norm": 8.170798301696777, + "learning_rate": 9.84146758847286e-05, + "loss": 1.1342, + "step": 1909 + }, + { + "epoch": 0.12941256182668204, + "grad_norm": 9.12773609161377, + "learning_rate": 9.841330686563079e-05, + "loss": 1.1506, + "step": 1910 + }, + { + "epoch": 0.1294803170946541, + "grad_norm": 13.67032241821289, + "learning_rate": 9.841193784653297e-05, + "loss": 1.2065, + "step": 1911 + }, + { + "epoch": 0.1295480723626262, + "grad_norm": 9.35611629486084, + "learning_rate": 9.841056882743515e-05, + "loss": 1.281, + "step": 1912 + }, + { + "epoch": 0.12961582763059828, + "grad_norm": 9.044548988342285, + "learning_rate": 9.840919980833733e-05, + "loss": 1.0285, + "step": 1913 + }, + { + "epoch": 0.12968358289857038, + "grad_norm": 9.955796241760254, + "learning_rate": 9.840783078923952e-05, + "loss": 1.1742, + "step": 1914 + }, + { + "epoch": 0.12975133816654244, + "grad_norm": 10.456665992736816, + "learning_rate": 9.84064617701417e-05, + "loss": 1.3452, + "step": 1915 + }, + { + "epoch": 0.12981909343451453, + "grad_norm": 10.861869812011719, + "learning_rate": 9.840509275104388e-05, + "loss": 1.0397, + "step": 1916 + }, + { + "epoch": 0.12988684870248662, + "grad_norm": 8.546277046203613, + "learning_rate": 9.840372373194606e-05, + "loss": 0.8762, + "step": 1917 + }, + { + "epoch": 0.12995460397045872, + "grad_norm": 9.103103637695312, + "learning_rate": 9.840235471284824e-05, + "loss": 1.0998, + "step": 1918 + }, + { + "epoch": 0.13002235923843078, + "grad_norm": 9.794631958007812, + "learning_rate": 9.840098569375044e-05, + "loss": 1.0644, + "step": 1919 + }, + { + "epoch": 0.13009011450640287, + "grad_norm": 8.702750205993652, + "learning_rate": 9.839961667465262e-05, + "loss": 1.108, + "step": 1920 + }, + { + "epoch": 0.13015786977437496, + "grad_norm": 11.108800888061523, + "learning_rate": 9.83982476555548e-05, + "loss": 1.375, + "step": 1921 + }, + { + "epoch": 0.13022562504234705, + "grad_norm": 11.253337860107422, + "learning_rate": 9.839687863645698e-05, + "loss": 1.0246, + "step": 1922 + }, + { + "epoch": 0.13029338031031912, + "grad_norm": 9.817541122436523, + "learning_rate": 9.839550961735917e-05, + "loss": 1.0222, + "step": 1923 + }, + { + "epoch": 0.1303611355782912, + "grad_norm": 9.378199577331543, + "learning_rate": 9.839414059826135e-05, + "loss": 1.0387, + "step": 1924 + }, + { + "epoch": 0.1304288908462633, + "grad_norm": 8.789336204528809, + "learning_rate": 9.839277157916353e-05, + "loss": 1.0635, + "step": 1925 + }, + { + "epoch": 0.1304966461142354, + "grad_norm": 11.900403022766113, + "learning_rate": 9.839140256006571e-05, + "loss": 1.1224, + "step": 1926 + }, + { + "epoch": 0.13056440138220746, + "grad_norm": 10.007912635803223, + "learning_rate": 9.83900335409679e-05, + "loss": 0.8988, + "step": 1927 + }, + { + "epoch": 0.13063215665017955, + "grad_norm": 10.157328605651855, + "learning_rate": 9.838866452187009e-05, + "loss": 0.952, + "step": 1928 + }, + { + "epoch": 0.13069991191815164, + "grad_norm": 8.763729095458984, + "learning_rate": 9.838729550277227e-05, + "loss": 0.9304, + "step": 1929 + }, + { + "epoch": 0.13076766718612373, + "grad_norm": 10.75432300567627, + "learning_rate": 9.838592648367445e-05, + "loss": 1.0438, + "step": 1930 + }, + { + "epoch": 0.1308354224540958, + "grad_norm": 8.840702056884766, + "learning_rate": 9.838455746457663e-05, + "loss": 0.9164, + "step": 1931 + }, + { + "epoch": 0.1309031777220679, + "grad_norm": 9.526811599731445, + "learning_rate": 9.838318844547882e-05, + "loss": 1.2181, + "step": 1932 + }, + { + "epoch": 0.13097093299003998, + "grad_norm": 12.827199935913086, + "learning_rate": 9.8381819426381e-05, + "loss": 1.223, + "step": 1933 + }, + { + "epoch": 0.13103868825801207, + "grad_norm": 9.47105884552002, + "learning_rate": 9.838045040728318e-05, + "loss": 1.1014, + "step": 1934 + }, + { + "epoch": 0.13110644352598413, + "grad_norm": 9.044878959655762, + "learning_rate": 9.837908138818536e-05, + "loss": 1.1451, + "step": 1935 + }, + { + "epoch": 0.13117419879395623, + "grad_norm": 9.24599838256836, + "learning_rate": 9.837771236908754e-05, + "loss": 1.1144, + "step": 1936 + }, + { + "epoch": 0.13124195406192832, + "grad_norm": 12.543252944946289, + "learning_rate": 9.837634334998974e-05, + "loss": 1.1233, + "step": 1937 + }, + { + "epoch": 0.1313097093299004, + "grad_norm": 10.022245407104492, + "learning_rate": 9.837497433089192e-05, + "loss": 1.3569, + "step": 1938 + }, + { + "epoch": 0.13137746459787247, + "grad_norm": 11.967863082885742, + "learning_rate": 9.83736053117941e-05, + "loss": 1.2086, + "step": 1939 + }, + { + "epoch": 0.13144521986584456, + "grad_norm": 9.64406967163086, + "learning_rate": 9.837223629269628e-05, + "loss": 1.1647, + "step": 1940 + }, + { + "epoch": 0.13151297513381666, + "grad_norm": 8.538762092590332, + "learning_rate": 9.837086727359847e-05, + "loss": 1.012, + "step": 1941 + }, + { + "epoch": 0.13158073040178875, + "grad_norm": 10.889129638671875, + "learning_rate": 9.836949825450065e-05, + "loss": 1.1559, + "step": 1942 + }, + { + "epoch": 0.1316484856697608, + "grad_norm": 8.407093048095703, + "learning_rate": 9.836812923540283e-05, + "loss": 0.9624, + "step": 1943 + }, + { + "epoch": 0.1317162409377329, + "grad_norm": 9.175569534301758, + "learning_rate": 9.836676021630503e-05, + "loss": 0.9814, + "step": 1944 + }, + { + "epoch": 0.131783996205705, + "grad_norm": 11.240396499633789, + "learning_rate": 9.836539119720721e-05, + "loss": 1.3059, + "step": 1945 + }, + { + "epoch": 0.1318517514736771, + "grad_norm": 8.791098594665527, + "learning_rate": 9.836402217810939e-05, + "loss": 0.9891, + "step": 1946 + }, + { + "epoch": 0.13191950674164915, + "grad_norm": 7.401971340179443, + "learning_rate": 9.836265315901158e-05, + "loss": 0.9706, + "step": 1947 + }, + { + "epoch": 0.13198726200962124, + "grad_norm": 13.127768516540527, + "learning_rate": 9.836128413991376e-05, + "loss": 0.8924, + "step": 1948 + }, + { + "epoch": 0.13205501727759333, + "grad_norm": 9.805618286132812, + "learning_rate": 9.835991512081594e-05, + "loss": 0.9969, + "step": 1949 + }, + { + "epoch": 0.13212277254556543, + "grad_norm": 10.500420570373535, + "learning_rate": 9.835854610171812e-05, + "loss": 1.1045, + "step": 1950 + }, + { + "epoch": 0.1321905278135375, + "grad_norm": 10.043769836425781, + "learning_rate": 9.835717708262032e-05, + "loss": 1.2475, + "step": 1951 + }, + { + "epoch": 0.13225828308150958, + "grad_norm": 10.6277437210083, + "learning_rate": 9.83558080635225e-05, + "loss": 0.9285, + "step": 1952 + }, + { + "epoch": 0.13232603834948167, + "grad_norm": 9.011415481567383, + "learning_rate": 9.835443904442468e-05, + "loss": 1.1895, + "step": 1953 + }, + { + "epoch": 0.13239379361745376, + "grad_norm": 11.031267166137695, + "learning_rate": 9.835307002532686e-05, + "loss": 1.1781, + "step": 1954 + }, + { + "epoch": 0.13246154888542583, + "grad_norm": 9.960331916809082, + "learning_rate": 9.835170100622905e-05, + "loss": 1.0972, + "step": 1955 + }, + { + "epoch": 0.13252930415339792, + "grad_norm": 9.549619674682617, + "learning_rate": 9.835033198713123e-05, + "loss": 1.0918, + "step": 1956 + }, + { + "epoch": 0.13259705942137, + "grad_norm": 9.780478477478027, + "learning_rate": 9.834896296803341e-05, + "loss": 1.1491, + "step": 1957 + }, + { + "epoch": 0.1326648146893421, + "grad_norm": 8.948554992675781, + "learning_rate": 9.834759394893559e-05, + "loss": 1.0969, + "step": 1958 + }, + { + "epoch": 0.1327325699573142, + "grad_norm": 9.165532112121582, + "learning_rate": 9.834622492983777e-05, + "loss": 0.9294, + "step": 1959 + }, + { + "epoch": 0.13280032522528626, + "grad_norm": 8.738619804382324, + "learning_rate": 9.834485591073997e-05, + "loss": 1.3074, + "step": 1960 + }, + { + "epoch": 0.13286808049325835, + "grad_norm": 9.442314147949219, + "learning_rate": 9.834348689164215e-05, + "loss": 1.1238, + "step": 1961 + }, + { + "epoch": 0.13293583576123044, + "grad_norm": 12.411934852600098, + "learning_rate": 9.834211787254433e-05, + "loss": 1.1405, + "step": 1962 + }, + { + "epoch": 0.13300359102920253, + "grad_norm": 9.911120414733887, + "learning_rate": 9.834074885344651e-05, + "loss": 1.2197, + "step": 1963 + }, + { + "epoch": 0.1330713462971746, + "grad_norm": 9.619095802307129, + "learning_rate": 9.83393798343487e-05, + "loss": 1.0526, + "step": 1964 + }, + { + "epoch": 0.1331391015651467, + "grad_norm": 10.163374900817871, + "learning_rate": 9.833801081525088e-05, + "loss": 0.8374, + "step": 1965 + }, + { + "epoch": 0.13320685683311878, + "grad_norm": 9.342517852783203, + "learning_rate": 9.833664179615306e-05, + "loss": 1.1908, + "step": 1966 + }, + { + "epoch": 0.13327461210109087, + "grad_norm": 10.419418334960938, + "learning_rate": 9.833527277705524e-05, + "loss": 1.1175, + "step": 1967 + }, + { + "epoch": 0.13334236736906294, + "grad_norm": 9.5196533203125, + "learning_rate": 9.833390375795742e-05, + "loss": 1.2347, + "step": 1968 + }, + { + "epoch": 0.13341012263703503, + "grad_norm": 9.242755889892578, + "learning_rate": 9.833253473885962e-05, + "loss": 1.2401, + "step": 1969 + }, + { + "epoch": 0.13347787790500712, + "grad_norm": 10.243762969970703, + "learning_rate": 9.83311657197618e-05, + "loss": 1.3492, + "step": 1970 + }, + { + "epoch": 0.1335456331729792, + "grad_norm": 9.169745445251465, + "learning_rate": 9.832979670066398e-05, + "loss": 0.9533, + "step": 1971 + }, + { + "epoch": 0.13361338844095128, + "grad_norm": 10.292695999145508, + "learning_rate": 9.832842768156616e-05, + "loss": 1.2216, + "step": 1972 + }, + { + "epoch": 0.13368114370892337, + "grad_norm": 9.25019645690918, + "learning_rate": 9.832705866246834e-05, + "loss": 1.1613, + "step": 1973 + }, + { + "epoch": 0.13374889897689546, + "grad_norm": 8.518020629882812, + "learning_rate": 9.832568964337053e-05, + "loss": 1.0767, + "step": 1974 + }, + { + "epoch": 0.13381665424486755, + "grad_norm": 11.824376106262207, + "learning_rate": 9.832432062427271e-05, + "loss": 1.1884, + "step": 1975 + }, + { + "epoch": 0.13388440951283961, + "grad_norm": 10.544015884399414, + "learning_rate": 9.832295160517489e-05, + "loss": 0.9675, + "step": 1976 + }, + { + "epoch": 0.1339521647808117, + "grad_norm": 9.495721817016602, + "learning_rate": 9.832158258607707e-05, + "loss": 1.0586, + "step": 1977 + }, + { + "epoch": 0.1340199200487838, + "grad_norm": 10.378434181213379, + "learning_rate": 9.832021356697927e-05, + "loss": 1.176, + "step": 1978 + }, + { + "epoch": 0.1340876753167559, + "grad_norm": 10.026887893676758, + "learning_rate": 9.831884454788145e-05, + "loss": 1.035, + "step": 1979 + }, + { + "epoch": 0.13415543058472795, + "grad_norm": 8.878249168395996, + "learning_rate": 9.831747552878363e-05, + "loss": 0.8441, + "step": 1980 + }, + { + "epoch": 0.13422318585270004, + "grad_norm": 9.637616157531738, + "learning_rate": 9.831610650968581e-05, + "loss": 1.1435, + "step": 1981 + }, + { + "epoch": 0.13429094112067214, + "grad_norm": 10.153711318969727, + "learning_rate": 9.831473749058799e-05, + "loss": 1.0833, + "step": 1982 + }, + { + "epoch": 0.13435869638864423, + "grad_norm": 8.48596477508545, + "learning_rate": 9.831336847149018e-05, + "loss": 0.7752, + "step": 1983 + }, + { + "epoch": 0.1344264516566163, + "grad_norm": 8.454994201660156, + "learning_rate": 9.831199945239236e-05, + "loss": 0.8657, + "step": 1984 + }, + { + "epoch": 0.13449420692458838, + "grad_norm": 10.607659339904785, + "learning_rate": 9.831063043329454e-05, + "loss": 1.1836, + "step": 1985 + }, + { + "epoch": 0.13456196219256047, + "grad_norm": 8.560189247131348, + "learning_rate": 9.830926141419672e-05, + "loss": 0.7877, + "step": 1986 + }, + { + "epoch": 0.13462971746053257, + "grad_norm": 10.687662124633789, + "learning_rate": 9.830789239509892e-05, + "loss": 1.1586, + "step": 1987 + }, + { + "epoch": 0.13469747272850463, + "grad_norm": 9.725050926208496, + "learning_rate": 9.83065233760011e-05, + "loss": 1.214, + "step": 1988 + }, + { + "epoch": 0.13476522799647672, + "grad_norm": 9.808280944824219, + "learning_rate": 9.830515435690328e-05, + "loss": 1.0532, + "step": 1989 + }, + { + "epoch": 0.1348329832644488, + "grad_norm": 8.838544845581055, + "learning_rate": 9.830378533780547e-05, + "loss": 1.0872, + "step": 1990 + }, + { + "epoch": 0.1349007385324209, + "grad_norm": 12.043976783752441, + "learning_rate": 9.830241631870765e-05, + "loss": 1.2248, + "step": 1991 + }, + { + "epoch": 0.13496849380039297, + "grad_norm": 10.044602394104004, + "learning_rate": 9.830104729960983e-05, + "loss": 0.9746, + "step": 1992 + }, + { + "epoch": 0.13503624906836506, + "grad_norm": 10.861641883850098, + "learning_rate": 9.829967828051203e-05, + "loss": 0.9868, + "step": 1993 + }, + { + "epoch": 0.13510400433633715, + "grad_norm": 9.005234718322754, + "learning_rate": 9.82983092614142e-05, + "loss": 1.0619, + "step": 1994 + }, + { + "epoch": 0.13517175960430924, + "grad_norm": 9.902874946594238, + "learning_rate": 9.829694024231639e-05, + "loss": 1.3176, + "step": 1995 + }, + { + "epoch": 0.1352395148722813, + "grad_norm": 12.133747100830078, + "learning_rate": 9.829557122321857e-05, + "loss": 1.3634, + "step": 1996 + }, + { + "epoch": 0.1353072701402534, + "grad_norm": 9.571062088012695, + "learning_rate": 9.829420220412076e-05, + "loss": 1.139, + "step": 1997 + }, + { + "epoch": 0.1353750254082255, + "grad_norm": 8.269545555114746, + "learning_rate": 9.829283318502294e-05, + "loss": 1.1422, + "step": 1998 + }, + { + "epoch": 0.13544278067619758, + "grad_norm": 9.963309288024902, + "learning_rate": 9.829146416592512e-05, + "loss": 1.0612, + "step": 1999 + }, + { + "epoch": 0.13551053594416965, + "grad_norm": 7.434775352478027, + "learning_rate": 9.82900951468273e-05, + "loss": 1.0418, + "step": 2000 + }, + { + "epoch": 0.13557829121214174, + "grad_norm": 8.89494800567627, + "learning_rate": 9.82887261277295e-05, + "loss": 1.0677, + "step": 2001 + }, + { + "epoch": 0.13564604648011383, + "grad_norm": 9.349754333496094, + "learning_rate": 9.828735710863168e-05, + "loss": 0.8932, + "step": 2002 + }, + { + "epoch": 0.13571380174808592, + "grad_norm": 7.574460506439209, + "learning_rate": 9.828598808953386e-05, + "loss": 0.7652, + "step": 2003 + }, + { + "epoch": 0.13578155701605799, + "grad_norm": 10.191296577453613, + "learning_rate": 9.828461907043604e-05, + "loss": 0.9881, + "step": 2004 + }, + { + "epoch": 0.13584931228403008, + "grad_norm": 11.235671043395996, + "learning_rate": 9.828325005133822e-05, + "loss": 1.1942, + "step": 2005 + }, + { + "epoch": 0.13591706755200217, + "grad_norm": 9.97700023651123, + "learning_rate": 9.828188103224041e-05, + "loss": 1.154, + "step": 2006 + }, + { + "epoch": 0.13598482281997426, + "grad_norm": 8.283380508422852, + "learning_rate": 9.828051201314259e-05, + "loss": 0.8207, + "step": 2007 + }, + { + "epoch": 0.13605257808794632, + "grad_norm": 9.147144317626953, + "learning_rate": 9.827914299404477e-05, + "loss": 1.0751, + "step": 2008 + }, + { + "epoch": 0.13612033335591842, + "grad_norm": 9.99758529663086, + "learning_rate": 9.827777397494695e-05, + "loss": 1.1166, + "step": 2009 + }, + { + "epoch": 0.1361880886238905, + "grad_norm": 8.340705871582031, + "learning_rate": 9.827640495584915e-05, + "loss": 1.0311, + "step": 2010 + }, + { + "epoch": 0.1362558438918626, + "grad_norm": 10.536659240722656, + "learning_rate": 9.827503593675133e-05, + "loss": 1.1704, + "step": 2011 + }, + { + "epoch": 0.1363235991598347, + "grad_norm": 9.01259994506836, + "learning_rate": 9.827366691765351e-05, + "loss": 1.0531, + "step": 2012 + }, + { + "epoch": 0.13639135442780675, + "grad_norm": 10.476259231567383, + "learning_rate": 9.827229789855569e-05, + "loss": 1.053, + "step": 2013 + }, + { + "epoch": 0.13645910969577885, + "grad_norm": 8.496943473815918, + "learning_rate": 9.827092887945787e-05, + "loss": 0.836, + "step": 2014 + }, + { + "epoch": 0.13652686496375094, + "grad_norm": 11.638651847839355, + "learning_rate": 9.826955986036006e-05, + "loss": 1.0886, + "step": 2015 + }, + { + "epoch": 0.13659462023172303, + "grad_norm": 7.483241558074951, + "learning_rate": 9.826819084126224e-05, + "loss": 1.0406, + "step": 2016 + }, + { + "epoch": 0.1366623754996951, + "grad_norm": 8.648175239562988, + "learning_rate": 9.826682182216442e-05, + "loss": 1.0442, + "step": 2017 + }, + { + "epoch": 0.13673013076766719, + "grad_norm": 8.593596458435059, + "learning_rate": 9.82654528030666e-05, + "loss": 1.1184, + "step": 2018 + }, + { + "epoch": 0.13679788603563928, + "grad_norm": 8.563817977905273, + "learning_rate": 9.826408378396878e-05, + "loss": 0.8645, + "step": 2019 + }, + { + "epoch": 0.13686564130361137, + "grad_norm": 7.812311172485352, + "learning_rate": 9.826271476487098e-05, + "loss": 1.0605, + "step": 2020 + }, + { + "epoch": 0.13693339657158343, + "grad_norm": 10.97168254852295, + "learning_rate": 9.826134574577316e-05, + "loss": 1.1474, + "step": 2021 + }, + { + "epoch": 0.13700115183955552, + "grad_norm": 7.749011993408203, + "learning_rate": 9.825997672667534e-05, + "loss": 0.9836, + "step": 2022 + }, + { + "epoch": 0.13706890710752762, + "grad_norm": 11.088539123535156, + "learning_rate": 9.825860770757752e-05, + "loss": 1.1456, + "step": 2023 + }, + { + "epoch": 0.1371366623754997, + "grad_norm": 10.960288047790527, + "learning_rate": 9.825723868847971e-05, + "loss": 1.4315, + "step": 2024 + }, + { + "epoch": 0.13720441764347177, + "grad_norm": 10.804458618164062, + "learning_rate": 9.825586966938189e-05, + "loss": 1.0388, + "step": 2025 + }, + { + "epoch": 0.13727217291144386, + "grad_norm": 7.906947612762451, + "learning_rate": 9.825450065028407e-05, + "loss": 1.1306, + "step": 2026 + }, + { + "epoch": 0.13733992817941595, + "grad_norm": 10.404504776000977, + "learning_rate": 9.825313163118625e-05, + "loss": 1.0907, + "step": 2027 + }, + { + "epoch": 0.13740768344738805, + "grad_norm": 9.684488296508789, + "learning_rate": 9.825176261208843e-05, + "loss": 1.1007, + "step": 2028 + }, + { + "epoch": 0.1374754387153601, + "grad_norm": 8.345703125, + "learning_rate": 9.825039359299063e-05, + "loss": 0.9342, + "step": 2029 + }, + { + "epoch": 0.1375431939833322, + "grad_norm": 8.513103485107422, + "learning_rate": 9.824902457389281e-05, + "loss": 1.1139, + "step": 2030 + }, + { + "epoch": 0.1376109492513043, + "grad_norm": 8.675403594970703, + "learning_rate": 9.824765555479499e-05, + "loss": 0.8499, + "step": 2031 + }, + { + "epoch": 0.13767870451927638, + "grad_norm": 10.550504684448242, + "learning_rate": 9.824628653569717e-05, + "loss": 1.2563, + "step": 2032 + }, + { + "epoch": 0.13774645978724845, + "grad_norm": 7.775172710418701, + "learning_rate": 9.824491751659936e-05, + "loss": 1.0394, + "step": 2033 + }, + { + "epoch": 0.13781421505522054, + "grad_norm": 10.057134628295898, + "learning_rate": 9.824354849750154e-05, + "loss": 1.337, + "step": 2034 + }, + { + "epoch": 0.13788197032319263, + "grad_norm": 10.658480644226074, + "learning_rate": 9.824217947840372e-05, + "loss": 1.1675, + "step": 2035 + }, + { + "epoch": 0.13794972559116472, + "grad_norm": 10.499738693237305, + "learning_rate": 9.82408104593059e-05, + "loss": 1.1397, + "step": 2036 + }, + { + "epoch": 0.1380174808591368, + "grad_norm": 12.853378295898438, + "learning_rate": 9.82394414402081e-05, + "loss": 1.2373, + "step": 2037 + }, + { + "epoch": 0.13808523612710888, + "grad_norm": 8.66174602508545, + "learning_rate": 9.823807242111028e-05, + "loss": 0.9863, + "step": 2038 + }, + { + "epoch": 0.13815299139508097, + "grad_norm": 10.246259689331055, + "learning_rate": 9.823670340201246e-05, + "loss": 1.0892, + "step": 2039 + }, + { + "epoch": 0.13822074666305306, + "grad_norm": 11.880420684814453, + "learning_rate": 9.823533438291465e-05, + "loss": 1.069, + "step": 2040 + }, + { + "epoch": 0.13828850193102513, + "grad_norm": 9.475419998168945, + "learning_rate": 9.823396536381683e-05, + "loss": 0.9276, + "step": 2041 + }, + { + "epoch": 0.13835625719899722, + "grad_norm": 9.827219009399414, + "learning_rate": 9.823259634471903e-05, + "loss": 1.2536, + "step": 2042 + }, + { + "epoch": 0.1384240124669693, + "grad_norm": 8.558990478515625, + "learning_rate": 9.82312273256212e-05, + "loss": 1.2779, + "step": 2043 + }, + { + "epoch": 0.1384917677349414, + "grad_norm": 12.038803100585938, + "learning_rate": 9.822985830652339e-05, + "loss": 0.9019, + "step": 2044 + }, + { + "epoch": 0.13855952300291347, + "grad_norm": 10.764846801757812, + "learning_rate": 9.822848928742557e-05, + "loss": 1.1803, + "step": 2045 + }, + { + "epoch": 0.13862727827088556, + "grad_norm": 10.788616180419922, + "learning_rate": 9.822712026832775e-05, + "loss": 1.3469, + "step": 2046 + }, + { + "epoch": 0.13869503353885765, + "grad_norm": 8.718049049377441, + "learning_rate": 9.822575124922994e-05, + "loss": 0.9135, + "step": 2047 + }, + { + "epoch": 0.13876278880682974, + "grad_norm": 12.245726585388184, + "learning_rate": 9.822438223013212e-05, + "loss": 1.3114, + "step": 2048 + }, + { + "epoch": 0.1388305440748018, + "grad_norm": 10.55252456665039, + "learning_rate": 9.82230132110343e-05, + "loss": 1.1303, + "step": 2049 + }, + { + "epoch": 0.1388982993427739, + "grad_norm": 8.177289962768555, + "learning_rate": 9.822164419193648e-05, + "loss": 0.9796, + "step": 2050 + }, + { + "epoch": 0.138966054610746, + "grad_norm": 10.708680152893066, + "learning_rate": 9.822027517283866e-05, + "loss": 1.1546, + "step": 2051 + }, + { + "epoch": 0.13903380987871808, + "grad_norm": 8.631631851196289, + "learning_rate": 9.821890615374086e-05, + "loss": 1.0055, + "step": 2052 + }, + { + "epoch": 0.13910156514669014, + "grad_norm": 8.778770446777344, + "learning_rate": 9.821753713464304e-05, + "loss": 0.9626, + "step": 2053 + }, + { + "epoch": 0.13916932041466223, + "grad_norm": 10.004143714904785, + "learning_rate": 9.821616811554522e-05, + "loss": 1.1411, + "step": 2054 + }, + { + "epoch": 0.13923707568263433, + "grad_norm": 9.47324275970459, + "learning_rate": 9.82147990964474e-05, + "loss": 1.0081, + "step": 2055 + }, + { + "epoch": 0.13930483095060642, + "grad_norm": 10.014934539794922, + "learning_rate": 9.821343007734959e-05, + "loss": 1.3015, + "step": 2056 + }, + { + "epoch": 0.13937258621857848, + "grad_norm": 10.589959144592285, + "learning_rate": 9.821206105825177e-05, + "loss": 1.128, + "step": 2057 + }, + { + "epoch": 0.13944034148655057, + "grad_norm": 9.273834228515625, + "learning_rate": 9.821069203915395e-05, + "loss": 0.7743, + "step": 2058 + }, + { + "epoch": 0.13950809675452266, + "grad_norm": 10.72019100189209, + "learning_rate": 9.820932302005613e-05, + "loss": 0.9524, + "step": 2059 + }, + { + "epoch": 0.13957585202249476, + "grad_norm": 11.212404251098633, + "learning_rate": 9.820795400095831e-05, + "loss": 1.2044, + "step": 2060 + }, + { + "epoch": 0.13964360729046682, + "grad_norm": 9.800296783447266, + "learning_rate": 9.82065849818605e-05, + "loss": 1.0819, + "step": 2061 + }, + { + "epoch": 0.1397113625584389, + "grad_norm": 8.668676376342773, + "learning_rate": 9.820521596276269e-05, + "loss": 1.0778, + "step": 2062 + }, + { + "epoch": 0.139779117826411, + "grad_norm": 10.854613304138184, + "learning_rate": 9.820384694366487e-05, + "loss": 1.1181, + "step": 2063 + }, + { + "epoch": 0.1398468730943831, + "grad_norm": 12.019871711730957, + "learning_rate": 9.820247792456705e-05, + "loss": 1.2507, + "step": 2064 + }, + { + "epoch": 0.1399146283623552, + "grad_norm": 8.835234642028809, + "learning_rate": 9.820110890546924e-05, + "loss": 1.1338, + "step": 2065 + }, + { + "epoch": 0.13998238363032725, + "grad_norm": 8.57636547088623, + "learning_rate": 9.819973988637142e-05, + "loss": 0.957, + "step": 2066 + }, + { + "epoch": 0.14005013889829934, + "grad_norm": 9.080939292907715, + "learning_rate": 9.81983708672736e-05, + "loss": 1.0035, + "step": 2067 + }, + { + "epoch": 0.14011789416627143, + "grad_norm": 9.616862297058105, + "learning_rate": 9.819700184817578e-05, + "loss": 0.9045, + "step": 2068 + }, + { + "epoch": 0.14018564943424353, + "grad_norm": 10.04652214050293, + "learning_rate": 9.819563282907796e-05, + "loss": 1.1202, + "step": 2069 + }, + { + "epoch": 0.1402534047022156, + "grad_norm": 9.132543563842773, + "learning_rate": 9.819426380998016e-05, + "loss": 0.9918, + "step": 2070 + }, + { + "epoch": 0.14032115997018768, + "grad_norm": 12.023603439331055, + "learning_rate": 9.819289479088234e-05, + "loss": 1.1271, + "step": 2071 + }, + { + "epoch": 0.14038891523815977, + "grad_norm": 9.167064666748047, + "learning_rate": 9.819152577178452e-05, + "loss": 0.9136, + "step": 2072 + }, + { + "epoch": 0.14045667050613186, + "grad_norm": 11.954336166381836, + "learning_rate": 9.81901567526867e-05, + "loss": 1.0073, + "step": 2073 + }, + { + "epoch": 0.14052442577410393, + "grad_norm": 9.166129112243652, + "learning_rate": 9.818878773358888e-05, + "loss": 1.1101, + "step": 2074 + }, + { + "epoch": 0.14059218104207602, + "grad_norm": 8.53992748260498, + "learning_rate": 9.818741871449107e-05, + "loss": 0.9556, + "step": 2075 + }, + { + "epoch": 0.1406599363100481, + "grad_norm": 10.769463539123535, + "learning_rate": 9.818604969539325e-05, + "loss": 1.3009, + "step": 2076 + }, + { + "epoch": 0.1407276915780202, + "grad_norm": 10.938043594360352, + "learning_rate": 9.818468067629543e-05, + "loss": 1.4299, + "step": 2077 + }, + { + "epoch": 0.14079544684599227, + "grad_norm": 10.8585844039917, + "learning_rate": 9.818331165719761e-05, + "loss": 1.0804, + "step": 2078 + }, + { + "epoch": 0.14086320211396436, + "grad_norm": 8.763557434082031, + "learning_rate": 9.81819426380998e-05, + "loss": 0.8321, + "step": 2079 + }, + { + "epoch": 0.14093095738193645, + "grad_norm": 11.917708396911621, + "learning_rate": 9.818057361900199e-05, + "loss": 1.4293, + "step": 2080 + }, + { + "epoch": 0.14099871264990854, + "grad_norm": 10.189987182617188, + "learning_rate": 9.817920459990417e-05, + "loss": 1.139, + "step": 2081 + }, + { + "epoch": 0.1410664679178806, + "grad_norm": 11.09156608581543, + "learning_rate": 9.817783558080635e-05, + "loss": 0.9866, + "step": 2082 + }, + { + "epoch": 0.1411342231858527, + "grad_norm": 9.394566535949707, + "learning_rate": 9.817646656170854e-05, + "loss": 0.9401, + "step": 2083 + }, + { + "epoch": 0.1412019784538248, + "grad_norm": 8.232394218444824, + "learning_rate": 9.817509754261072e-05, + "loss": 0.9193, + "step": 2084 + }, + { + "epoch": 0.14126973372179688, + "grad_norm": 9.4952392578125, + "learning_rate": 9.81737285235129e-05, + "loss": 1.3217, + "step": 2085 + }, + { + "epoch": 0.14133748898976894, + "grad_norm": 10.110014915466309, + "learning_rate": 9.81723595044151e-05, + "loss": 1.0413, + "step": 2086 + }, + { + "epoch": 0.14140524425774104, + "grad_norm": 10.850225448608398, + "learning_rate": 9.817099048531728e-05, + "loss": 1.3722, + "step": 2087 + }, + { + "epoch": 0.14147299952571313, + "grad_norm": 10.924457550048828, + "learning_rate": 9.816962146621946e-05, + "loss": 1.4004, + "step": 2088 + }, + { + "epoch": 0.14154075479368522, + "grad_norm": 10.022381782531738, + "learning_rate": 9.816825244712165e-05, + "loss": 1.1989, + "step": 2089 + }, + { + "epoch": 0.14160851006165728, + "grad_norm": 11.537693977355957, + "learning_rate": 9.816688342802383e-05, + "loss": 1.1922, + "step": 2090 + }, + { + "epoch": 0.14167626532962938, + "grad_norm": 8.525372505187988, + "learning_rate": 9.816551440892601e-05, + "loss": 0.9291, + "step": 2091 + }, + { + "epoch": 0.14174402059760147, + "grad_norm": 8.972722053527832, + "learning_rate": 9.816414538982819e-05, + "loss": 1.0917, + "step": 2092 + }, + { + "epoch": 0.14181177586557356, + "grad_norm": 8.386235237121582, + "learning_rate": 9.816277637073039e-05, + "loss": 1.0482, + "step": 2093 + }, + { + "epoch": 0.14187953113354562, + "grad_norm": 9.620473861694336, + "learning_rate": 9.816140735163257e-05, + "loss": 1.2523, + "step": 2094 + }, + { + "epoch": 0.1419472864015177, + "grad_norm": 9.236804962158203, + "learning_rate": 9.816003833253475e-05, + "loss": 0.9474, + "step": 2095 + }, + { + "epoch": 0.1420150416694898, + "grad_norm": 9.06925106048584, + "learning_rate": 9.815866931343693e-05, + "loss": 0.8921, + "step": 2096 + }, + { + "epoch": 0.1420827969374619, + "grad_norm": 10.127729415893555, + "learning_rate": 9.815730029433912e-05, + "loss": 1.1693, + "step": 2097 + }, + { + "epoch": 0.14215055220543396, + "grad_norm": 10.703007698059082, + "learning_rate": 9.81559312752413e-05, + "loss": 1.0652, + "step": 2098 + }, + { + "epoch": 0.14221830747340605, + "grad_norm": 10.004093170166016, + "learning_rate": 9.815456225614348e-05, + "loss": 1.0158, + "step": 2099 + }, + { + "epoch": 0.14228606274137814, + "grad_norm": 8.900236129760742, + "learning_rate": 9.815319323704566e-05, + "loss": 0.7327, + "step": 2100 + }, + { + "epoch": 0.14235381800935024, + "grad_norm": 13.102290153503418, + "learning_rate": 9.815182421794784e-05, + "loss": 1.2837, + "step": 2101 + }, + { + "epoch": 0.1424215732773223, + "grad_norm": 9.32657527923584, + "learning_rate": 9.815045519885004e-05, + "loss": 1.0812, + "step": 2102 + }, + { + "epoch": 0.1424893285452944, + "grad_norm": 11.42785930633545, + "learning_rate": 9.814908617975222e-05, + "loss": 1.4467, + "step": 2103 + }, + { + "epoch": 0.14255708381326648, + "grad_norm": 9.878450393676758, + "learning_rate": 9.81477171606544e-05, + "loss": 1.0774, + "step": 2104 + }, + { + "epoch": 0.14262483908123857, + "grad_norm": 9.329227447509766, + "learning_rate": 9.814634814155658e-05, + "loss": 1.2536, + "step": 2105 + }, + { + "epoch": 0.14269259434921064, + "grad_norm": 9.071858406066895, + "learning_rate": 9.814497912245876e-05, + "loss": 1.148, + "step": 2106 + }, + { + "epoch": 0.14276034961718273, + "grad_norm": 11.595810890197754, + "learning_rate": 9.814361010336095e-05, + "loss": 1.1992, + "step": 2107 + }, + { + "epoch": 0.14282810488515482, + "grad_norm": 10.041107177734375, + "learning_rate": 9.814224108426313e-05, + "loss": 1.0447, + "step": 2108 + }, + { + "epoch": 0.1428958601531269, + "grad_norm": 9.913261413574219, + "learning_rate": 9.814087206516531e-05, + "loss": 1.1629, + "step": 2109 + }, + { + "epoch": 0.14296361542109898, + "grad_norm": 10.983177185058594, + "learning_rate": 9.813950304606749e-05, + "loss": 1.0288, + "step": 2110 + }, + { + "epoch": 0.14303137068907107, + "grad_norm": 10.479610443115234, + "learning_rate": 9.813813402696969e-05, + "loss": 1.5308, + "step": 2111 + }, + { + "epoch": 0.14309912595704316, + "grad_norm": 8.70734691619873, + "learning_rate": 9.813676500787187e-05, + "loss": 0.9987, + "step": 2112 + }, + { + "epoch": 0.14316688122501525, + "grad_norm": 8.242984771728516, + "learning_rate": 9.813539598877405e-05, + "loss": 1.0051, + "step": 2113 + }, + { + "epoch": 0.14323463649298732, + "grad_norm": 8.627467155456543, + "learning_rate": 9.813402696967623e-05, + "loss": 1.1134, + "step": 2114 + }, + { + "epoch": 0.1433023917609594, + "grad_norm": 9.970986366271973, + "learning_rate": 9.813265795057841e-05, + "loss": 1.148, + "step": 2115 + }, + { + "epoch": 0.1433701470289315, + "grad_norm": 8.719794273376465, + "learning_rate": 9.81312889314806e-05, + "loss": 1.1013, + "step": 2116 + }, + { + "epoch": 0.1434379022969036, + "grad_norm": 9.455860137939453, + "learning_rate": 9.812991991238278e-05, + "loss": 1.2333, + "step": 2117 + }, + { + "epoch": 0.14350565756487568, + "grad_norm": 8.851629257202148, + "learning_rate": 9.812855089328496e-05, + "loss": 1.0985, + "step": 2118 + }, + { + "epoch": 0.14357341283284775, + "grad_norm": 11.843599319458008, + "learning_rate": 9.812718187418714e-05, + "loss": 1.2292, + "step": 2119 + }, + { + "epoch": 0.14364116810081984, + "grad_norm": 9.550506591796875, + "learning_rate": 9.812581285508934e-05, + "loss": 0.9466, + "step": 2120 + }, + { + "epoch": 0.14370892336879193, + "grad_norm": 9.234643936157227, + "learning_rate": 9.812444383599152e-05, + "loss": 1.1968, + "step": 2121 + }, + { + "epoch": 0.14377667863676402, + "grad_norm": 9.365439414978027, + "learning_rate": 9.81230748168937e-05, + "loss": 1.1643, + "step": 2122 + }, + { + "epoch": 0.14384443390473609, + "grad_norm": 9.476024627685547, + "learning_rate": 9.812170579779588e-05, + "loss": 1.0541, + "step": 2123 + }, + { + "epoch": 0.14391218917270818, + "grad_norm": 9.53847885131836, + "learning_rate": 9.812033677869806e-05, + "loss": 1.0808, + "step": 2124 + }, + { + "epoch": 0.14397994444068027, + "grad_norm": 8.918699264526367, + "learning_rate": 9.811896775960025e-05, + "loss": 1.1055, + "step": 2125 + }, + { + "epoch": 0.14404769970865236, + "grad_norm": 12.126187324523926, + "learning_rate": 9.811759874050243e-05, + "loss": 1.6079, + "step": 2126 + }, + { + "epoch": 0.14411545497662442, + "grad_norm": 10.922599792480469, + "learning_rate": 9.811622972140461e-05, + "loss": 1.2569, + "step": 2127 + }, + { + "epoch": 0.14418321024459652, + "grad_norm": 8.582880020141602, + "learning_rate": 9.811486070230679e-05, + "loss": 0.9562, + "step": 2128 + }, + { + "epoch": 0.1442509655125686, + "grad_norm": 8.926568031311035, + "learning_rate": 9.811349168320899e-05, + "loss": 1.0108, + "step": 2129 + }, + { + "epoch": 0.1443187207805407, + "grad_norm": 13.906332015991211, + "learning_rate": 9.811212266411117e-05, + "loss": 1.1851, + "step": 2130 + }, + { + "epoch": 0.14438647604851276, + "grad_norm": 10.374212265014648, + "learning_rate": 9.811075364501335e-05, + "loss": 1.2661, + "step": 2131 + }, + { + "epoch": 0.14445423131648485, + "grad_norm": 10.580810546875, + "learning_rate": 9.810938462591554e-05, + "loss": 0.8967, + "step": 2132 + }, + { + "epoch": 0.14452198658445695, + "grad_norm": 8.91865062713623, + "learning_rate": 9.810801560681772e-05, + "loss": 1.032, + "step": 2133 + }, + { + "epoch": 0.14458974185242904, + "grad_norm": 8.16041374206543, + "learning_rate": 9.81066465877199e-05, + "loss": 0.8622, + "step": 2134 + }, + { + "epoch": 0.1446574971204011, + "grad_norm": 8.575905799865723, + "learning_rate": 9.81052775686221e-05, + "loss": 0.9956, + "step": 2135 + }, + { + "epoch": 0.1447252523883732, + "grad_norm": 9.620558738708496, + "learning_rate": 9.810390854952428e-05, + "loss": 1.1442, + "step": 2136 + }, + { + "epoch": 0.14479300765634529, + "grad_norm": 10.390005111694336, + "learning_rate": 9.810253953042646e-05, + "loss": 1.0518, + "step": 2137 + }, + { + "epoch": 0.14486076292431738, + "grad_norm": 10.714217185974121, + "learning_rate": 9.810117051132864e-05, + "loss": 1.3215, + "step": 2138 + }, + { + "epoch": 0.14492851819228944, + "grad_norm": 9.390836715698242, + "learning_rate": 9.809980149223083e-05, + "loss": 1.0936, + "step": 2139 + }, + { + "epoch": 0.14499627346026153, + "grad_norm": 10.848445892333984, + "learning_rate": 9.809843247313301e-05, + "loss": 1.1946, + "step": 2140 + }, + { + "epoch": 0.14506402872823362, + "grad_norm": 11.041672706604004, + "learning_rate": 9.809706345403519e-05, + "loss": 1.0422, + "step": 2141 + }, + { + "epoch": 0.14513178399620572, + "grad_norm": 10.1151762008667, + "learning_rate": 9.809569443493737e-05, + "loss": 1.2199, + "step": 2142 + }, + { + "epoch": 0.14519953926417778, + "grad_norm": 8.686629295349121, + "learning_rate": 9.809432541583957e-05, + "loss": 0.9861, + "step": 2143 + }, + { + "epoch": 0.14526729453214987, + "grad_norm": 10.579313278198242, + "learning_rate": 9.809295639674175e-05, + "loss": 0.9372, + "step": 2144 + }, + { + "epoch": 0.14533504980012196, + "grad_norm": 8.916631698608398, + "learning_rate": 9.809158737764393e-05, + "loss": 0.8024, + "step": 2145 + }, + { + "epoch": 0.14540280506809405, + "grad_norm": 9.29333209991455, + "learning_rate": 9.80902183585461e-05, + "loss": 1.0785, + "step": 2146 + }, + { + "epoch": 0.14547056033606612, + "grad_norm": 10.584277153015137, + "learning_rate": 9.808884933944829e-05, + "loss": 0.9167, + "step": 2147 + }, + { + "epoch": 0.1455383156040382, + "grad_norm": 10.68551254272461, + "learning_rate": 9.808748032035048e-05, + "loss": 1.0019, + "step": 2148 + }, + { + "epoch": 0.1456060708720103, + "grad_norm": 9.057500839233398, + "learning_rate": 9.808611130125266e-05, + "loss": 1.194, + "step": 2149 + }, + { + "epoch": 0.1456738261399824, + "grad_norm": 8.642207145690918, + "learning_rate": 9.808474228215484e-05, + "loss": 1.0849, + "step": 2150 + }, + { + "epoch": 0.14574158140795446, + "grad_norm": 9.460419654846191, + "learning_rate": 9.808337326305702e-05, + "loss": 1.1747, + "step": 2151 + }, + { + "epoch": 0.14580933667592655, + "grad_norm": 9.003097534179688, + "learning_rate": 9.80820042439592e-05, + "loss": 0.8967, + "step": 2152 + }, + { + "epoch": 0.14587709194389864, + "grad_norm": 9.85009765625, + "learning_rate": 9.80806352248614e-05, + "loss": 1.1698, + "step": 2153 + }, + { + "epoch": 0.14594484721187073, + "grad_norm": 10.233894348144531, + "learning_rate": 9.807926620576358e-05, + "loss": 1.0496, + "step": 2154 + }, + { + "epoch": 0.1460126024798428, + "grad_norm": 9.660355567932129, + "learning_rate": 9.807789718666576e-05, + "loss": 0.9616, + "step": 2155 + }, + { + "epoch": 0.1460803577478149, + "grad_norm": 9.46469497680664, + "learning_rate": 9.807652816756794e-05, + "loss": 1.0005, + "step": 2156 + }, + { + "epoch": 0.14614811301578698, + "grad_norm": 8.004712104797363, + "learning_rate": 9.807515914847013e-05, + "loss": 0.8455, + "step": 2157 + }, + { + "epoch": 0.14621586828375907, + "grad_norm": 10.039002418518066, + "learning_rate": 9.807379012937231e-05, + "loss": 1.097, + "step": 2158 + }, + { + "epoch": 0.14628362355173113, + "grad_norm": 11.16292667388916, + "learning_rate": 9.807242111027449e-05, + "loss": 1.4332, + "step": 2159 + }, + { + "epoch": 0.14635137881970323, + "grad_norm": 9.34833812713623, + "learning_rate": 9.807105209117667e-05, + "loss": 1.0438, + "step": 2160 + }, + { + "epoch": 0.14641913408767532, + "grad_norm": 10.256768226623535, + "learning_rate": 9.806968307207885e-05, + "loss": 1.2923, + "step": 2161 + }, + { + "epoch": 0.1464868893556474, + "grad_norm": 8.890941619873047, + "learning_rate": 9.806831405298105e-05, + "loss": 0.8419, + "step": 2162 + }, + { + "epoch": 0.14655464462361947, + "grad_norm": 9.340752601623535, + "learning_rate": 9.806694503388323e-05, + "loss": 0.9504, + "step": 2163 + }, + { + "epoch": 0.14662239989159156, + "grad_norm": 10.89192008972168, + "learning_rate": 9.80655760147854e-05, + "loss": 1.012, + "step": 2164 + }, + { + "epoch": 0.14669015515956366, + "grad_norm": 8.626432418823242, + "learning_rate": 9.806420699568759e-05, + "loss": 0.8943, + "step": 2165 + }, + { + "epoch": 0.14675791042753575, + "grad_norm": 9.465259552001953, + "learning_rate": 9.806283797658978e-05, + "loss": 1.1617, + "step": 2166 + }, + { + "epoch": 0.1468256656955078, + "grad_norm": 11.695369720458984, + "learning_rate": 9.806146895749196e-05, + "loss": 1.2719, + "step": 2167 + }, + { + "epoch": 0.1468934209634799, + "grad_norm": 9.416814804077148, + "learning_rate": 9.806009993839414e-05, + "loss": 1.3615, + "step": 2168 + }, + { + "epoch": 0.146961176231452, + "grad_norm": 9.6004638671875, + "learning_rate": 9.805873091929632e-05, + "loss": 1.2571, + "step": 2169 + }, + { + "epoch": 0.1470289314994241, + "grad_norm": 9.187546730041504, + "learning_rate": 9.80573619001985e-05, + "loss": 0.962, + "step": 2170 + }, + { + "epoch": 0.14709668676739618, + "grad_norm": 10.691286087036133, + "learning_rate": 9.80559928811007e-05, + "loss": 1.0619, + "step": 2171 + }, + { + "epoch": 0.14716444203536824, + "grad_norm": 9.17457103729248, + "learning_rate": 9.805462386200288e-05, + "loss": 1.1507, + "step": 2172 + }, + { + "epoch": 0.14723219730334033, + "grad_norm": 10.739082336425781, + "learning_rate": 9.805325484290506e-05, + "loss": 1.2929, + "step": 2173 + }, + { + "epoch": 0.14729995257131243, + "grad_norm": 9.091232299804688, + "learning_rate": 9.805188582380724e-05, + "loss": 0.9422, + "step": 2174 + }, + { + "epoch": 0.14736770783928452, + "grad_norm": 8.231295585632324, + "learning_rate": 9.805051680470943e-05, + "loss": 0.9695, + "step": 2175 + }, + { + "epoch": 0.14743546310725658, + "grad_norm": 9.622401237487793, + "learning_rate": 9.804914778561161e-05, + "loss": 0.982, + "step": 2176 + }, + { + "epoch": 0.14750321837522867, + "grad_norm": 9.49541187286377, + "learning_rate": 9.804777876651379e-05, + "loss": 0.9005, + "step": 2177 + }, + { + "epoch": 0.14757097364320076, + "grad_norm": 10.584654808044434, + "learning_rate": 9.804640974741599e-05, + "loss": 0.9427, + "step": 2178 + }, + { + "epoch": 0.14763872891117286, + "grad_norm": 9.132317543029785, + "learning_rate": 9.804504072831817e-05, + "loss": 1.0192, + "step": 2179 + }, + { + "epoch": 0.14770648417914492, + "grad_norm": 8.598082542419434, + "learning_rate": 9.804367170922035e-05, + "loss": 0.9842, + "step": 2180 + }, + { + "epoch": 0.147774239447117, + "grad_norm": 8.941360473632812, + "learning_rate": 9.804230269012254e-05, + "loss": 0.9907, + "step": 2181 + }, + { + "epoch": 0.1478419947150891, + "grad_norm": 8.119913101196289, + "learning_rate": 9.804093367102472e-05, + "loss": 1.1259, + "step": 2182 + }, + { + "epoch": 0.1479097499830612, + "grad_norm": 9.505135536193848, + "learning_rate": 9.80395646519269e-05, + "loss": 1.0509, + "step": 2183 + }, + { + "epoch": 0.14797750525103326, + "grad_norm": 8.420906066894531, + "learning_rate": 9.803819563282908e-05, + "loss": 1.1827, + "step": 2184 + }, + { + "epoch": 0.14804526051900535, + "grad_norm": 9.860353469848633, + "learning_rate": 9.803682661373128e-05, + "loss": 1.0286, + "step": 2185 + }, + { + "epoch": 0.14811301578697744, + "grad_norm": 8.259954452514648, + "learning_rate": 9.803545759463346e-05, + "loss": 1.0094, + "step": 2186 + }, + { + "epoch": 0.14818077105494953, + "grad_norm": 10.46882438659668, + "learning_rate": 9.803408857553564e-05, + "loss": 1.1817, + "step": 2187 + }, + { + "epoch": 0.1482485263229216, + "grad_norm": 9.315580368041992, + "learning_rate": 9.803271955643782e-05, + "loss": 1.2816, + "step": 2188 + }, + { + "epoch": 0.1483162815908937, + "grad_norm": 10.408548355102539, + "learning_rate": 9.803135053734001e-05, + "loss": 1.0355, + "step": 2189 + }, + { + "epoch": 0.14838403685886578, + "grad_norm": 9.682003021240234, + "learning_rate": 9.802998151824219e-05, + "loss": 1.0413, + "step": 2190 + }, + { + "epoch": 0.14845179212683787, + "grad_norm": 10.041797637939453, + "learning_rate": 9.802861249914437e-05, + "loss": 0.8388, + "step": 2191 + }, + { + "epoch": 0.14851954739480994, + "grad_norm": 8.367657661437988, + "learning_rate": 9.802724348004655e-05, + "loss": 1.0724, + "step": 2192 + }, + { + "epoch": 0.14858730266278203, + "grad_norm": 9.9558744430542, + "learning_rate": 9.802587446094873e-05, + "loss": 1.3534, + "step": 2193 + }, + { + "epoch": 0.14865505793075412, + "grad_norm": 9.244332313537598, + "learning_rate": 9.802450544185093e-05, + "loss": 1.366, + "step": 2194 + }, + { + "epoch": 0.1487228131987262, + "grad_norm": 9.560718536376953, + "learning_rate": 9.80231364227531e-05, + "loss": 1.0556, + "step": 2195 + }, + { + "epoch": 0.14879056846669828, + "grad_norm": 8.724915504455566, + "learning_rate": 9.802176740365529e-05, + "loss": 1.316, + "step": 2196 + }, + { + "epoch": 0.14885832373467037, + "grad_norm": 9.468677520751953, + "learning_rate": 9.802039838455747e-05, + "loss": 1.1289, + "step": 2197 + }, + { + "epoch": 0.14892607900264246, + "grad_norm": 8.56347942352295, + "learning_rate": 9.801902936545966e-05, + "loss": 1.0402, + "step": 2198 + }, + { + "epoch": 0.14899383427061455, + "grad_norm": 9.328559875488281, + "learning_rate": 9.801766034636184e-05, + "loss": 1.186, + "step": 2199 + }, + { + "epoch": 0.14906158953858661, + "grad_norm": 10.20579719543457, + "learning_rate": 9.801629132726402e-05, + "loss": 1.0896, + "step": 2200 + }, + { + "epoch": 0.1491293448065587, + "grad_norm": 11.614208221435547, + "learning_rate": 9.80149223081662e-05, + "loss": 1.2742, + "step": 2201 + }, + { + "epoch": 0.1491971000745308, + "grad_norm": 8.291358947753906, + "learning_rate": 9.801355328906838e-05, + "loss": 1.0299, + "step": 2202 + }, + { + "epoch": 0.1492648553425029, + "grad_norm": 8.734344482421875, + "learning_rate": 9.801218426997058e-05, + "loss": 1.0922, + "step": 2203 + }, + { + "epoch": 0.14933261061047495, + "grad_norm": 8.498616218566895, + "learning_rate": 9.801081525087276e-05, + "loss": 0.8436, + "step": 2204 + }, + { + "epoch": 0.14940036587844704, + "grad_norm": 7.4501447677612305, + "learning_rate": 9.800944623177494e-05, + "loss": 0.8629, + "step": 2205 + }, + { + "epoch": 0.14946812114641914, + "grad_norm": 11.652278900146484, + "learning_rate": 9.800807721267712e-05, + "loss": 1.1089, + "step": 2206 + }, + { + "epoch": 0.14953587641439123, + "grad_norm": 11.043471336364746, + "learning_rate": 9.80067081935793e-05, + "loss": 1.2758, + "step": 2207 + }, + { + "epoch": 0.1496036316823633, + "grad_norm": 9.376245498657227, + "learning_rate": 9.800533917448149e-05, + "loss": 1.0178, + "step": 2208 + }, + { + "epoch": 0.14967138695033538, + "grad_norm": 8.678750038146973, + "learning_rate": 9.800397015538367e-05, + "loss": 1.0276, + "step": 2209 + }, + { + "epoch": 0.14973914221830747, + "grad_norm": 10.233052253723145, + "learning_rate": 9.800260113628585e-05, + "loss": 1.0468, + "step": 2210 + }, + { + "epoch": 0.14980689748627957, + "grad_norm": 10.711477279663086, + "learning_rate": 9.800123211718803e-05, + "loss": 1.3919, + "step": 2211 + }, + { + "epoch": 0.14987465275425163, + "grad_norm": 10.130411148071289, + "learning_rate": 9.799986309809023e-05, + "loss": 1.1085, + "step": 2212 + }, + { + "epoch": 0.14994240802222372, + "grad_norm": 9.64091682434082, + "learning_rate": 9.79984940789924e-05, + "loss": 1.0661, + "step": 2213 + }, + { + "epoch": 0.1500101632901958, + "grad_norm": 9.80176830291748, + "learning_rate": 9.799712505989459e-05, + "loss": 1.3426, + "step": 2214 + }, + { + "epoch": 0.1500779185581679, + "grad_norm": 9.34835147857666, + "learning_rate": 9.799575604079677e-05, + "loss": 1.0281, + "step": 2215 + }, + { + "epoch": 0.15014567382613997, + "grad_norm": 8.638712882995605, + "learning_rate": 9.799438702169895e-05, + "loss": 0.9037, + "step": 2216 + }, + { + "epoch": 0.15021342909411206, + "grad_norm": 10.499733924865723, + "learning_rate": 9.799301800260114e-05, + "loss": 1.1152, + "step": 2217 + }, + { + "epoch": 0.15028118436208415, + "grad_norm": 9.093478202819824, + "learning_rate": 9.799164898350332e-05, + "loss": 1.2026, + "step": 2218 + }, + { + "epoch": 0.15034893963005624, + "grad_norm": 13.181863784790039, + "learning_rate": 9.79902799644055e-05, + "loss": 1.3256, + "step": 2219 + }, + { + "epoch": 0.1504166948980283, + "grad_norm": 10.221563339233398, + "learning_rate": 9.798891094530768e-05, + "loss": 1.1867, + "step": 2220 + }, + { + "epoch": 0.1504844501660004, + "grad_norm": 9.512944221496582, + "learning_rate": 9.798754192620988e-05, + "loss": 1.1145, + "step": 2221 + }, + { + "epoch": 0.1505522054339725, + "grad_norm": 8.991315841674805, + "learning_rate": 9.798617290711206e-05, + "loss": 1.0289, + "step": 2222 + }, + { + "epoch": 0.15061996070194458, + "grad_norm": 9.003118515014648, + "learning_rate": 9.798480388801424e-05, + "loss": 0.9618, + "step": 2223 + }, + { + "epoch": 0.15068771596991667, + "grad_norm": 11.337166786193848, + "learning_rate": 9.798343486891643e-05, + "loss": 1.3864, + "step": 2224 + }, + { + "epoch": 0.15075547123788874, + "grad_norm": 9.193288803100586, + "learning_rate": 9.798206584981861e-05, + "loss": 1.1314, + "step": 2225 + }, + { + "epoch": 0.15082322650586083, + "grad_norm": 10.137048721313477, + "learning_rate": 9.798069683072079e-05, + "loss": 1.3325, + "step": 2226 + }, + { + "epoch": 0.15089098177383292, + "grad_norm": 8.248672485351562, + "learning_rate": 9.797932781162299e-05, + "loss": 1.0688, + "step": 2227 + }, + { + "epoch": 0.150958737041805, + "grad_norm": 10.51007080078125, + "learning_rate": 9.797795879252517e-05, + "loss": 1.2191, + "step": 2228 + }, + { + "epoch": 0.15102649230977708, + "grad_norm": 14.15853214263916, + "learning_rate": 9.797658977342735e-05, + "loss": 1.0961, + "step": 2229 + }, + { + "epoch": 0.15109424757774917, + "grad_norm": 8.42485523223877, + "learning_rate": 9.797522075432954e-05, + "loss": 1.0314, + "step": 2230 + }, + { + "epoch": 0.15116200284572126, + "grad_norm": 9.3310546875, + "learning_rate": 9.797385173523172e-05, + "loss": 1.2372, + "step": 2231 + }, + { + "epoch": 0.15122975811369335, + "grad_norm": 9.323569297790527, + "learning_rate": 9.79724827161339e-05, + "loss": 1.1823, + "step": 2232 + }, + { + "epoch": 0.15129751338166542, + "grad_norm": 9.719592094421387, + "learning_rate": 9.797111369703608e-05, + "loss": 1.0689, + "step": 2233 + }, + { + "epoch": 0.1513652686496375, + "grad_norm": 9.37340259552002, + "learning_rate": 9.796974467793826e-05, + "loss": 1.0703, + "step": 2234 + }, + { + "epoch": 0.1514330239176096, + "grad_norm": 9.803778648376465, + "learning_rate": 9.796837565884045e-05, + "loss": 1.0422, + "step": 2235 + }, + { + "epoch": 0.1515007791855817, + "grad_norm": 8.317301750183105, + "learning_rate": 9.796700663974264e-05, + "loss": 0.9829, + "step": 2236 + }, + { + "epoch": 0.15156853445355375, + "grad_norm": 8.473258972167969, + "learning_rate": 9.796563762064482e-05, + "loss": 1.064, + "step": 2237 + }, + { + "epoch": 0.15163628972152585, + "grad_norm": 8.495006561279297, + "learning_rate": 9.7964268601547e-05, + "loss": 0.8646, + "step": 2238 + }, + { + "epoch": 0.15170404498949794, + "grad_norm": 9.140898704528809, + "learning_rate": 9.796289958244918e-05, + "loss": 1.068, + "step": 2239 + }, + { + "epoch": 0.15177180025747003, + "grad_norm": 10.264453887939453, + "learning_rate": 9.796153056335137e-05, + "loss": 0.9681, + "step": 2240 + }, + { + "epoch": 0.1518395555254421, + "grad_norm": 10.407776832580566, + "learning_rate": 9.796016154425355e-05, + "loss": 1.2538, + "step": 2241 + }, + { + "epoch": 0.15190731079341419, + "grad_norm": 10.143677711486816, + "learning_rate": 9.795879252515573e-05, + "loss": 1.138, + "step": 2242 + }, + { + "epoch": 0.15197506606138628, + "grad_norm": 12.528799057006836, + "learning_rate": 9.795742350605791e-05, + "loss": 1.2654, + "step": 2243 + }, + { + "epoch": 0.15204282132935837, + "grad_norm": 10.635498046875, + "learning_rate": 9.79560544869601e-05, + "loss": 1.2116, + "step": 2244 + }, + { + "epoch": 0.15211057659733043, + "grad_norm": 10.71164608001709, + "learning_rate": 9.795468546786229e-05, + "loss": 1.0262, + "step": 2245 + }, + { + "epoch": 0.15217833186530252, + "grad_norm": 9.121109962463379, + "learning_rate": 9.795331644876447e-05, + "loss": 1.1657, + "step": 2246 + }, + { + "epoch": 0.15224608713327462, + "grad_norm": 8.174636840820312, + "learning_rate": 9.795194742966665e-05, + "loss": 1.0027, + "step": 2247 + }, + { + "epoch": 0.1523138424012467, + "grad_norm": 8.763494491577148, + "learning_rate": 9.795057841056883e-05, + "loss": 1.1103, + "step": 2248 + }, + { + "epoch": 0.15238159766921877, + "grad_norm": 8.028278350830078, + "learning_rate": 9.794920939147102e-05, + "loss": 1.0205, + "step": 2249 + }, + { + "epoch": 0.15244935293719086, + "grad_norm": 8.96112060546875, + "learning_rate": 9.79478403723732e-05, + "loss": 1.2949, + "step": 2250 + }, + { + "epoch": 0.15251710820516295, + "grad_norm": 8.67423152923584, + "learning_rate": 9.794647135327538e-05, + "loss": 1.0602, + "step": 2251 + }, + { + "epoch": 0.15258486347313505, + "grad_norm": 9.45671272277832, + "learning_rate": 9.794510233417756e-05, + "loss": 1.0362, + "step": 2252 + }, + { + "epoch": 0.1526526187411071, + "grad_norm": 10.24669075012207, + "learning_rate": 9.794373331507976e-05, + "loss": 0.9918, + "step": 2253 + }, + { + "epoch": 0.1527203740090792, + "grad_norm": 9.014079093933105, + "learning_rate": 9.794236429598194e-05, + "loss": 1.1636, + "step": 2254 + }, + { + "epoch": 0.1527881292770513, + "grad_norm": 6.800943851470947, + "learning_rate": 9.794099527688412e-05, + "loss": 0.9089, + "step": 2255 + }, + { + "epoch": 0.15285588454502338, + "grad_norm": 9.90794849395752, + "learning_rate": 9.79396262577863e-05, + "loss": 0.7669, + "step": 2256 + }, + { + "epoch": 0.15292363981299545, + "grad_norm": 9.870927810668945, + "learning_rate": 9.793825723868848e-05, + "loss": 1.1243, + "step": 2257 + }, + { + "epoch": 0.15299139508096754, + "grad_norm": 9.707404136657715, + "learning_rate": 9.793688821959067e-05, + "loss": 1.1558, + "step": 2258 + }, + { + "epoch": 0.15305915034893963, + "grad_norm": 8.362896919250488, + "learning_rate": 9.793551920049285e-05, + "loss": 0.8601, + "step": 2259 + }, + { + "epoch": 0.15312690561691172, + "grad_norm": 9.536920547485352, + "learning_rate": 9.793415018139503e-05, + "loss": 1.2741, + "step": 2260 + }, + { + "epoch": 0.1531946608848838, + "grad_norm": 11.108535766601562, + "learning_rate": 9.793278116229721e-05, + "loss": 1.3118, + "step": 2261 + }, + { + "epoch": 0.15326241615285588, + "grad_norm": 7.281479358673096, + "learning_rate": 9.793141214319939e-05, + "loss": 0.9165, + "step": 2262 + }, + { + "epoch": 0.15333017142082797, + "grad_norm": 9.166728973388672, + "learning_rate": 9.793004312410159e-05, + "loss": 1.0845, + "step": 2263 + }, + { + "epoch": 0.15339792668880006, + "grad_norm": 11.539854049682617, + "learning_rate": 9.792867410500377e-05, + "loss": 1.1129, + "step": 2264 + }, + { + "epoch": 0.15346568195677213, + "grad_norm": 8.588869094848633, + "learning_rate": 9.792730508590595e-05, + "loss": 0.9698, + "step": 2265 + }, + { + "epoch": 0.15353343722474422, + "grad_norm": 8.270078659057617, + "learning_rate": 9.792593606680813e-05, + "loss": 1.2306, + "step": 2266 + }, + { + "epoch": 0.1536011924927163, + "grad_norm": 7.908688545227051, + "learning_rate": 9.792456704771032e-05, + "loss": 1.283, + "step": 2267 + }, + { + "epoch": 0.1536689477606884, + "grad_norm": 10.376410484313965, + "learning_rate": 9.79231980286125e-05, + "loss": 1.0246, + "step": 2268 + }, + { + "epoch": 0.15373670302866047, + "grad_norm": 9.517715454101562, + "learning_rate": 9.792182900951468e-05, + "loss": 0.9975, + "step": 2269 + }, + { + "epoch": 0.15380445829663256, + "grad_norm": 8.789438247680664, + "learning_rate": 9.792045999041688e-05, + "loss": 1.0886, + "step": 2270 + }, + { + "epoch": 0.15387221356460465, + "grad_norm": 9.649114608764648, + "learning_rate": 9.791909097131906e-05, + "loss": 1.1497, + "step": 2271 + }, + { + "epoch": 0.15393996883257674, + "grad_norm": 8.533876419067383, + "learning_rate": 9.791772195222124e-05, + "loss": 0.8701, + "step": 2272 + }, + { + "epoch": 0.1540077241005488, + "grad_norm": 10.64561653137207, + "learning_rate": 9.791635293312343e-05, + "loss": 1.2755, + "step": 2273 + }, + { + "epoch": 0.1540754793685209, + "grad_norm": 9.510658264160156, + "learning_rate": 9.791498391402561e-05, + "loss": 0.8772, + "step": 2274 + }, + { + "epoch": 0.154143234636493, + "grad_norm": 9.849981307983398, + "learning_rate": 9.791361489492779e-05, + "loss": 1.1689, + "step": 2275 + }, + { + "epoch": 0.15421098990446508, + "grad_norm": 8.152152061462402, + "learning_rate": 9.791224587582998e-05, + "loss": 0.8148, + "step": 2276 + }, + { + "epoch": 0.15427874517243717, + "grad_norm": 8.653456687927246, + "learning_rate": 9.791087685673216e-05, + "loss": 1.0061, + "step": 2277 + }, + { + "epoch": 0.15434650044040923, + "grad_norm": 6.928426742553711, + "learning_rate": 9.790950783763435e-05, + "loss": 0.9192, + "step": 2278 + }, + { + "epoch": 0.15441425570838133, + "grad_norm": 9.606708526611328, + "learning_rate": 9.790813881853653e-05, + "loss": 0.9956, + "step": 2279 + }, + { + "epoch": 0.15448201097635342, + "grad_norm": 9.42773151397705, + "learning_rate": 9.79067697994387e-05, + "loss": 0.9302, + "step": 2280 + }, + { + "epoch": 0.1545497662443255, + "grad_norm": 8.248319625854492, + "learning_rate": 9.79054007803409e-05, + "loss": 1.2318, + "step": 2281 + }, + { + "epoch": 0.15461752151229757, + "grad_norm": 9.706393241882324, + "learning_rate": 9.790403176124308e-05, + "loss": 1.1489, + "step": 2282 + }, + { + "epoch": 0.15468527678026966, + "grad_norm": 8.88716983795166, + "learning_rate": 9.790266274214526e-05, + "loss": 0.8686, + "step": 2283 + }, + { + "epoch": 0.15475303204824176, + "grad_norm": 8.596991539001465, + "learning_rate": 9.790129372304744e-05, + "loss": 1.0623, + "step": 2284 + }, + { + "epoch": 0.15482078731621385, + "grad_norm": 9.262425422668457, + "learning_rate": 9.789992470394962e-05, + "loss": 1.2067, + "step": 2285 + }, + { + "epoch": 0.1548885425841859, + "grad_norm": 7.772172927856445, + "learning_rate": 9.789855568485181e-05, + "loss": 0.9976, + "step": 2286 + }, + { + "epoch": 0.154956297852158, + "grad_norm": 9.320001602172852, + "learning_rate": 9.7897186665754e-05, + "loss": 1.1377, + "step": 2287 + }, + { + "epoch": 0.1550240531201301, + "grad_norm": 11.02434253692627, + "learning_rate": 9.789581764665618e-05, + "loss": 1.1365, + "step": 2288 + }, + { + "epoch": 0.1550918083881022, + "grad_norm": 9.90654182434082, + "learning_rate": 9.789444862755836e-05, + "loss": 1.208, + "step": 2289 + }, + { + "epoch": 0.15515956365607425, + "grad_norm": 9.591778755187988, + "learning_rate": 9.789307960846055e-05, + "loss": 1.2426, + "step": 2290 + }, + { + "epoch": 0.15522731892404634, + "grad_norm": 9.221457481384277, + "learning_rate": 9.789171058936273e-05, + "loss": 1.1449, + "step": 2291 + }, + { + "epoch": 0.15529507419201843, + "grad_norm": 7.294323444366455, + "learning_rate": 9.789034157026491e-05, + "loss": 0.8846, + "step": 2292 + }, + { + "epoch": 0.15536282945999053, + "grad_norm": 10.6463041305542, + "learning_rate": 9.788897255116709e-05, + "loss": 1.1397, + "step": 2293 + }, + { + "epoch": 0.1554305847279626, + "grad_norm": 9.412870407104492, + "learning_rate": 9.788760353206927e-05, + "loss": 1.1857, + "step": 2294 + }, + { + "epoch": 0.15549833999593468, + "grad_norm": 9.163009643554688, + "learning_rate": 9.788623451297147e-05, + "loss": 0.8907, + "step": 2295 + }, + { + "epoch": 0.15556609526390677, + "grad_norm": 8.157798767089844, + "learning_rate": 9.788486549387365e-05, + "loss": 0.893, + "step": 2296 + }, + { + "epoch": 0.15563385053187886, + "grad_norm": 9.155460357666016, + "learning_rate": 9.788349647477583e-05, + "loss": 1.0361, + "step": 2297 + }, + { + "epoch": 0.15570160579985093, + "grad_norm": 10.280989646911621, + "learning_rate": 9.7882127455678e-05, + "loss": 1.2953, + "step": 2298 + }, + { + "epoch": 0.15576936106782302, + "grad_norm": 9.654706954956055, + "learning_rate": 9.78807584365802e-05, + "loss": 0.9012, + "step": 2299 + }, + { + "epoch": 0.1558371163357951, + "grad_norm": 12.425939559936523, + "learning_rate": 9.787938941748238e-05, + "loss": 1.4103, + "step": 2300 + }, + { + "epoch": 0.1559048716037672, + "grad_norm": 9.888884544372559, + "learning_rate": 9.787802039838456e-05, + "loss": 1.1599, + "step": 2301 + }, + { + "epoch": 0.15597262687173927, + "grad_norm": 10.30229377746582, + "learning_rate": 9.787665137928674e-05, + "loss": 1.0505, + "step": 2302 + }, + { + "epoch": 0.15604038213971136, + "grad_norm": 11.208477973937988, + "learning_rate": 9.787528236018892e-05, + "loss": 1.009, + "step": 2303 + }, + { + "epoch": 0.15610813740768345, + "grad_norm": 9.264431953430176, + "learning_rate": 9.787391334109112e-05, + "loss": 1.1453, + "step": 2304 + }, + { + "epoch": 0.15617589267565554, + "grad_norm": 9.0980224609375, + "learning_rate": 9.78725443219933e-05, + "loss": 1.0824, + "step": 2305 + }, + { + "epoch": 0.1562436479436276, + "grad_norm": 8.346585273742676, + "learning_rate": 9.787117530289548e-05, + "loss": 1.0632, + "step": 2306 + }, + { + "epoch": 0.1563114032115997, + "grad_norm": 10.607507705688477, + "learning_rate": 9.786980628379766e-05, + "loss": 1.0102, + "step": 2307 + }, + { + "epoch": 0.1563791584795718, + "grad_norm": 8.189208984375, + "learning_rate": 9.786843726469985e-05, + "loss": 1.0872, + "step": 2308 + }, + { + "epoch": 0.15644691374754388, + "grad_norm": 9.84202766418457, + "learning_rate": 9.786706824560203e-05, + "loss": 0.9871, + "step": 2309 + }, + { + "epoch": 0.15651466901551594, + "grad_norm": 11.937589645385742, + "learning_rate": 9.786569922650421e-05, + "loss": 1.2255, + "step": 2310 + }, + { + "epoch": 0.15658242428348804, + "grad_norm": 9.855016708374023, + "learning_rate": 9.786433020740639e-05, + "loss": 1.0097, + "step": 2311 + }, + { + "epoch": 0.15665017955146013, + "grad_norm": 8.661060333251953, + "learning_rate": 9.786296118830857e-05, + "loss": 0.8732, + "step": 2312 + }, + { + "epoch": 0.15671793481943222, + "grad_norm": 10.57170295715332, + "learning_rate": 9.786159216921077e-05, + "loss": 0.862, + "step": 2313 + }, + { + "epoch": 0.15678569008740428, + "grad_norm": 7.759045600891113, + "learning_rate": 9.786022315011295e-05, + "loss": 0.77, + "step": 2314 + }, + { + "epoch": 0.15685344535537638, + "grad_norm": 10.758045196533203, + "learning_rate": 9.785885413101513e-05, + "loss": 0.8919, + "step": 2315 + }, + { + "epoch": 0.15692120062334847, + "grad_norm": 8.521660804748535, + "learning_rate": 9.78574851119173e-05, + "loss": 1.131, + "step": 2316 + }, + { + "epoch": 0.15698895589132056, + "grad_norm": 8.72917652130127, + "learning_rate": 9.78561160928195e-05, + "loss": 0.8359, + "step": 2317 + }, + { + "epoch": 0.15705671115929262, + "grad_norm": 11.679365158081055, + "learning_rate": 9.785474707372168e-05, + "loss": 1.0078, + "step": 2318 + }, + { + "epoch": 0.15712446642726471, + "grad_norm": 11.50632381439209, + "learning_rate": 9.785337805462386e-05, + "loss": 1.208, + "step": 2319 + }, + { + "epoch": 0.1571922216952368, + "grad_norm": 9.6107759475708, + "learning_rate": 9.785200903552605e-05, + "loss": 1.0967, + "step": 2320 + }, + { + "epoch": 0.1572599769632089, + "grad_norm": 8.629117012023926, + "learning_rate": 9.785064001642824e-05, + "loss": 1.0594, + "step": 2321 + }, + { + "epoch": 0.15732773223118096, + "grad_norm": 11.136920928955078, + "learning_rate": 9.784927099733042e-05, + "loss": 1.2874, + "step": 2322 + }, + { + "epoch": 0.15739548749915305, + "grad_norm": 11.097023963928223, + "learning_rate": 9.784790197823261e-05, + "loss": 1.1598, + "step": 2323 + }, + { + "epoch": 0.15746324276712514, + "grad_norm": 11.117433547973633, + "learning_rate": 9.784653295913479e-05, + "loss": 1.0601, + "step": 2324 + }, + { + "epoch": 0.15753099803509724, + "grad_norm": 10.152684211730957, + "learning_rate": 9.784516394003697e-05, + "loss": 1.2763, + "step": 2325 + }, + { + "epoch": 0.1575987533030693, + "grad_norm": 8.37531852722168, + "learning_rate": 9.784379492093915e-05, + "loss": 1.1055, + "step": 2326 + }, + { + "epoch": 0.1576665085710414, + "grad_norm": 10.463939666748047, + "learning_rate": 9.784242590184134e-05, + "loss": 1.3088, + "step": 2327 + }, + { + "epoch": 0.15773426383901348, + "grad_norm": 8.74315357208252, + "learning_rate": 9.784105688274352e-05, + "loss": 1.1165, + "step": 2328 + }, + { + "epoch": 0.15780201910698557, + "grad_norm": 8.691280364990234, + "learning_rate": 9.78396878636457e-05, + "loss": 1.1019, + "step": 2329 + }, + { + "epoch": 0.15786977437495767, + "grad_norm": 10.424938201904297, + "learning_rate": 9.783831884454789e-05, + "loss": 1.1957, + "step": 2330 + }, + { + "epoch": 0.15793752964292973, + "grad_norm": 6.867722034454346, + "learning_rate": 9.783694982545008e-05, + "loss": 1.0476, + "step": 2331 + }, + { + "epoch": 0.15800528491090182, + "grad_norm": 9.434804916381836, + "learning_rate": 9.783558080635226e-05, + "loss": 1.0247, + "step": 2332 + }, + { + "epoch": 0.1580730401788739, + "grad_norm": 7.771170616149902, + "learning_rate": 9.783421178725444e-05, + "loss": 1.2582, + "step": 2333 + }, + { + "epoch": 0.158140795446846, + "grad_norm": 8.366608619689941, + "learning_rate": 9.783284276815662e-05, + "loss": 1.1078, + "step": 2334 + }, + { + "epoch": 0.15820855071481807, + "grad_norm": 8.85851764678955, + "learning_rate": 9.78314737490588e-05, + "loss": 1.2405, + "step": 2335 + }, + { + "epoch": 0.15827630598279016, + "grad_norm": 9.134325981140137, + "learning_rate": 9.7830104729961e-05, + "loss": 1.2277, + "step": 2336 + }, + { + "epoch": 0.15834406125076225, + "grad_norm": 9.150130271911621, + "learning_rate": 9.782873571086317e-05, + "loss": 1.1355, + "step": 2337 + }, + { + "epoch": 0.15841181651873434, + "grad_norm": 8.687469482421875, + "learning_rate": 9.782736669176536e-05, + "loss": 1.0454, + "step": 2338 + }, + { + "epoch": 0.1584795717867064, + "grad_norm": 10.071285247802734, + "learning_rate": 9.782599767266754e-05, + "loss": 1.0041, + "step": 2339 + }, + { + "epoch": 0.1585473270546785, + "grad_norm": 8.373369216918945, + "learning_rate": 9.782462865356972e-05, + "loss": 0.8378, + "step": 2340 + }, + { + "epoch": 0.1586150823226506, + "grad_norm": 9.427014350891113, + "learning_rate": 9.782325963447191e-05, + "loss": 1.1413, + "step": 2341 + }, + { + "epoch": 0.15868283759062268, + "grad_norm": 8.38814640045166, + "learning_rate": 9.782189061537409e-05, + "loss": 1.0696, + "step": 2342 + }, + { + "epoch": 0.15875059285859475, + "grad_norm": 8.518644332885742, + "learning_rate": 9.782052159627627e-05, + "loss": 0.9814, + "step": 2343 + }, + { + "epoch": 0.15881834812656684, + "grad_norm": 9.926252365112305, + "learning_rate": 9.781915257717845e-05, + "loss": 1.1971, + "step": 2344 + }, + { + "epoch": 0.15888610339453893, + "grad_norm": 7.946019649505615, + "learning_rate": 9.781778355808064e-05, + "loss": 1.1902, + "step": 2345 + }, + { + "epoch": 0.15895385866251102, + "grad_norm": 8.686945915222168, + "learning_rate": 9.781641453898283e-05, + "loss": 0.9939, + "step": 2346 + }, + { + "epoch": 0.15902161393048309, + "grad_norm": 8.225680351257324, + "learning_rate": 9.7815045519885e-05, + "loss": 1.3665, + "step": 2347 + }, + { + "epoch": 0.15908936919845518, + "grad_norm": 10.381987571716309, + "learning_rate": 9.781367650078719e-05, + "loss": 1.09, + "step": 2348 + }, + { + "epoch": 0.15915712446642727, + "grad_norm": 8.57552719116211, + "learning_rate": 9.781230748168937e-05, + "loss": 0.8261, + "step": 2349 + }, + { + "epoch": 0.15922487973439936, + "grad_norm": 9.105220794677734, + "learning_rate": 9.781093846259156e-05, + "loss": 1.102, + "step": 2350 + }, + { + "epoch": 0.15929263500237142, + "grad_norm": 10.08092212677002, + "learning_rate": 9.780956944349374e-05, + "loss": 0.943, + "step": 2351 + }, + { + "epoch": 0.15936039027034352, + "grad_norm": 10.259852409362793, + "learning_rate": 9.780820042439592e-05, + "loss": 0.8822, + "step": 2352 + }, + { + "epoch": 0.1594281455383156, + "grad_norm": 8.31139087677002, + "learning_rate": 9.78068314052981e-05, + "loss": 1.2167, + "step": 2353 + }, + { + "epoch": 0.1594959008062877, + "grad_norm": 7.529703617095947, + "learning_rate": 9.78054623862003e-05, + "loss": 0.8913, + "step": 2354 + }, + { + "epoch": 0.15956365607425976, + "grad_norm": 8.792675971984863, + "learning_rate": 9.780409336710248e-05, + "loss": 0.966, + "step": 2355 + }, + { + "epoch": 0.15963141134223185, + "grad_norm": 9.329866409301758, + "learning_rate": 9.780272434800466e-05, + "loss": 1.1659, + "step": 2356 + }, + { + "epoch": 0.15969916661020395, + "grad_norm": 12.14089298248291, + "learning_rate": 9.780135532890684e-05, + "loss": 1.2019, + "step": 2357 + }, + { + "epoch": 0.15976692187817604, + "grad_norm": 9.12912654876709, + "learning_rate": 9.779998630980902e-05, + "loss": 1.1615, + "step": 2358 + }, + { + "epoch": 0.1598346771461481, + "grad_norm": 9.554464340209961, + "learning_rate": 9.779861729071121e-05, + "loss": 1.1695, + "step": 2359 + }, + { + "epoch": 0.1599024324141202, + "grad_norm": 9.317673683166504, + "learning_rate": 9.779724827161339e-05, + "loss": 0.9606, + "step": 2360 + }, + { + "epoch": 0.15997018768209229, + "grad_norm": 8.80395793914795, + "learning_rate": 9.779587925251557e-05, + "loss": 0.9693, + "step": 2361 + }, + { + "epoch": 0.16003794295006438, + "grad_norm": 11.990642547607422, + "learning_rate": 9.779451023341775e-05, + "loss": 1.2901, + "step": 2362 + }, + { + "epoch": 0.16010569821803644, + "grad_norm": 8.188547134399414, + "learning_rate": 9.779314121431995e-05, + "loss": 0.979, + "step": 2363 + }, + { + "epoch": 0.16017345348600853, + "grad_norm": 8.316620826721191, + "learning_rate": 9.779177219522213e-05, + "loss": 0.8601, + "step": 2364 + }, + { + "epoch": 0.16024120875398062, + "grad_norm": 7.58405876159668, + "learning_rate": 9.77904031761243e-05, + "loss": 1.1812, + "step": 2365 + }, + { + "epoch": 0.16030896402195272, + "grad_norm": 7.725598335266113, + "learning_rate": 9.77890341570265e-05, + "loss": 0.9335, + "step": 2366 + }, + { + "epoch": 0.16037671928992478, + "grad_norm": 8.6231107711792, + "learning_rate": 9.778766513792868e-05, + "loss": 1.0162, + "step": 2367 + }, + { + "epoch": 0.16044447455789687, + "grad_norm": 9.762526512145996, + "learning_rate": 9.778629611883086e-05, + "loss": 1.3186, + "step": 2368 + }, + { + "epoch": 0.16051222982586896, + "grad_norm": 11.384220123291016, + "learning_rate": 9.778492709973305e-05, + "loss": 1.2397, + "step": 2369 + }, + { + "epoch": 0.16057998509384105, + "grad_norm": 8.841899871826172, + "learning_rate": 9.778355808063523e-05, + "loss": 0.8708, + "step": 2370 + }, + { + "epoch": 0.16064774036181312, + "grad_norm": 7.778527736663818, + "learning_rate": 9.778218906153741e-05, + "loss": 0.9968, + "step": 2371 + }, + { + "epoch": 0.1607154956297852, + "grad_norm": 8.559181213378906, + "learning_rate": 9.77808200424396e-05, + "loss": 0.9759, + "step": 2372 + }, + { + "epoch": 0.1607832508977573, + "grad_norm": 10.273273468017578, + "learning_rate": 9.777945102334179e-05, + "loss": 0.9485, + "step": 2373 + }, + { + "epoch": 0.1608510061657294, + "grad_norm": 7.946044445037842, + "learning_rate": 9.777808200424397e-05, + "loss": 1.105, + "step": 2374 + }, + { + "epoch": 0.16091876143370146, + "grad_norm": 9.917662620544434, + "learning_rate": 9.777671298514615e-05, + "loss": 1.0272, + "step": 2375 + }, + { + "epoch": 0.16098651670167355, + "grad_norm": 10.438239097595215, + "learning_rate": 9.777534396604833e-05, + "loss": 1.0197, + "step": 2376 + }, + { + "epoch": 0.16105427196964564, + "grad_norm": 8.799901962280273, + "learning_rate": 9.777397494695052e-05, + "loss": 1.1401, + "step": 2377 + }, + { + "epoch": 0.16112202723761773, + "grad_norm": 8.569243431091309, + "learning_rate": 9.77726059278527e-05, + "loss": 1.2019, + "step": 2378 + }, + { + "epoch": 0.1611897825055898, + "grad_norm": 10.793002128601074, + "learning_rate": 9.777123690875488e-05, + "loss": 1.0932, + "step": 2379 + }, + { + "epoch": 0.1612575377735619, + "grad_norm": 7.825850963592529, + "learning_rate": 9.776986788965707e-05, + "loss": 1.2226, + "step": 2380 + }, + { + "epoch": 0.16132529304153398, + "grad_norm": 8.780813217163086, + "learning_rate": 9.776849887055925e-05, + "loss": 0.7939, + "step": 2381 + }, + { + "epoch": 0.16139304830950607, + "grad_norm": 8.927638053894043, + "learning_rate": 9.776712985146144e-05, + "loss": 1.0551, + "step": 2382 + }, + { + "epoch": 0.16146080357747816, + "grad_norm": 9.08043384552002, + "learning_rate": 9.776576083236362e-05, + "loss": 1.0037, + "step": 2383 + }, + { + "epoch": 0.16152855884545023, + "grad_norm": 9.362268447875977, + "learning_rate": 9.77643918132658e-05, + "loss": 1.332, + "step": 2384 + }, + { + "epoch": 0.16159631411342232, + "grad_norm": 10.533197402954102, + "learning_rate": 9.776302279416798e-05, + "loss": 1.0966, + "step": 2385 + }, + { + "epoch": 0.1616640693813944, + "grad_norm": 9.579266548156738, + "learning_rate": 9.776165377507017e-05, + "loss": 0.9325, + "step": 2386 + }, + { + "epoch": 0.1617318246493665, + "grad_norm": 10.071297645568848, + "learning_rate": 9.776028475597235e-05, + "loss": 0.9496, + "step": 2387 + }, + { + "epoch": 0.16179957991733857, + "grad_norm": 9.243900299072266, + "learning_rate": 9.775891573687453e-05, + "loss": 1.13, + "step": 2388 + }, + { + "epoch": 0.16186733518531066, + "grad_norm": 9.327018737792969, + "learning_rate": 9.775754671777672e-05, + "loss": 1.2587, + "step": 2389 + }, + { + "epoch": 0.16193509045328275, + "grad_norm": 7.614950180053711, + "learning_rate": 9.77561776986789e-05, + "loss": 1.0848, + "step": 2390 + }, + { + "epoch": 0.16200284572125484, + "grad_norm": 9.86501407623291, + "learning_rate": 9.775480867958109e-05, + "loss": 1.2504, + "step": 2391 + }, + { + "epoch": 0.1620706009892269, + "grad_norm": 9.08303451538086, + "learning_rate": 9.775343966048327e-05, + "loss": 1.4359, + "step": 2392 + }, + { + "epoch": 0.162138356257199, + "grad_norm": 8.417489051818848, + "learning_rate": 9.775207064138545e-05, + "loss": 1.244, + "step": 2393 + }, + { + "epoch": 0.1622061115251711, + "grad_norm": 8.35366439819336, + "learning_rate": 9.775070162228763e-05, + "loss": 1.0706, + "step": 2394 + }, + { + "epoch": 0.16227386679314318, + "grad_norm": 9.732915878295898, + "learning_rate": 9.774933260318981e-05, + "loss": 1.4237, + "step": 2395 + }, + { + "epoch": 0.16234162206111524, + "grad_norm": 9.131365776062012, + "learning_rate": 9.7747963584092e-05, + "loss": 0.9207, + "step": 2396 + }, + { + "epoch": 0.16240937732908733, + "grad_norm": 8.93538761138916, + "learning_rate": 9.774659456499419e-05, + "loss": 1.0054, + "step": 2397 + }, + { + "epoch": 0.16247713259705943, + "grad_norm": 8.939055442810059, + "learning_rate": 9.774522554589637e-05, + "loss": 0.9782, + "step": 2398 + }, + { + "epoch": 0.16254488786503152, + "grad_norm": 9.251758575439453, + "learning_rate": 9.774385652679855e-05, + "loss": 1.0968, + "step": 2399 + }, + { + "epoch": 0.16261264313300358, + "grad_norm": 9.240782737731934, + "learning_rate": 9.774248750770074e-05, + "loss": 0.7982, + "step": 2400 + }, + { + "epoch": 0.16268039840097567, + "grad_norm": 8.697726249694824, + "learning_rate": 9.774111848860292e-05, + "loss": 1.0545, + "step": 2401 + }, + { + "epoch": 0.16274815366894776, + "grad_norm": 7.9362053871154785, + "learning_rate": 9.77397494695051e-05, + "loss": 0.8364, + "step": 2402 + }, + { + "epoch": 0.16281590893691986, + "grad_norm": 11.944025993347168, + "learning_rate": 9.773838045040728e-05, + "loss": 1.2313, + "step": 2403 + }, + { + "epoch": 0.16288366420489192, + "grad_norm": 9.496225357055664, + "learning_rate": 9.773701143130946e-05, + "loss": 1.1363, + "step": 2404 + }, + { + "epoch": 0.162951419472864, + "grad_norm": 12.96069622039795, + "learning_rate": 9.773564241221165e-05, + "loss": 1.1205, + "step": 2405 + }, + { + "epoch": 0.1630191747408361, + "grad_norm": 8.915671348571777, + "learning_rate": 9.773427339311384e-05, + "loss": 1.1149, + "step": 2406 + }, + { + "epoch": 0.1630869300088082, + "grad_norm": 10.23763370513916, + "learning_rate": 9.773290437401602e-05, + "loss": 1.1867, + "step": 2407 + }, + { + "epoch": 0.16315468527678026, + "grad_norm": 9.117730140686035, + "learning_rate": 9.77315353549182e-05, + "loss": 1.1792, + "step": 2408 + }, + { + "epoch": 0.16322244054475235, + "grad_norm": 9.380385398864746, + "learning_rate": 9.773016633582039e-05, + "loss": 1.1621, + "step": 2409 + }, + { + "epoch": 0.16329019581272444, + "grad_norm": 8.56508731842041, + "learning_rate": 9.772879731672257e-05, + "loss": 0.8372, + "step": 2410 + }, + { + "epoch": 0.16335795108069653, + "grad_norm": 11.707832336425781, + "learning_rate": 9.772742829762475e-05, + "loss": 1.3191, + "step": 2411 + }, + { + "epoch": 0.1634257063486686, + "grad_norm": 7.720577716827393, + "learning_rate": 9.772605927852694e-05, + "loss": 0.9617, + "step": 2412 + }, + { + "epoch": 0.1634934616166407, + "grad_norm": 8.586542129516602, + "learning_rate": 9.772469025942912e-05, + "loss": 1.0978, + "step": 2413 + }, + { + "epoch": 0.16356121688461278, + "grad_norm": 9.021394729614258, + "learning_rate": 9.77233212403313e-05, + "loss": 0.9358, + "step": 2414 + }, + { + "epoch": 0.16362897215258487, + "grad_norm": 9.078686714172363, + "learning_rate": 9.77219522212335e-05, + "loss": 1.3338, + "step": 2415 + }, + { + "epoch": 0.16369672742055694, + "grad_norm": 9.810312271118164, + "learning_rate": 9.772058320213568e-05, + "loss": 1.2472, + "step": 2416 + }, + { + "epoch": 0.16376448268852903, + "grad_norm": 8.933609962463379, + "learning_rate": 9.771921418303786e-05, + "loss": 0.9115, + "step": 2417 + }, + { + "epoch": 0.16383223795650112, + "grad_norm": 7.044286251068115, + "learning_rate": 9.771784516394005e-05, + "loss": 0.7983, + "step": 2418 + }, + { + "epoch": 0.1638999932244732, + "grad_norm": 11.711495399475098, + "learning_rate": 9.771647614484223e-05, + "loss": 1.096, + "step": 2419 + }, + { + "epoch": 0.16396774849244528, + "grad_norm": 9.31049633026123, + "learning_rate": 9.771510712574441e-05, + "loss": 1.2711, + "step": 2420 + }, + { + "epoch": 0.16403550376041737, + "grad_norm": 8.10503101348877, + "learning_rate": 9.77137381066466e-05, + "loss": 0.9095, + "step": 2421 + }, + { + "epoch": 0.16410325902838946, + "grad_norm": 7.915055751800537, + "learning_rate": 9.771236908754877e-05, + "loss": 1.0161, + "step": 2422 + }, + { + "epoch": 0.16417101429636155, + "grad_norm": 8.185515403747559, + "learning_rate": 9.771100006845097e-05, + "loss": 1.1109, + "step": 2423 + }, + { + "epoch": 0.16423876956433361, + "grad_norm": 9.960200309753418, + "learning_rate": 9.770963104935315e-05, + "loss": 0.9757, + "step": 2424 + }, + { + "epoch": 0.1643065248323057, + "grad_norm": 9.646814346313477, + "learning_rate": 9.770826203025533e-05, + "loss": 1.0835, + "step": 2425 + }, + { + "epoch": 0.1643742801002778, + "grad_norm": 9.701393127441406, + "learning_rate": 9.770689301115751e-05, + "loss": 1.0717, + "step": 2426 + }, + { + "epoch": 0.1644420353682499, + "grad_norm": 7.887824058532715, + "learning_rate": 9.770552399205969e-05, + "loss": 1.0937, + "step": 2427 + }, + { + "epoch": 0.16450979063622195, + "grad_norm": 10.94339370727539, + "learning_rate": 9.770415497296188e-05, + "loss": 1.2766, + "step": 2428 + }, + { + "epoch": 0.16457754590419404, + "grad_norm": 10.051490783691406, + "learning_rate": 9.770278595386406e-05, + "loss": 1.248, + "step": 2429 + }, + { + "epoch": 0.16464530117216614, + "grad_norm": 8.380006790161133, + "learning_rate": 9.770141693476624e-05, + "loss": 1.1657, + "step": 2430 + }, + { + "epoch": 0.16471305644013823, + "grad_norm": 8.077753067016602, + "learning_rate": 9.770004791566843e-05, + "loss": 0.9511, + "step": 2431 + }, + { + "epoch": 0.1647808117081103, + "grad_norm": 8.744999885559082, + "learning_rate": 9.769867889657062e-05, + "loss": 1.0736, + "step": 2432 + }, + { + "epoch": 0.16484856697608238, + "grad_norm": 8.203909873962402, + "learning_rate": 9.76973098774728e-05, + "loss": 1.208, + "step": 2433 + }, + { + "epoch": 0.16491632224405448, + "grad_norm": 9.462398529052734, + "learning_rate": 9.769594085837498e-05, + "loss": 1.0011, + "step": 2434 + }, + { + "epoch": 0.16498407751202657, + "grad_norm": 11.190359115600586, + "learning_rate": 9.769457183927716e-05, + "loss": 1.139, + "step": 2435 + }, + { + "epoch": 0.16505183277999866, + "grad_norm": 10.454118728637695, + "learning_rate": 9.769320282017934e-05, + "loss": 1.0799, + "step": 2436 + }, + { + "epoch": 0.16511958804797072, + "grad_norm": 14.411054611206055, + "learning_rate": 9.769183380108153e-05, + "loss": 1.0369, + "step": 2437 + }, + { + "epoch": 0.16518734331594281, + "grad_norm": 11.42679214477539, + "learning_rate": 9.769046478198371e-05, + "loss": 1.2486, + "step": 2438 + }, + { + "epoch": 0.1652550985839149, + "grad_norm": 10.520325660705566, + "learning_rate": 9.76890957628859e-05, + "loss": 1.2355, + "step": 2439 + }, + { + "epoch": 0.165322853851887, + "grad_norm": 7.958998680114746, + "learning_rate": 9.768772674378808e-05, + "loss": 1.0822, + "step": 2440 + }, + { + "epoch": 0.16539060911985906, + "grad_norm": 8.649806022644043, + "learning_rate": 9.768635772469027e-05, + "loss": 0.9439, + "step": 2441 + }, + { + "epoch": 0.16545836438783115, + "grad_norm": 8.539712905883789, + "learning_rate": 9.768498870559245e-05, + "loss": 0.9541, + "step": 2442 + }, + { + "epoch": 0.16552611965580324, + "grad_norm": 11.40131950378418, + "learning_rate": 9.768361968649463e-05, + "loss": 1.0568, + "step": 2443 + }, + { + "epoch": 0.16559387492377534, + "grad_norm": 8.333579063415527, + "learning_rate": 9.768225066739681e-05, + "loss": 0.924, + "step": 2444 + }, + { + "epoch": 0.1656616301917474, + "grad_norm": 9.02564525604248, + "learning_rate": 9.768088164829899e-05, + "loss": 1.1901, + "step": 2445 + }, + { + "epoch": 0.1657293854597195, + "grad_norm": 9.721306800842285, + "learning_rate": 9.767951262920118e-05, + "loss": 1.2313, + "step": 2446 + }, + { + "epoch": 0.16579714072769158, + "grad_norm": 9.876781463623047, + "learning_rate": 9.767814361010336e-05, + "loss": 1.0165, + "step": 2447 + }, + { + "epoch": 0.16586489599566367, + "grad_norm": 11.69865894317627, + "learning_rate": 9.767677459100555e-05, + "loss": 1.1301, + "step": 2448 + }, + { + "epoch": 0.16593265126363574, + "grad_norm": 11.38391399383545, + "learning_rate": 9.767540557190773e-05, + "loss": 1.1522, + "step": 2449 + }, + { + "epoch": 0.16600040653160783, + "grad_norm": 9.18020248413086, + "learning_rate": 9.76740365528099e-05, + "loss": 1.0515, + "step": 2450 + }, + { + "epoch": 0.16606816179957992, + "grad_norm": 9.30802059173584, + "learning_rate": 9.76726675337121e-05, + "loss": 1.0789, + "step": 2451 + }, + { + "epoch": 0.166135917067552, + "grad_norm": 9.58259391784668, + "learning_rate": 9.767129851461428e-05, + "loss": 1.1299, + "step": 2452 + }, + { + "epoch": 0.16620367233552408, + "grad_norm": 9.137594223022461, + "learning_rate": 9.766992949551646e-05, + "loss": 1.0854, + "step": 2453 + }, + { + "epoch": 0.16627142760349617, + "grad_norm": 12.410299301147461, + "learning_rate": 9.766856047641864e-05, + "loss": 1.1717, + "step": 2454 + }, + { + "epoch": 0.16633918287146826, + "grad_norm": 9.016322135925293, + "learning_rate": 9.766719145732083e-05, + "loss": 1.1028, + "step": 2455 + }, + { + "epoch": 0.16640693813944035, + "grad_norm": 10.166184425354004, + "learning_rate": 9.766582243822301e-05, + "loss": 1.0626, + "step": 2456 + }, + { + "epoch": 0.16647469340741242, + "grad_norm": 9.030965805053711, + "learning_rate": 9.76644534191252e-05, + "loss": 1.1761, + "step": 2457 + }, + { + "epoch": 0.1665424486753845, + "grad_norm": 9.294576644897461, + "learning_rate": 9.766308440002739e-05, + "loss": 0.943, + "step": 2458 + }, + { + "epoch": 0.1666102039433566, + "grad_norm": 6.732856273651123, + "learning_rate": 9.766171538092957e-05, + "loss": 0.7342, + "step": 2459 + }, + { + "epoch": 0.1666779592113287, + "grad_norm": 8.178942680358887, + "learning_rate": 9.766034636183175e-05, + "loss": 0.7889, + "step": 2460 + }, + { + "epoch": 0.16674571447930076, + "grad_norm": 7.771929740905762, + "learning_rate": 9.765897734273394e-05, + "loss": 1.1129, + "step": 2461 + }, + { + "epoch": 0.16681346974727285, + "grad_norm": 8.695874214172363, + "learning_rate": 9.765760832363612e-05, + "loss": 1.1968, + "step": 2462 + }, + { + "epoch": 0.16688122501524494, + "grad_norm": 10.18800163269043, + "learning_rate": 9.76562393045383e-05, + "loss": 1.0835, + "step": 2463 + }, + { + "epoch": 0.16694898028321703, + "grad_norm": 9.310625076293945, + "learning_rate": 9.76548702854405e-05, + "loss": 1.2042, + "step": 2464 + }, + { + "epoch": 0.1670167355511891, + "grad_norm": 11.684195518493652, + "learning_rate": 9.765350126634268e-05, + "loss": 1.2518, + "step": 2465 + }, + { + "epoch": 0.16708449081916119, + "grad_norm": 10.9056978225708, + "learning_rate": 9.765213224724486e-05, + "loss": 1.2465, + "step": 2466 + }, + { + "epoch": 0.16715224608713328, + "grad_norm": 9.871830940246582, + "learning_rate": 9.765076322814704e-05, + "loss": 1.3192, + "step": 2467 + }, + { + "epoch": 0.16722000135510537, + "grad_norm": 7.725397109985352, + "learning_rate": 9.764939420904922e-05, + "loss": 0.9082, + "step": 2468 + }, + { + "epoch": 0.16728775662307743, + "grad_norm": 8.7415189743042, + "learning_rate": 9.764802518995141e-05, + "loss": 0.8569, + "step": 2469 + }, + { + "epoch": 0.16735551189104952, + "grad_norm": 7.8990888595581055, + "learning_rate": 9.76466561708536e-05, + "loss": 1.0175, + "step": 2470 + }, + { + "epoch": 0.16742326715902162, + "grad_norm": 8.688074111938477, + "learning_rate": 9.764528715175577e-05, + "loss": 1.0131, + "step": 2471 + }, + { + "epoch": 0.1674910224269937, + "grad_norm": 7.902133941650391, + "learning_rate": 9.764391813265795e-05, + "loss": 0.8956, + "step": 2472 + }, + { + "epoch": 0.16755877769496577, + "grad_norm": 9.179505348205566, + "learning_rate": 9.764254911356013e-05, + "loss": 1.175, + "step": 2473 + }, + { + "epoch": 0.16762653296293786, + "grad_norm": 10.701058387756348, + "learning_rate": 9.764118009446233e-05, + "loss": 1.003, + "step": 2474 + }, + { + "epoch": 0.16769428823090995, + "grad_norm": 8.76916217803955, + "learning_rate": 9.763981107536451e-05, + "loss": 1.102, + "step": 2475 + }, + { + "epoch": 0.16776204349888205, + "grad_norm": 8.682199478149414, + "learning_rate": 9.763844205626669e-05, + "loss": 1.2042, + "step": 2476 + }, + { + "epoch": 0.1678297987668541, + "grad_norm": 9.789544105529785, + "learning_rate": 9.763707303716887e-05, + "loss": 1.2469, + "step": 2477 + }, + { + "epoch": 0.1678975540348262, + "grad_norm": 8.250391960144043, + "learning_rate": 9.763570401807106e-05, + "loss": 0.9742, + "step": 2478 + }, + { + "epoch": 0.1679653093027983, + "grad_norm": 8.938610076904297, + "learning_rate": 9.763433499897324e-05, + "loss": 0.9968, + "step": 2479 + }, + { + "epoch": 0.16803306457077039, + "grad_norm": 9.956622123718262, + "learning_rate": 9.763296597987542e-05, + "loss": 1.0464, + "step": 2480 + }, + { + "epoch": 0.16810081983874245, + "grad_norm": 8.582858085632324, + "learning_rate": 9.76315969607776e-05, + "loss": 1.1915, + "step": 2481 + }, + { + "epoch": 0.16816857510671454, + "grad_norm": 7.73312520980835, + "learning_rate": 9.763022794167979e-05, + "loss": 0.8506, + "step": 2482 + }, + { + "epoch": 0.16823633037468663, + "grad_norm": 8.986891746520996, + "learning_rate": 9.762885892258198e-05, + "loss": 0.8959, + "step": 2483 + }, + { + "epoch": 0.16830408564265872, + "grad_norm": 10.047099113464355, + "learning_rate": 9.762748990348416e-05, + "loss": 1.24, + "step": 2484 + }, + { + "epoch": 0.1683718409106308, + "grad_norm": 8.517911911010742, + "learning_rate": 9.762612088438634e-05, + "loss": 0.8171, + "step": 2485 + }, + { + "epoch": 0.16843959617860288, + "grad_norm": 9.586174964904785, + "learning_rate": 9.762475186528852e-05, + "loss": 1.1603, + "step": 2486 + }, + { + "epoch": 0.16850735144657497, + "grad_norm": 9.85086441040039, + "learning_rate": 9.762338284619071e-05, + "loss": 0.9157, + "step": 2487 + }, + { + "epoch": 0.16857510671454706, + "grad_norm": 12.023639678955078, + "learning_rate": 9.76220138270929e-05, + "loss": 1.2032, + "step": 2488 + }, + { + "epoch": 0.16864286198251915, + "grad_norm": 8.932641983032227, + "learning_rate": 9.762064480799507e-05, + "loss": 1.0678, + "step": 2489 + }, + { + "epoch": 0.16871061725049122, + "grad_norm": 10.568282127380371, + "learning_rate": 9.761927578889725e-05, + "loss": 1.3213, + "step": 2490 + }, + { + "epoch": 0.1687783725184633, + "grad_norm": 9.116564750671387, + "learning_rate": 9.761790676979944e-05, + "loss": 1.0525, + "step": 2491 + }, + { + "epoch": 0.1688461277864354, + "grad_norm": 8.192644119262695, + "learning_rate": 9.761653775070163e-05, + "loss": 0.9554, + "step": 2492 + }, + { + "epoch": 0.1689138830544075, + "grad_norm": 9.146562576293945, + "learning_rate": 9.761516873160381e-05, + "loss": 0.8852, + "step": 2493 + }, + { + "epoch": 0.16898163832237956, + "grad_norm": 8.82610034942627, + "learning_rate": 9.761379971250599e-05, + "loss": 1.0521, + "step": 2494 + }, + { + "epoch": 0.16904939359035165, + "grad_norm": 9.051412582397461, + "learning_rate": 9.761243069340817e-05, + "loss": 0.913, + "step": 2495 + }, + { + "epoch": 0.16911714885832374, + "grad_norm": 8.97696304321289, + "learning_rate": 9.761106167431036e-05, + "loss": 0.887, + "step": 2496 + }, + { + "epoch": 0.16918490412629583, + "grad_norm": 10.083110809326172, + "learning_rate": 9.760969265521254e-05, + "loss": 1.3521, + "step": 2497 + }, + { + "epoch": 0.1692526593942679, + "grad_norm": 7.585256099700928, + "learning_rate": 9.760832363611472e-05, + "loss": 0.9001, + "step": 2498 + }, + { + "epoch": 0.16932041466224, + "grad_norm": 10.301995277404785, + "learning_rate": 9.76069546170169e-05, + "loss": 1.0915, + "step": 2499 + }, + { + "epoch": 0.16938816993021208, + "grad_norm": 8.517580032348633, + "learning_rate": 9.760558559791909e-05, + "loss": 1.0097, + "step": 2500 + }, + { + "epoch": 0.16945592519818417, + "grad_norm": 8.20002555847168, + "learning_rate": 9.760421657882128e-05, + "loss": 1.2988, + "step": 2501 + }, + { + "epoch": 0.16952368046615623, + "grad_norm": 9.705124855041504, + "learning_rate": 9.760284755972346e-05, + "loss": 1.2661, + "step": 2502 + }, + { + "epoch": 0.16959143573412833, + "grad_norm": 9.167060852050781, + "learning_rate": 9.760147854062564e-05, + "loss": 1.0556, + "step": 2503 + }, + { + "epoch": 0.16965919100210042, + "grad_norm": 8.415916442871094, + "learning_rate": 9.760010952152783e-05, + "loss": 0.7964, + "step": 2504 + }, + { + "epoch": 0.1697269462700725, + "grad_norm": 7.626298427581787, + "learning_rate": 9.759874050243001e-05, + "loss": 0.9153, + "step": 2505 + }, + { + "epoch": 0.16979470153804457, + "grad_norm": 10.595001220703125, + "learning_rate": 9.75973714833322e-05, + "loss": 0.9911, + "step": 2506 + }, + { + "epoch": 0.16986245680601667, + "grad_norm": 7.564423084259033, + "learning_rate": 9.759600246423439e-05, + "loss": 0.9843, + "step": 2507 + }, + { + "epoch": 0.16993021207398876, + "grad_norm": 7.644829273223877, + "learning_rate": 9.759463344513657e-05, + "loss": 0.9116, + "step": 2508 + }, + { + "epoch": 0.16999796734196085, + "grad_norm": 7.54351282119751, + "learning_rate": 9.759326442603875e-05, + "loss": 1.1528, + "step": 2509 + }, + { + "epoch": 0.1700657226099329, + "grad_norm": 9.259818077087402, + "learning_rate": 9.759189540694094e-05, + "loss": 1.0867, + "step": 2510 + }, + { + "epoch": 0.170133477877905, + "grad_norm": 8.022993087768555, + "learning_rate": 9.759052638784312e-05, + "loss": 0.7524, + "step": 2511 + }, + { + "epoch": 0.1702012331458771, + "grad_norm": 8.264616966247559, + "learning_rate": 9.75891573687453e-05, + "loss": 0.9906, + "step": 2512 + }, + { + "epoch": 0.1702689884138492, + "grad_norm": 9.606420516967773, + "learning_rate": 9.758778834964748e-05, + "loss": 0.9287, + "step": 2513 + }, + { + "epoch": 0.17033674368182125, + "grad_norm": 8.019355773925781, + "learning_rate": 9.758641933054966e-05, + "loss": 0.879, + "step": 2514 + }, + { + "epoch": 0.17040449894979334, + "grad_norm": 9.819777488708496, + "learning_rate": 9.758505031145186e-05, + "loss": 1.3677, + "step": 2515 + }, + { + "epoch": 0.17047225421776543, + "grad_norm": 9.9795560836792, + "learning_rate": 9.758368129235404e-05, + "loss": 1.1614, + "step": 2516 + }, + { + "epoch": 0.17054000948573753, + "grad_norm": 9.51271915435791, + "learning_rate": 9.758231227325622e-05, + "loss": 1.0423, + "step": 2517 + }, + { + "epoch": 0.1706077647537096, + "grad_norm": 10.511359214782715, + "learning_rate": 9.75809432541584e-05, + "loss": 0.9492, + "step": 2518 + }, + { + "epoch": 0.17067552002168168, + "grad_norm": 9.61755657196045, + "learning_rate": 9.757957423506059e-05, + "loss": 1.1481, + "step": 2519 + }, + { + "epoch": 0.17074327528965377, + "grad_norm": 11.246142387390137, + "learning_rate": 9.757820521596277e-05, + "loss": 1.3981, + "step": 2520 + }, + { + "epoch": 0.17081103055762586, + "grad_norm": 9.273181915283203, + "learning_rate": 9.757683619686495e-05, + "loss": 0.9773, + "step": 2521 + }, + { + "epoch": 0.17087878582559793, + "grad_norm": 11.215822219848633, + "learning_rate": 9.757546717776713e-05, + "loss": 1.2303, + "step": 2522 + }, + { + "epoch": 0.17094654109357002, + "grad_norm": 10.139853477478027, + "learning_rate": 9.757409815866931e-05, + "loss": 1.0807, + "step": 2523 + }, + { + "epoch": 0.1710142963615421, + "grad_norm": 13.275606155395508, + "learning_rate": 9.757272913957151e-05, + "loss": 1.3326, + "step": 2524 + }, + { + "epoch": 0.1710820516295142, + "grad_norm": 8.32109260559082, + "learning_rate": 9.757136012047369e-05, + "loss": 0.9428, + "step": 2525 + }, + { + "epoch": 0.17114980689748627, + "grad_norm": 9.275816917419434, + "learning_rate": 9.756999110137587e-05, + "loss": 1.1531, + "step": 2526 + }, + { + "epoch": 0.17121756216545836, + "grad_norm": 10.61928939819336, + "learning_rate": 9.756862208227805e-05, + "loss": 0.9422, + "step": 2527 + }, + { + "epoch": 0.17128531743343045, + "grad_norm": 7.793631553649902, + "learning_rate": 9.756725306318023e-05, + "loss": 1.216, + "step": 2528 + }, + { + "epoch": 0.17135307270140254, + "grad_norm": 7.453477382659912, + "learning_rate": 9.756588404408242e-05, + "loss": 0.7787, + "step": 2529 + }, + { + "epoch": 0.1714208279693746, + "grad_norm": 10.39784049987793, + "learning_rate": 9.75645150249846e-05, + "loss": 1.3171, + "step": 2530 + }, + { + "epoch": 0.1714885832373467, + "grad_norm": 8.57040786743164, + "learning_rate": 9.756314600588678e-05, + "loss": 0.8319, + "step": 2531 + }, + { + "epoch": 0.1715563385053188, + "grad_norm": 8.696785926818848, + "learning_rate": 9.756177698678896e-05, + "loss": 1.0078, + "step": 2532 + }, + { + "epoch": 0.17162409377329088, + "grad_norm": 8.212169647216797, + "learning_rate": 9.756040796769116e-05, + "loss": 0.7851, + "step": 2533 + }, + { + "epoch": 0.17169184904126294, + "grad_norm": 10.94201946258545, + "learning_rate": 9.755903894859334e-05, + "loss": 1.0065, + "step": 2534 + }, + { + "epoch": 0.17175960430923504, + "grad_norm": 12.041540145874023, + "learning_rate": 9.755766992949552e-05, + "loss": 1.2938, + "step": 2535 + }, + { + "epoch": 0.17182735957720713, + "grad_norm": 9.289467811584473, + "learning_rate": 9.75563009103977e-05, + "loss": 1.1521, + "step": 2536 + }, + { + "epoch": 0.17189511484517922, + "grad_norm": 9.231005668640137, + "learning_rate": 9.755493189129988e-05, + "loss": 0.9, + "step": 2537 + }, + { + "epoch": 0.17196287011315128, + "grad_norm": 8.934699058532715, + "learning_rate": 9.755356287220207e-05, + "loss": 1.2817, + "step": 2538 + }, + { + "epoch": 0.17203062538112338, + "grad_norm": 10.096917152404785, + "learning_rate": 9.755219385310425e-05, + "loss": 1.1587, + "step": 2539 + }, + { + "epoch": 0.17209838064909547, + "grad_norm": 8.864645004272461, + "learning_rate": 9.755082483400643e-05, + "loss": 1.1781, + "step": 2540 + }, + { + "epoch": 0.17216613591706756, + "grad_norm": 10.964715957641602, + "learning_rate": 9.754945581490861e-05, + "loss": 1.2729, + "step": 2541 + }, + { + "epoch": 0.17223389118503965, + "grad_norm": 10.845703125, + "learning_rate": 9.754808679581081e-05, + "loss": 1.4475, + "step": 2542 + }, + { + "epoch": 0.17230164645301171, + "grad_norm": 9.800530433654785, + "learning_rate": 9.754671777671299e-05, + "loss": 1.0296, + "step": 2543 + }, + { + "epoch": 0.1723694017209838, + "grad_norm": 8.789834022521973, + "learning_rate": 9.754534875761517e-05, + "loss": 0.9573, + "step": 2544 + }, + { + "epoch": 0.1724371569889559, + "grad_norm": 7.731616973876953, + "learning_rate": 9.754397973851735e-05, + "loss": 0.9875, + "step": 2545 + }, + { + "epoch": 0.172504912256928, + "grad_norm": 8.942553520202637, + "learning_rate": 9.754261071941953e-05, + "loss": 1.1897, + "step": 2546 + }, + { + "epoch": 0.17257266752490005, + "grad_norm": 7.468856334686279, + "learning_rate": 9.754124170032172e-05, + "loss": 0.8869, + "step": 2547 + }, + { + "epoch": 0.17264042279287214, + "grad_norm": 8.798864364624023, + "learning_rate": 9.75398726812239e-05, + "loss": 1.0034, + "step": 2548 + }, + { + "epoch": 0.17270817806084424, + "grad_norm": 8.579094886779785, + "learning_rate": 9.753850366212608e-05, + "loss": 1.0211, + "step": 2549 + }, + { + "epoch": 0.17277593332881633, + "grad_norm": 10.90807819366455, + "learning_rate": 9.753713464302827e-05, + "loss": 1.3639, + "step": 2550 + }, + { + "epoch": 0.1728436885967884, + "grad_norm": 8.337204933166504, + "learning_rate": 9.753576562393046e-05, + "loss": 1.323, + "step": 2551 + }, + { + "epoch": 0.17291144386476048, + "grad_norm": 8.731851577758789, + "learning_rate": 9.753439660483264e-05, + "loss": 0.8805, + "step": 2552 + }, + { + "epoch": 0.17297919913273258, + "grad_norm": 9.541427612304688, + "learning_rate": 9.753302758573482e-05, + "loss": 1.0368, + "step": 2553 + }, + { + "epoch": 0.17304695440070467, + "grad_norm": 7.938154697418213, + "learning_rate": 9.753165856663701e-05, + "loss": 0.9128, + "step": 2554 + }, + { + "epoch": 0.17311470966867673, + "grad_norm": 8.111212730407715, + "learning_rate": 9.75302895475392e-05, + "loss": 0.9857, + "step": 2555 + }, + { + "epoch": 0.17318246493664882, + "grad_norm": 8.583324432373047, + "learning_rate": 9.752892052844139e-05, + "loss": 1.0975, + "step": 2556 + }, + { + "epoch": 0.1732502202046209, + "grad_norm": 8.180643081665039, + "learning_rate": 9.752755150934357e-05, + "loss": 1.21, + "step": 2557 + }, + { + "epoch": 0.173317975472593, + "grad_norm": 9.740217208862305, + "learning_rate": 9.752618249024575e-05, + "loss": 1.18, + "step": 2558 + }, + { + "epoch": 0.17338573074056507, + "grad_norm": 8.725831031799316, + "learning_rate": 9.752481347114793e-05, + "loss": 0.9816, + "step": 2559 + }, + { + "epoch": 0.17345348600853716, + "grad_norm": 10.786824226379395, + "learning_rate": 9.752344445205011e-05, + "loss": 1.0402, + "step": 2560 + }, + { + "epoch": 0.17352124127650925, + "grad_norm": 8.91720962524414, + "learning_rate": 9.75220754329523e-05, + "loss": 1.2357, + "step": 2561 + }, + { + "epoch": 0.17358899654448134, + "grad_norm": 9.02492618560791, + "learning_rate": 9.752070641385448e-05, + "loss": 1.1264, + "step": 2562 + }, + { + "epoch": 0.1736567518124534, + "grad_norm": 7.917794227600098, + "learning_rate": 9.751933739475666e-05, + "loss": 1.0937, + "step": 2563 + }, + { + "epoch": 0.1737245070804255, + "grad_norm": 11.543112754821777, + "learning_rate": 9.751796837565884e-05, + "loss": 1.0851, + "step": 2564 + }, + { + "epoch": 0.1737922623483976, + "grad_norm": 9.114936828613281, + "learning_rate": 9.751659935656104e-05, + "loss": 0.9565, + "step": 2565 + }, + { + "epoch": 0.17386001761636968, + "grad_norm": 12.322575569152832, + "learning_rate": 9.751523033746322e-05, + "loss": 1.0356, + "step": 2566 + }, + { + "epoch": 0.17392777288434175, + "grad_norm": 11.39137077331543, + "learning_rate": 9.75138613183654e-05, + "loss": 1.2141, + "step": 2567 + }, + { + "epoch": 0.17399552815231384, + "grad_norm": 9.505644798278809, + "learning_rate": 9.751249229926758e-05, + "loss": 0.8307, + "step": 2568 + }, + { + "epoch": 0.17406328342028593, + "grad_norm": 11.05932331085205, + "learning_rate": 9.751112328016976e-05, + "loss": 1.1915, + "step": 2569 + }, + { + "epoch": 0.17413103868825802, + "grad_norm": 11.104498863220215, + "learning_rate": 9.750975426107195e-05, + "loss": 1.0032, + "step": 2570 + }, + { + "epoch": 0.17419879395623009, + "grad_norm": 11.366668701171875, + "learning_rate": 9.750838524197413e-05, + "loss": 1.0857, + "step": 2571 + }, + { + "epoch": 0.17426654922420218, + "grad_norm": 8.775167465209961, + "learning_rate": 9.750701622287631e-05, + "loss": 1.1545, + "step": 2572 + }, + { + "epoch": 0.17433430449217427, + "grad_norm": 8.820073127746582, + "learning_rate": 9.75056472037785e-05, + "loss": 1.0528, + "step": 2573 + }, + { + "epoch": 0.17440205976014636, + "grad_norm": 9.53591537475586, + "learning_rate": 9.750427818468069e-05, + "loss": 1.0562, + "step": 2574 + }, + { + "epoch": 0.17446981502811842, + "grad_norm": 10.08950138092041, + "learning_rate": 9.750290916558287e-05, + "loss": 1.0456, + "step": 2575 + }, + { + "epoch": 0.17453757029609052, + "grad_norm": 8.222607612609863, + "learning_rate": 9.750154014648505e-05, + "loss": 0.9168, + "step": 2576 + }, + { + "epoch": 0.1746053255640626, + "grad_norm": 10.067093849182129, + "learning_rate": 9.750017112738723e-05, + "loss": 0.9952, + "step": 2577 + }, + { + "epoch": 0.1746730808320347, + "grad_norm": 9.726996421813965, + "learning_rate": 9.749880210828941e-05, + "loss": 0.9428, + "step": 2578 + }, + { + "epoch": 0.17474083610000676, + "grad_norm": 9.970647811889648, + "learning_rate": 9.74974330891916e-05, + "loss": 1.1838, + "step": 2579 + }, + { + "epoch": 0.17480859136797886, + "grad_norm": 7.8667893409729, + "learning_rate": 9.749606407009378e-05, + "loss": 1.0341, + "step": 2580 + }, + { + "epoch": 0.17487634663595095, + "grad_norm": 9.227079391479492, + "learning_rate": 9.749469505099596e-05, + "loss": 1.0515, + "step": 2581 + }, + { + "epoch": 0.17494410190392304, + "grad_norm": 9.198224067687988, + "learning_rate": 9.749332603189814e-05, + "loss": 1.0935, + "step": 2582 + }, + { + "epoch": 0.1750118571718951, + "grad_norm": 11.105298042297363, + "learning_rate": 9.749195701280032e-05, + "loss": 1.2115, + "step": 2583 + }, + { + "epoch": 0.1750796124398672, + "grad_norm": 8.815799713134766, + "learning_rate": 9.749058799370252e-05, + "loss": 1.1308, + "step": 2584 + }, + { + "epoch": 0.17514736770783929, + "grad_norm": 10.571581840515137, + "learning_rate": 9.74892189746047e-05, + "loss": 1.1982, + "step": 2585 + }, + { + "epoch": 0.17521512297581138, + "grad_norm": 7.368075370788574, + "learning_rate": 9.748784995550688e-05, + "loss": 0.9352, + "step": 2586 + }, + { + "epoch": 0.17528287824378344, + "grad_norm": 7.103427410125732, + "learning_rate": 9.748648093640906e-05, + "loss": 1.0578, + "step": 2587 + }, + { + "epoch": 0.17535063351175553, + "grad_norm": 7.678786754608154, + "learning_rate": 9.748511191731125e-05, + "loss": 0.8549, + "step": 2588 + }, + { + "epoch": 0.17541838877972762, + "grad_norm": 9.514643669128418, + "learning_rate": 9.748374289821343e-05, + "loss": 1.1835, + "step": 2589 + }, + { + "epoch": 0.17548614404769972, + "grad_norm": 7.552379131317139, + "learning_rate": 9.748237387911561e-05, + "loss": 0.9425, + "step": 2590 + }, + { + "epoch": 0.17555389931567178, + "grad_norm": 7.663018226623535, + "learning_rate": 9.74810048600178e-05, + "loss": 1.0948, + "step": 2591 + }, + { + "epoch": 0.17562165458364387, + "grad_norm": 10.293536186218262, + "learning_rate": 9.747963584091997e-05, + "loss": 1.1636, + "step": 2592 + }, + { + "epoch": 0.17568940985161596, + "grad_norm": 9.024083137512207, + "learning_rate": 9.747826682182217e-05, + "loss": 1.1157, + "step": 2593 + }, + { + "epoch": 0.17575716511958805, + "grad_norm": 8.870935440063477, + "learning_rate": 9.747689780272435e-05, + "loss": 0.9634, + "step": 2594 + }, + { + "epoch": 0.17582492038756015, + "grad_norm": 8.553323745727539, + "learning_rate": 9.747552878362653e-05, + "loss": 1.1515, + "step": 2595 + }, + { + "epoch": 0.1758926756555322, + "grad_norm": 9.13661003112793, + "learning_rate": 9.747415976452871e-05, + "loss": 0.9267, + "step": 2596 + }, + { + "epoch": 0.1759604309235043, + "grad_norm": 8.66097640991211, + "learning_rate": 9.74727907454309e-05, + "loss": 0.6839, + "step": 2597 + }, + { + "epoch": 0.1760281861914764, + "grad_norm": 10.07618236541748, + "learning_rate": 9.747142172633308e-05, + "loss": 1.2822, + "step": 2598 + }, + { + "epoch": 0.17609594145944849, + "grad_norm": 9.136283874511719, + "learning_rate": 9.747005270723526e-05, + "loss": 1.0053, + "step": 2599 + }, + { + "epoch": 0.17616369672742055, + "grad_norm": 9.42113208770752, + "learning_rate": 9.746868368813746e-05, + "loss": 1.15, + "step": 2600 + }, + { + "epoch": 0.17623145199539264, + "grad_norm": 8.789713859558105, + "learning_rate": 9.746731466903964e-05, + "loss": 0.8504, + "step": 2601 + }, + { + "epoch": 0.17629920726336473, + "grad_norm": 8.704665184020996, + "learning_rate": 9.746594564994182e-05, + "loss": 1.1516, + "step": 2602 + }, + { + "epoch": 0.17636696253133682, + "grad_norm": 8.030630111694336, + "learning_rate": 9.746457663084401e-05, + "loss": 1.044, + "step": 2603 + }, + { + "epoch": 0.1764347177993089, + "grad_norm": 8.640777587890625, + "learning_rate": 9.746320761174619e-05, + "loss": 1.0013, + "step": 2604 + }, + { + "epoch": 0.17650247306728098, + "grad_norm": 7.806771278381348, + "learning_rate": 9.746183859264837e-05, + "loss": 1.0678, + "step": 2605 + }, + { + "epoch": 0.17657022833525307, + "grad_norm": 8.883776664733887, + "learning_rate": 9.746046957355055e-05, + "loss": 0.9277, + "step": 2606 + }, + { + "epoch": 0.17663798360322516, + "grad_norm": 7.539346694946289, + "learning_rate": 9.745910055445275e-05, + "loss": 1.0437, + "step": 2607 + }, + { + "epoch": 0.17670573887119723, + "grad_norm": 9.873644828796387, + "learning_rate": 9.745773153535493e-05, + "loss": 1.206, + "step": 2608 + }, + { + "epoch": 0.17677349413916932, + "grad_norm": 10.031026840209961, + "learning_rate": 9.745636251625711e-05, + "loss": 1.1934, + "step": 2609 + }, + { + "epoch": 0.1768412494071414, + "grad_norm": 9.194849014282227, + "learning_rate": 9.745499349715929e-05, + "loss": 1.2365, + "step": 2610 + }, + { + "epoch": 0.1769090046751135, + "grad_norm": 7.951476097106934, + "learning_rate": 9.745362447806148e-05, + "loss": 0.8829, + "step": 2611 + }, + { + "epoch": 0.17697675994308557, + "grad_norm": 8.77597427368164, + "learning_rate": 9.745225545896366e-05, + "loss": 1.0097, + "step": 2612 + }, + { + "epoch": 0.17704451521105766, + "grad_norm": 10.356508255004883, + "learning_rate": 9.745088643986584e-05, + "loss": 1.2603, + "step": 2613 + }, + { + "epoch": 0.17711227047902975, + "grad_norm": 9.24199390411377, + "learning_rate": 9.744951742076802e-05, + "loss": 1.0181, + "step": 2614 + }, + { + "epoch": 0.17718002574700184, + "grad_norm": 8.244451522827148, + "learning_rate": 9.74481484016702e-05, + "loss": 0.9602, + "step": 2615 + }, + { + "epoch": 0.1772477810149739, + "grad_norm": 10.769842147827148, + "learning_rate": 9.74467793825724e-05, + "loss": 1.0646, + "step": 2616 + }, + { + "epoch": 0.177315536282946, + "grad_norm": 9.381787300109863, + "learning_rate": 9.744541036347458e-05, + "loss": 0.8985, + "step": 2617 + }, + { + "epoch": 0.1773832915509181, + "grad_norm": 8.379899978637695, + "learning_rate": 9.744404134437676e-05, + "loss": 1.0368, + "step": 2618 + }, + { + "epoch": 0.17745104681889018, + "grad_norm": 9.59954833984375, + "learning_rate": 9.744267232527894e-05, + "loss": 1.1447, + "step": 2619 + }, + { + "epoch": 0.17751880208686224, + "grad_norm": 8.83703327178955, + "learning_rate": 9.744130330618113e-05, + "loss": 1.3435, + "step": 2620 + }, + { + "epoch": 0.17758655735483433, + "grad_norm": 9.448990821838379, + "learning_rate": 9.743993428708331e-05, + "loss": 1.1837, + "step": 2621 + }, + { + "epoch": 0.17765431262280643, + "grad_norm": 9.466961860656738, + "learning_rate": 9.74385652679855e-05, + "loss": 1.1265, + "step": 2622 + }, + { + "epoch": 0.17772206789077852, + "grad_norm": 8.68281364440918, + "learning_rate": 9.743719624888767e-05, + "loss": 0.8804, + "step": 2623 + }, + { + "epoch": 0.17778982315875058, + "grad_norm": 7.007611274719238, + "learning_rate": 9.743582722978985e-05, + "loss": 1.03, + "step": 2624 + }, + { + "epoch": 0.17785757842672267, + "grad_norm": 8.254279136657715, + "learning_rate": 9.743445821069205e-05, + "loss": 1.0328, + "step": 2625 + }, + { + "epoch": 0.17792533369469477, + "grad_norm": 9.134196281433105, + "learning_rate": 9.743308919159423e-05, + "loss": 0.9671, + "step": 2626 + }, + { + "epoch": 0.17799308896266686, + "grad_norm": 7.803997039794922, + "learning_rate": 9.743172017249641e-05, + "loss": 0.9692, + "step": 2627 + }, + { + "epoch": 0.17806084423063892, + "grad_norm": 8.37303638458252, + "learning_rate": 9.743035115339859e-05, + "loss": 0.8949, + "step": 2628 + }, + { + "epoch": 0.178128599498611, + "grad_norm": 9.928305625915527, + "learning_rate": 9.742898213430078e-05, + "loss": 1.2679, + "step": 2629 + }, + { + "epoch": 0.1781963547665831, + "grad_norm": 8.58604621887207, + "learning_rate": 9.742761311520296e-05, + "loss": 1.23, + "step": 2630 + }, + { + "epoch": 0.1782641100345552, + "grad_norm": 9.47903823852539, + "learning_rate": 9.742624409610514e-05, + "loss": 0.8701, + "step": 2631 + }, + { + "epoch": 0.17833186530252726, + "grad_norm": 8.013197898864746, + "learning_rate": 9.742487507700732e-05, + "loss": 0.8325, + "step": 2632 + }, + { + "epoch": 0.17839962057049935, + "grad_norm": 9.541396141052246, + "learning_rate": 9.74235060579095e-05, + "loss": 1.0996, + "step": 2633 + }, + { + "epoch": 0.17846737583847144, + "grad_norm": 7.515398979187012, + "learning_rate": 9.74221370388117e-05, + "loss": 1.1174, + "step": 2634 + }, + { + "epoch": 0.17853513110644353, + "grad_norm": 9.562670707702637, + "learning_rate": 9.742076801971388e-05, + "loss": 0.9727, + "step": 2635 + }, + { + "epoch": 0.1786028863744156, + "grad_norm": 10.072449684143066, + "learning_rate": 9.741939900061606e-05, + "loss": 1.0021, + "step": 2636 + }, + { + "epoch": 0.1786706416423877, + "grad_norm": 9.634955406188965, + "learning_rate": 9.741802998151824e-05, + "loss": 1.3998, + "step": 2637 + }, + { + "epoch": 0.17873839691035978, + "grad_norm": 8.78954792022705, + "learning_rate": 9.741666096242042e-05, + "loss": 0.9602, + "step": 2638 + }, + { + "epoch": 0.17880615217833187, + "grad_norm": 9.597916603088379, + "learning_rate": 9.741529194332261e-05, + "loss": 1.0174, + "step": 2639 + }, + { + "epoch": 0.17887390744630394, + "grad_norm": 9.543583869934082, + "learning_rate": 9.74139229242248e-05, + "loss": 1.0686, + "step": 2640 + }, + { + "epoch": 0.17894166271427603, + "grad_norm": 8.247551918029785, + "learning_rate": 9.741255390512697e-05, + "loss": 1.1026, + "step": 2641 + }, + { + "epoch": 0.17900941798224812, + "grad_norm": 8.487943649291992, + "learning_rate": 9.741118488602915e-05, + "loss": 0.8843, + "step": 2642 + }, + { + "epoch": 0.1790771732502202, + "grad_norm": 9.061832427978516, + "learning_rate": 9.740981586693135e-05, + "loss": 0.9708, + "step": 2643 + }, + { + "epoch": 0.17914492851819228, + "grad_norm": 9.242405891418457, + "learning_rate": 9.740844684783353e-05, + "loss": 1.1344, + "step": 2644 + }, + { + "epoch": 0.17921268378616437, + "grad_norm": 8.192344665527344, + "learning_rate": 9.740707782873571e-05, + "loss": 0.7587, + "step": 2645 + }, + { + "epoch": 0.17928043905413646, + "grad_norm": 6.6164445877075195, + "learning_rate": 9.74057088096379e-05, + "loss": 0.842, + "step": 2646 + }, + { + "epoch": 0.17934819432210855, + "grad_norm": 10.285326957702637, + "learning_rate": 9.740433979054008e-05, + "loss": 1.138, + "step": 2647 + }, + { + "epoch": 0.17941594959008064, + "grad_norm": 8.55659294128418, + "learning_rate": 9.740297077144226e-05, + "loss": 0.7925, + "step": 2648 + }, + { + "epoch": 0.1794837048580527, + "grad_norm": 8.384642601013184, + "learning_rate": 9.740160175234446e-05, + "loss": 0.9719, + "step": 2649 + }, + { + "epoch": 0.1795514601260248, + "grad_norm": 9.303678512573242, + "learning_rate": 9.740023273324664e-05, + "loss": 1.0688, + "step": 2650 + }, + { + "epoch": 0.1796192153939969, + "grad_norm": 9.642799377441406, + "learning_rate": 9.739886371414882e-05, + "loss": 0.9758, + "step": 2651 + }, + { + "epoch": 0.17968697066196898, + "grad_norm": 9.616509437561035, + "learning_rate": 9.739749469505101e-05, + "loss": 0.9999, + "step": 2652 + }, + { + "epoch": 0.17975472592994104, + "grad_norm": 10.511337280273438, + "learning_rate": 9.739612567595319e-05, + "loss": 1.0079, + "step": 2653 + }, + { + "epoch": 0.17982248119791314, + "grad_norm": 7.480882167816162, + "learning_rate": 9.739475665685537e-05, + "loss": 0.8353, + "step": 2654 + }, + { + "epoch": 0.17989023646588523, + "grad_norm": 10.277608871459961, + "learning_rate": 9.739338763775755e-05, + "loss": 0.9709, + "step": 2655 + }, + { + "epoch": 0.17995799173385732, + "grad_norm": 9.135882377624512, + "learning_rate": 9.739201861865973e-05, + "loss": 1.3888, + "step": 2656 + }, + { + "epoch": 0.18002574700182938, + "grad_norm": 8.147950172424316, + "learning_rate": 9.739064959956193e-05, + "loss": 1.2235, + "step": 2657 + }, + { + "epoch": 0.18009350226980148, + "grad_norm": 8.583501815795898, + "learning_rate": 9.738928058046411e-05, + "loss": 0.9123, + "step": 2658 + }, + { + "epoch": 0.18016125753777357, + "grad_norm": 8.202300071716309, + "learning_rate": 9.738791156136629e-05, + "loss": 1.0435, + "step": 2659 + }, + { + "epoch": 0.18022901280574566, + "grad_norm": 8.121417045593262, + "learning_rate": 9.738654254226847e-05, + "loss": 0.9351, + "step": 2660 + }, + { + "epoch": 0.18029676807371772, + "grad_norm": 10.645029067993164, + "learning_rate": 9.738517352317065e-05, + "loss": 1.5219, + "step": 2661 + }, + { + "epoch": 0.18036452334168981, + "grad_norm": 9.766422271728516, + "learning_rate": 9.738380450407284e-05, + "loss": 1.0499, + "step": 2662 + }, + { + "epoch": 0.1804322786096619, + "grad_norm": 8.137174606323242, + "learning_rate": 9.738243548497502e-05, + "loss": 1.1871, + "step": 2663 + }, + { + "epoch": 0.180500033877634, + "grad_norm": 8.422198295593262, + "learning_rate": 9.73810664658772e-05, + "loss": 0.9524, + "step": 2664 + }, + { + "epoch": 0.18056778914560606, + "grad_norm": 6.721381664276123, + "learning_rate": 9.737969744677938e-05, + "loss": 1.0449, + "step": 2665 + }, + { + "epoch": 0.18063554441357815, + "grad_norm": 9.175302505493164, + "learning_rate": 9.737832842768158e-05, + "loss": 1.0508, + "step": 2666 + }, + { + "epoch": 0.18070329968155024, + "grad_norm": 10.165428161621094, + "learning_rate": 9.737695940858376e-05, + "loss": 0.8744, + "step": 2667 + }, + { + "epoch": 0.18077105494952234, + "grad_norm": 9.396173477172852, + "learning_rate": 9.737559038948594e-05, + "loss": 1.0304, + "step": 2668 + }, + { + "epoch": 0.1808388102174944, + "grad_norm": 9.680516242980957, + "learning_rate": 9.737422137038812e-05, + "loss": 0.9773, + "step": 2669 + }, + { + "epoch": 0.1809065654854665, + "grad_norm": 7.570343017578125, + "learning_rate": 9.73728523512903e-05, + "loss": 0.9159, + "step": 2670 + }, + { + "epoch": 0.18097432075343858, + "grad_norm": 9.5789213180542, + "learning_rate": 9.737148333219249e-05, + "loss": 1.1269, + "step": 2671 + }, + { + "epoch": 0.18104207602141068, + "grad_norm": 8.559327125549316, + "learning_rate": 9.737011431309467e-05, + "loss": 1.2318, + "step": 2672 + }, + { + "epoch": 0.18110983128938274, + "grad_norm": 7.630974292755127, + "learning_rate": 9.736874529399685e-05, + "loss": 0.8996, + "step": 2673 + }, + { + "epoch": 0.18117758655735483, + "grad_norm": 8.078895568847656, + "learning_rate": 9.736737627489903e-05, + "loss": 0.9942, + "step": 2674 + }, + { + "epoch": 0.18124534182532692, + "grad_norm": 8.830656051635742, + "learning_rate": 9.736600725580123e-05, + "loss": 1.0614, + "step": 2675 + }, + { + "epoch": 0.181313097093299, + "grad_norm": 7.66297721862793, + "learning_rate": 9.736463823670341e-05, + "loss": 1.025, + "step": 2676 + }, + { + "epoch": 0.18138085236127108, + "grad_norm": 8.318365097045898, + "learning_rate": 9.736326921760559e-05, + "loss": 1.1286, + "step": 2677 + }, + { + "epoch": 0.18144860762924317, + "grad_norm": 10.180238723754883, + "learning_rate": 9.736190019850777e-05, + "loss": 1.1304, + "step": 2678 + }, + { + "epoch": 0.18151636289721526, + "grad_norm": 9.2420654296875, + "learning_rate": 9.736053117940995e-05, + "loss": 1.157, + "step": 2679 + }, + { + "epoch": 0.18158411816518735, + "grad_norm": 7.984904766082764, + "learning_rate": 9.735916216031214e-05, + "loss": 1.1325, + "step": 2680 + }, + { + "epoch": 0.18165187343315942, + "grad_norm": 11.136275291442871, + "learning_rate": 9.735779314121432e-05, + "loss": 1.0151, + "step": 2681 + }, + { + "epoch": 0.1817196287011315, + "grad_norm": 9.954483032226562, + "learning_rate": 9.73564241221165e-05, + "loss": 1.0934, + "step": 2682 + }, + { + "epoch": 0.1817873839691036, + "grad_norm": 8.491388320922852, + "learning_rate": 9.735505510301868e-05, + "loss": 1.2288, + "step": 2683 + }, + { + "epoch": 0.1818551392370757, + "grad_norm": 9.355586051940918, + "learning_rate": 9.735368608392088e-05, + "loss": 0.7999, + "step": 2684 + }, + { + "epoch": 0.18192289450504776, + "grad_norm": 8.829736709594727, + "learning_rate": 9.735231706482306e-05, + "loss": 0.9711, + "step": 2685 + }, + { + "epoch": 0.18199064977301985, + "grad_norm": 9.667959213256836, + "learning_rate": 9.735094804572524e-05, + "loss": 1.1132, + "step": 2686 + }, + { + "epoch": 0.18205840504099194, + "grad_norm": 6.971467971801758, + "learning_rate": 9.734957902662742e-05, + "loss": 1.0252, + "step": 2687 + }, + { + "epoch": 0.18212616030896403, + "grad_norm": 9.69013500213623, + "learning_rate": 9.73482100075296e-05, + "loss": 0.9562, + "step": 2688 + }, + { + "epoch": 0.1821939155769361, + "grad_norm": 9.47673511505127, + "learning_rate": 9.734684098843179e-05, + "loss": 0.9822, + "step": 2689 + }, + { + "epoch": 0.18226167084490819, + "grad_norm": 9.708051681518555, + "learning_rate": 9.734547196933397e-05, + "loss": 1.0772, + "step": 2690 + }, + { + "epoch": 0.18232942611288028, + "grad_norm": 10.617173194885254, + "learning_rate": 9.734410295023615e-05, + "loss": 1.2686, + "step": 2691 + }, + { + "epoch": 0.18239718138085237, + "grad_norm": 9.52670955657959, + "learning_rate": 9.734273393113835e-05, + "loss": 0.9477, + "step": 2692 + }, + { + "epoch": 0.18246493664882443, + "grad_norm": 10.090290069580078, + "learning_rate": 9.734136491204053e-05, + "loss": 1.1766, + "step": 2693 + }, + { + "epoch": 0.18253269191679652, + "grad_norm": 8.709790229797363, + "learning_rate": 9.733999589294271e-05, + "loss": 1.1327, + "step": 2694 + }, + { + "epoch": 0.18260044718476862, + "grad_norm": 12.07381534576416, + "learning_rate": 9.73386268738449e-05, + "loss": 1.3694, + "step": 2695 + }, + { + "epoch": 0.1826682024527407, + "grad_norm": 8.329826354980469, + "learning_rate": 9.733725785474708e-05, + "loss": 1.0448, + "step": 2696 + }, + { + "epoch": 0.18273595772071277, + "grad_norm": 7.551383972167969, + "learning_rate": 9.733588883564926e-05, + "loss": 1.029, + "step": 2697 + }, + { + "epoch": 0.18280371298868486, + "grad_norm": 9.393105506896973, + "learning_rate": 9.733451981655146e-05, + "loss": 0.9729, + "step": 2698 + }, + { + "epoch": 0.18287146825665695, + "grad_norm": 9.283944129943848, + "learning_rate": 9.733315079745364e-05, + "loss": 1.2459, + "step": 2699 + }, + { + "epoch": 0.18293922352462905, + "grad_norm": 9.02878189086914, + "learning_rate": 9.733178177835582e-05, + "loss": 1.3235, + "step": 2700 + }, + { + "epoch": 0.18300697879260114, + "grad_norm": 8.735793113708496, + "learning_rate": 9.7330412759258e-05, + "loss": 1.1447, + "step": 2701 + }, + { + "epoch": 0.1830747340605732, + "grad_norm": 8.511741638183594, + "learning_rate": 9.732904374016018e-05, + "loss": 1.0148, + "step": 2702 + }, + { + "epoch": 0.1831424893285453, + "grad_norm": 6.590076446533203, + "learning_rate": 9.732767472106237e-05, + "loss": 1.0904, + "step": 2703 + }, + { + "epoch": 0.18321024459651739, + "grad_norm": 11.523942947387695, + "learning_rate": 9.732630570196455e-05, + "loss": 0.9349, + "step": 2704 + }, + { + "epoch": 0.18327799986448948, + "grad_norm": 6.972995281219482, + "learning_rate": 9.732493668286673e-05, + "loss": 0.9837, + "step": 2705 + }, + { + "epoch": 0.18334575513246154, + "grad_norm": 7.834453105926514, + "learning_rate": 9.732356766376891e-05, + "loss": 1.0374, + "step": 2706 + }, + { + "epoch": 0.18341351040043363, + "grad_norm": 9.44674015045166, + "learning_rate": 9.732219864467111e-05, + "loss": 1.4479, + "step": 2707 + }, + { + "epoch": 0.18348126566840572, + "grad_norm": 9.017583847045898, + "learning_rate": 9.732082962557329e-05, + "loss": 1.136, + "step": 2708 + }, + { + "epoch": 0.18354902093637782, + "grad_norm": 8.40336799621582, + "learning_rate": 9.731946060647547e-05, + "loss": 0.9914, + "step": 2709 + }, + { + "epoch": 0.18361677620434988, + "grad_norm": 10.586843490600586, + "learning_rate": 9.731809158737765e-05, + "loss": 1.1515, + "step": 2710 + }, + { + "epoch": 0.18368453147232197, + "grad_norm": 7.611972808837891, + "learning_rate": 9.731672256827983e-05, + "loss": 1.0821, + "step": 2711 + }, + { + "epoch": 0.18375228674029406, + "grad_norm": 9.093097686767578, + "learning_rate": 9.731535354918202e-05, + "loss": 1.1295, + "step": 2712 + }, + { + "epoch": 0.18382004200826615, + "grad_norm": 8.013279914855957, + "learning_rate": 9.73139845300842e-05, + "loss": 0.839, + "step": 2713 + }, + { + "epoch": 0.18388779727623822, + "grad_norm": 7.941558837890625, + "learning_rate": 9.731261551098638e-05, + "loss": 1.0819, + "step": 2714 + }, + { + "epoch": 0.1839555525442103, + "grad_norm": 7.09537935256958, + "learning_rate": 9.731124649188856e-05, + "loss": 0.8606, + "step": 2715 + }, + { + "epoch": 0.1840233078121824, + "grad_norm": 7.867136478424072, + "learning_rate": 9.730987747279074e-05, + "loss": 0.9942, + "step": 2716 + }, + { + "epoch": 0.1840910630801545, + "grad_norm": 8.66762638092041, + "learning_rate": 9.730850845369294e-05, + "loss": 1.0792, + "step": 2717 + }, + { + "epoch": 0.18415881834812656, + "grad_norm": 7.97219705581665, + "learning_rate": 9.730713943459512e-05, + "loss": 1.0366, + "step": 2718 + }, + { + "epoch": 0.18422657361609865, + "grad_norm": 8.341264724731445, + "learning_rate": 9.73057704154973e-05, + "loss": 1.0833, + "step": 2719 + }, + { + "epoch": 0.18429432888407074, + "grad_norm": 8.704034805297852, + "learning_rate": 9.730440139639948e-05, + "loss": 1.2909, + "step": 2720 + }, + { + "epoch": 0.18436208415204283, + "grad_norm": 9.759525299072266, + "learning_rate": 9.730303237730167e-05, + "loss": 1.1749, + "step": 2721 + }, + { + "epoch": 0.1844298394200149, + "grad_norm": 9.2392578125, + "learning_rate": 9.730166335820385e-05, + "loss": 1.4037, + "step": 2722 + }, + { + "epoch": 0.184497594687987, + "grad_norm": 10.519861221313477, + "learning_rate": 9.730029433910603e-05, + "loss": 1.2692, + "step": 2723 + }, + { + "epoch": 0.18456534995595908, + "grad_norm": 8.285715103149414, + "learning_rate": 9.729892532000821e-05, + "loss": 0.8014, + "step": 2724 + }, + { + "epoch": 0.18463310522393117, + "grad_norm": 7.261229991912842, + "learning_rate": 9.72975563009104e-05, + "loss": 0.9302, + "step": 2725 + }, + { + "epoch": 0.18470086049190323, + "grad_norm": 10.149426460266113, + "learning_rate": 9.729618728181259e-05, + "loss": 1.2855, + "step": 2726 + }, + { + "epoch": 0.18476861575987533, + "grad_norm": 7.495121955871582, + "learning_rate": 9.729481826271477e-05, + "loss": 0.9771, + "step": 2727 + }, + { + "epoch": 0.18483637102784742, + "grad_norm": 9.102607727050781, + "learning_rate": 9.729344924361695e-05, + "loss": 1.1974, + "step": 2728 + }, + { + "epoch": 0.1849041262958195, + "grad_norm": 9.57135009765625, + "learning_rate": 9.729208022451913e-05, + "loss": 0.9758, + "step": 2729 + }, + { + "epoch": 0.18497188156379157, + "grad_norm": 8.745903015136719, + "learning_rate": 9.729071120542132e-05, + "loss": 1.1719, + "step": 2730 + }, + { + "epoch": 0.18503963683176367, + "grad_norm": 10.697607040405273, + "learning_rate": 9.72893421863235e-05, + "loss": 0.952, + "step": 2731 + }, + { + "epoch": 0.18510739209973576, + "grad_norm": 9.907280921936035, + "learning_rate": 9.728797316722568e-05, + "loss": 0.9571, + "step": 2732 + }, + { + "epoch": 0.18517514736770785, + "grad_norm": 7.285250186920166, + "learning_rate": 9.728660414812786e-05, + "loss": 0.9002, + "step": 2733 + }, + { + "epoch": 0.1852429026356799, + "grad_norm": 8.134112358093262, + "learning_rate": 9.728523512903004e-05, + "loss": 0.8645, + "step": 2734 + }, + { + "epoch": 0.185310657903652, + "grad_norm": 9.427742004394531, + "learning_rate": 9.728386610993224e-05, + "loss": 1.2333, + "step": 2735 + }, + { + "epoch": 0.1853784131716241, + "grad_norm": 8.804586410522461, + "learning_rate": 9.728249709083442e-05, + "loss": 1.2601, + "step": 2736 + }, + { + "epoch": 0.1854461684395962, + "grad_norm": 9.144674301147461, + "learning_rate": 9.72811280717366e-05, + "loss": 0.9503, + "step": 2737 + }, + { + "epoch": 0.18551392370756825, + "grad_norm": 9.399337768554688, + "learning_rate": 9.727975905263879e-05, + "loss": 0.8586, + "step": 2738 + }, + { + "epoch": 0.18558167897554034, + "grad_norm": 7.7377119064331055, + "learning_rate": 9.727839003354097e-05, + "loss": 0.7511, + "step": 2739 + }, + { + "epoch": 0.18564943424351243, + "grad_norm": 9.146937370300293, + "learning_rate": 9.727702101444315e-05, + "loss": 1.0037, + "step": 2740 + }, + { + "epoch": 0.18571718951148453, + "grad_norm": 7.722357273101807, + "learning_rate": 9.727565199534535e-05, + "loss": 1.077, + "step": 2741 + }, + { + "epoch": 0.1857849447794566, + "grad_norm": 7.808566093444824, + "learning_rate": 9.727428297624753e-05, + "loss": 0.8953, + "step": 2742 + }, + { + "epoch": 0.18585270004742868, + "grad_norm": 8.765763282775879, + "learning_rate": 9.727291395714971e-05, + "loss": 1.0691, + "step": 2743 + }, + { + "epoch": 0.18592045531540077, + "grad_norm": 11.350706100463867, + "learning_rate": 9.72715449380519e-05, + "loss": 0.9308, + "step": 2744 + }, + { + "epoch": 0.18598821058337286, + "grad_norm": 7.838237285614014, + "learning_rate": 9.727017591895408e-05, + "loss": 0.9452, + "step": 2745 + }, + { + "epoch": 0.18605596585134493, + "grad_norm": 9.660740852355957, + "learning_rate": 9.726880689985626e-05, + "loss": 0.8705, + "step": 2746 + }, + { + "epoch": 0.18612372111931702, + "grad_norm": 8.146308898925781, + "learning_rate": 9.726743788075844e-05, + "loss": 1.0362, + "step": 2747 + }, + { + "epoch": 0.1861914763872891, + "grad_norm": 9.901165008544922, + "learning_rate": 9.726606886166062e-05, + "loss": 1.0842, + "step": 2748 + }, + { + "epoch": 0.1862592316552612, + "grad_norm": 7.650402069091797, + "learning_rate": 9.726469984256282e-05, + "loss": 0.997, + "step": 2749 + }, + { + "epoch": 0.18632698692323327, + "grad_norm": 7.760092735290527, + "learning_rate": 9.7263330823465e-05, + "loss": 0.9903, + "step": 2750 + }, + { + "epoch": 0.18639474219120536, + "grad_norm": 9.523726463317871, + "learning_rate": 9.726196180436718e-05, + "loss": 0.9276, + "step": 2751 + }, + { + "epoch": 0.18646249745917745, + "grad_norm": 7.759490489959717, + "learning_rate": 9.726059278526936e-05, + "loss": 0.9356, + "step": 2752 + }, + { + "epoch": 0.18653025272714954, + "grad_norm": 10.949979782104492, + "learning_rate": 9.725922376617155e-05, + "loss": 1.1451, + "step": 2753 + }, + { + "epoch": 0.18659800799512163, + "grad_norm": 8.312686920166016, + "learning_rate": 9.725785474707373e-05, + "loss": 0.973, + "step": 2754 + }, + { + "epoch": 0.1866657632630937, + "grad_norm": 6.999983787536621, + "learning_rate": 9.725648572797591e-05, + "loss": 1.0699, + "step": 2755 + }, + { + "epoch": 0.1867335185310658, + "grad_norm": 9.422745704650879, + "learning_rate": 9.725511670887809e-05, + "loss": 1.0697, + "step": 2756 + }, + { + "epoch": 0.18680127379903788, + "grad_norm": 8.054603576660156, + "learning_rate": 9.725374768978027e-05, + "loss": 0.8764, + "step": 2757 + }, + { + "epoch": 0.18686902906700997, + "grad_norm": 7.902176856994629, + "learning_rate": 9.725237867068247e-05, + "loss": 0.9893, + "step": 2758 + }, + { + "epoch": 0.18693678433498204, + "grad_norm": 8.409537315368652, + "learning_rate": 9.725100965158465e-05, + "loss": 1.0526, + "step": 2759 + }, + { + "epoch": 0.18700453960295413, + "grad_norm": 9.867463111877441, + "learning_rate": 9.724964063248683e-05, + "loss": 1.1218, + "step": 2760 + }, + { + "epoch": 0.18707229487092622, + "grad_norm": 6.852199554443359, + "learning_rate": 9.724827161338901e-05, + "loss": 0.9838, + "step": 2761 + }, + { + "epoch": 0.1871400501388983, + "grad_norm": 9.132448196411133, + "learning_rate": 9.72469025942912e-05, + "loss": 1.038, + "step": 2762 + }, + { + "epoch": 0.18720780540687038, + "grad_norm": 8.699213981628418, + "learning_rate": 9.724553357519338e-05, + "loss": 1.0532, + "step": 2763 + }, + { + "epoch": 0.18727556067484247, + "grad_norm": 8.520672798156738, + "learning_rate": 9.724416455609556e-05, + "loss": 1.2942, + "step": 2764 + }, + { + "epoch": 0.18734331594281456, + "grad_norm": 9.690433502197266, + "learning_rate": 9.724279553699774e-05, + "loss": 0.9552, + "step": 2765 + }, + { + "epoch": 0.18741107121078665, + "grad_norm": 7.540090560913086, + "learning_rate": 9.724142651789992e-05, + "loss": 1.0718, + "step": 2766 + }, + { + "epoch": 0.18747882647875871, + "grad_norm": 9.072039604187012, + "learning_rate": 9.724005749880212e-05, + "loss": 1.0313, + "step": 2767 + }, + { + "epoch": 0.1875465817467308, + "grad_norm": 10.155011177062988, + "learning_rate": 9.72386884797043e-05, + "loss": 1.3498, + "step": 2768 + }, + { + "epoch": 0.1876143370147029, + "grad_norm": 7.816718101501465, + "learning_rate": 9.723731946060648e-05, + "loss": 0.9594, + "step": 2769 + }, + { + "epoch": 0.187682092282675, + "grad_norm": 10.980911254882812, + "learning_rate": 9.723595044150866e-05, + "loss": 1.1675, + "step": 2770 + }, + { + "epoch": 0.18774984755064705, + "grad_norm": 8.865739822387695, + "learning_rate": 9.723458142241084e-05, + "loss": 1.1796, + "step": 2771 + }, + { + "epoch": 0.18781760281861914, + "grad_norm": 8.97850227355957, + "learning_rate": 9.723321240331303e-05, + "loss": 1.0907, + "step": 2772 + }, + { + "epoch": 0.18788535808659124, + "grad_norm": 8.16921329498291, + "learning_rate": 9.723184338421521e-05, + "loss": 0.95, + "step": 2773 + }, + { + "epoch": 0.18795311335456333, + "grad_norm": 8.766203880310059, + "learning_rate": 9.723047436511739e-05, + "loss": 0.9087, + "step": 2774 + }, + { + "epoch": 0.1880208686225354, + "grad_norm": 7.410607814788818, + "learning_rate": 9.722910534601957e-05, + "loss": 0.8082, + "step": 2775 + }, + { + "epoch": 0.18808862389050748, + "grad_norm": 9.640182495117188, + "learning_rate": 9.722773632692177e-05, + "loss": 0.99, + "step": 2776 + }, + { + "epoch": 0.18815637915847958, + "grad_norm": 9.038297653198242, + "learning_rate": 9.722636730782395e-05, + "loss": 1.0017, + "step": 2777 + }, + { + "epoch": 0.18822413442645167, + "grad_norm": 10.167421340942383, + "learning_rate": 9.722499828872613e-05, + "loss": 1.1588, + "step": 2778 + }, + { + "epoch": 0.18829188969442373, + "grad_norm": 9.911538124084473, + "learning_rate": 9.722362926962831e-05, + "loss": 1.052, + "step": 2779 + }, + { + "epoch": 0.18835964496239582, + "grad_norm": 8.78661060333252, + "learning_rate": 9.722226025053049e-05, + "loss": 0.9827, + "step": 2780 + }, + { + "epoch": 0.18842740023036791, + "grad_norm": 8.58356761932373, + "learning_rate": 9.722089123143268e-05, + "loss": 0.9663, + "step": 2781 + }, + { + "epoch": 0.18849515549834, + "grad_norm": 7.882653713226318, + "learning_rate": 9.721952221233486e-05, + "loss": 1.0282, + "step": 2782 + }, + { + "epoch": 0.18856291076631207, + "grad_norm": 8.180728912353516, + "learning_rate": 9.721815319323704e-05, + "loss": 0.8983, + "step": 2783 + }, + { + "epoch": 0.18863066603428416, + "grad_norm": 10.785475730895996, + "learning_rate": 9.721678417413924e-05, + "loss": 1.0949, + "step": 2784 + }, + { + "epoch": 0.18869842130225625, + "grad_norm": 8.2493257522583, + "learning_rate": 9.721541515504142e-05, + "loss": 0.8891, + "step": 2785 + }, + { + "epoch": 0.18876617657022834, + "grad_norm": 8.298515319824219, + "learning_rate": 9.72140461359436e-05, + "loss": 0.9384, + "step": 2786 + }, + { + "epoch": 0.1888339318382004, + "grad_norm": 11.643486022949219, + "learning_rate": 9.721267711684579e-05, + "loss": 1.0915, + "step": 2787 + }, + { + "epoch": 0.1889016871061725, + "grad_norm": 10.728472709655762, + "learning_rate": 9.721130809774797e-05, + "loss": 1.562, + "step": 2788 + }, + { + "epoch": 0.1889694423741446, + "grad_norm": 7.9253435134887695, + "learning_rate": 9.720993907865015e-05, + "loss": 0.876, + "step": 2789 + }, + { + "epoch": 0.18903719764211668, + "grad_norm": 9.075439453125, + "learning_rate": 9.720857005955235e-05, + "loss": 1.1712, + "step": 2790 + }, + { + "epoch": 0.18910495291008875, + "grad_norm": 8.572853088378906, + "learning_rate": 9.720720104045453e-05, + "loss": 0.9697, + "step": 2791 + }, + { + "epoch": 0.18917270817806084, + "grad_norm": 8.40988826751709, + "learning_rate": 9.720583202135671e-05, + "loss": 1.0904, + "step": 2792 + }, + { + "epoch": 0.18924046344603293, + "grad_norm": 8.015021324157715, + "learning_rate": 9.720446300225889e-05, + "loss": 1.0264, + "step": 2793 + }, + { + "epoch": 0.18930821871400502, + "grad_norm": 10.295256614685059, + "learning_rate": 9.720309398316107e-05, + "loss": 0.9608, + "step": 2794 + }, + { + "epoch": 0.18937597398197709, + "grad_norm": 9.377728462219238, + "learning_rate": 9.720172496406326e-05, + "loss": 1.2169, + "step": 2795 + }, + { + "epoch": 0.18944372924994918, + "grad_norm": 8.55238151550293, + "learning_rate": 9.720035594496544e-05, + "loss": 1.0951, + "step": 2796 + }, + { + "epoch": 0.18951148451792127, + "grad_norm": 10.069438934326172, + "learning_rate": 9.719898692586762e-05, + "loss": 1.3206, + "step": 2797 + }, + { + "epoch": 0.18957923978589336, + "grad_norm": 9.229057312011719, + "learning_rate": 9.71976179067698e-05, + "loss": 1.18, + "step": 2798 + }, + { + "epoch": 0.18964699505386542, + "grad_norm": 8.911051750183105, + "learning_rate": 9.7196248887672e-05, + "loss": 1.1351, + "step": 2799 + }, + { + "epoch": 0.18971475032183752, + "grad_norm": 9.285752296447754, + "learning_rate": 9.719487986857418e-05, + "loss": 1.0626, + "step": 2800 + }, + { + "epoch": 0.1897825055898096, + "grad_norm": 8.793561935424805, + "learning_rate": 9.719351084947636e-05, + "loss": 1.033, + "step": 2801 + }, + { + "epoch": 0.1898502608577817, + "grad_norm": 10.635384559631348, + "learning_rate": 9.719214183037854e-05, + "loss": 1.5004, + "step": 2802 + }, + { + "epoch": 0.18991801612575376, + "grad_norm": 7.9627275466918945, + "learning_rate": 9.719077281128072e-05, + "loss": 1.0246, + "step": 2803 + }, + { + "epoch": 0.18998577139372586, + "grad_norm": 8.960352897644043, + "learning_rate": 9.718940379218291e-05, + "loss": 1.0316, + "step": 2804 + }, + { + "epoch": 0.19005352666169795, + "grad_norm": 9.522171020507812, + "learning_rate": 9.718803477308509e-05, + "loss": 1.0219, + "step": 2805 + }, + { + "epoch": 0.19012128192967004, + "grad_norm": 8.412702560424805, + "learning_rate": 9.718666575398727e-05, + "loss": 1.0609, + "step": 2806 + }, + { + "epoch": 0.19018903719764213, + "grad_norm": 8.777839660644531, + "learning_rate": 9.718529673488945e-05, + "loss": 1.0854, + "step": 2807 + }, + { + "epoch": 0.1902567924656142, + "grad_norm": 8.932796478271484, + "learning_rate": 9.718392771579165e-05, + "loss": 1.1661, + "step": 2808 + }, + { + "epoch": 0.19032454773358629, + "grad_norm": 8.514800071716309, + "learning_rate": 9.718255869669383e-05, + "loss": 1.0584, + "step": 2809 + }, + { + "epoch": 0.19039230300155838, + "grad_norm": 7.001948356628418, + "learning_rate": 9.718118967759601e-05, + "loss": 0.8479, + "step": 2810 + }, + { + "epoch": 0.19046005826953047, + "grad_norm": 8.977307319641113, + "learning_rate": 9.717982065849819e-05, + "loss": 1.2556, + "step": 2811 + }, + { + "epoch": 0.19052781353750253, + "grad_norm": 8.513920783996582, + "learning_rate": 9.717845163940037e-05, + "loss": 1.2076, + "step": 2812 + }, + { + "epoch": 0.19059556880547462, + "grad_norm": 8.556622505187988, + "learning_rate": 9.717708262030256e-05, + "loss": 1.0491, + "step": 2813 + }, + { + "epoch": 0.19066332407344672, + "grad_norm": 9.8518648147583, + "learning_rate": 9.717571360120474e-05, + "loss": 1.1179, + "step": 2814 + }, + { + "epoch": 0.1907310793414188, + "grad_norm": 8.887413024902344, + "learning_rate": 9.717434458210692e-05, + "loss": 1.0675, + "step": 2815 + }, + { + "epoch": 0.19079883460939087, + "grad_norm": 7.888981342315674, + "learning_rate": 9.71729755630091e-05, + "loss": 1.0489, + "step": 2816 + }, + { + "epoch": 0.19086658987736296, + "grad_norm": 7.692848205566406, + "learning_rate": 9.71716065439113e-05, + "loss": 1.0602, + "step": 2817 + }, + { + "epoch": 0.19093434514533505, + "grad_norm": 8.591787338256836, + "learning_rate": 9.717023752481348e-05, + "loss": 0.8737, + "step": 2818 + }, + { + "epoch": 0.19100210041330715, + "grad_norm": 9.622870445251465, + "learning_rate": 9.716886850571566e-05, + "loss": 0.9717, + "step": 2819 + }, + { + "epoch": 0.1910698556812792, + "grad_norm": 7.949582576751709, + "learning_rate": 9.716749948661784e-05, + "loss": 0.8651, + "step": 2820 + }, + { + "epoch": 0.1911376109492513, + "grad_norm": 9.46272087097168, + "learning_rate": 9.716613046752002e-05, + "loss": 0.9975, + "step": 2821 + }, + { + "epoch": 0.1912053662172234, + "grad_norm": 8.86549186706543, + "learning_rate": 9.716476144842221e-05, + "loss": 1.0805, + "step": 2822 + }, + { + "epoch": 0.19127312148519549, + "grad_norm": 8.039673805236816, + "learning_rate": 9.716339242932439e-05, + "loss": 1.121, + "step": 2823 + }, + { + "epoch": 0.19134087675316755, + "grad_norm": 8.606979370117188, + "learning_rate": 9.716202341022657e-05, + "loss": 0.9217, + "step": 2824 + }, + { + "epoch": 0.19140863202113964, + "grad_norm": 10.429420471191406, + "learning_rate": 9.716065439112875e-05, + "loss": 1.2266, + "step": 2825 + }, + { + "epoch": 0.19147638728911173, + "grad_norm": 7.339411735534668, + "learning_rate": 9.715928537203093e-05, + "loss": 1.0235, + "step": 2826 + }, + { + "epoch": 0.19154414255708382, + "grad_norm": 8.247300148010254, + "learning_rate": 9.715791635293313e-05, + "loss": 0.8993, + "step": 2827 + }, + { + "epoch": 0.1916118978250559, + "grad_norm": 8.475278854370117, + "learning_rate": 9.715654733383531e-05, + "loss": 0.9611, + "step": 2828 + }, + { + "epoch": 0.19167965309302798, + "grad_norm": 9.792519569396973, + "learning_rate": 9.715517831473749e-05, + "loss": 1.0831, + "step": 2829 + }, + { + "epoch": 0.19174740836100007, + "grad_norm": 6.757070541381836, + "learning_rate": 9.715380929563967e-05, + "loss": 1.0282, + "step": 2830 + }, + { + "epoch": 0.19181516362897216, + "grad_norm": 6.707785129547119, + "learning_rate": 9.715244027654186e-05, + "loss": 0.7813, + "step": 2831 + }, + { + "epoch": 0.19188291889694423, + "grad_norm": 9.990489959716797, + "learning_rate": 9.715107125744404e-05, + "loss": 1.0504, + "step": 2832 + }, + { + "epoch": 0.19195067416491632, + "grad_norm": 10.57358169555664, + "learning_rate": 9.714970223834622e-05, + "loss": 0.9355, + "step": 2833 + }, + { + "epoch": 0.1920184294328884, + "grad_norm": 9.225300788879395, + "learning_rate": 9.714833321924842e-05, + "loss": 1.2404, + "step": 2834 + }, + { + "epoch": 0.1920861847008605, + "grad_norm": 8.36042594909668, + "learning_rate": 9.71469642001506e-05, + "loss": 0.9012, + "step": 2835 + }, + { + "epoch": 0.19215393996883257, + "grad_norm": 9.624984741210938, + "learning_rate": 9.714559518105278e-05, + "loss": 1.0509, + "step": 2836 + }, + { + "epoch": 0.19222169523680466, + "grad_norm": 12.569930076599121, + "learning_rate": 9.714422616195497e-05, + "loss": 1.2663, + "step": 2837 + }, + { + "epoch": 0.19228945050477675, + "grad_norm": 12.278885841369629, + "learning_rate": 9.714285714285715e-05, + "loss": 1.1547, + "step": 2838 + }, + { + "epoch": 0.19235720577274884, + "grad_norm": 9.3023681640625, + "learning_rate": 9.714148812375933e-05, + "loss": 1.0873, + "step": 2839 + }, + { + "epoch": 0.1924249610407209, + "grad_norm": 8.241714477539062, + "learning_rate": 9.714011910466153e-05, + "loss": 0.8222, + "step": 2840 + }, + { + "epoch": 0.192492716308693, + "grad_norm": 10.45174503326416, + "learning_rate": 9.71387500855637e-05, + "loss": 1.1974, + "step": 2841 + }, + { + "epoch": 0.1925604715766651, + "grad_norm": 8.096826553344727, + "learning_rate": 9.713738106646589e-05, + "loss": 1.1147, + "step": 2842 + }, + { + "epoch": 0.19262822684463718, + "grad_norm": 9.053191184997559, + "learning_rate": 9.713601204736807e-05, + "loss": 0.9835, + "step": 2843 + }, + { + "epoch": 0.19269598211260924, + "grad_norm": 8.050823211669922, + "learning_rate": 9.713464302827025e-05, + "loss": 1.0826, + "step": 2844 + }, + { + "epoch": 0.19276373738058133, + "grad_norm": 10.000917434692383, + "learning_rate": 9.713327400917244e-05, + "loss": 1.1723, + "step": 2845 + }, + { + "epoch": 0.19283149264855343, + "grad_norm": 8.992589950561523, + "learning_rate": 9.713190499007462e-05, + "loss": 1.0203, + "step": 2846 + }, + { + "epoch": 0.19289924791652552, + "grad_norm": 10.340285301208496, + "learning_rate": 9.71305359709768e-05, + "loss": 1.2523, + "step": 2847 + }, + { + "epoch": 0.19296700318449758, + "grad_norm": 8.250594139099121, + "learning_rate": 9.712916695187898e-05, + "loss": 1.0606, + "step": 2848 + }, + { + "epoch": 0.19303475845246967, + "grad_norm": 7.16335916519165, + "learning_rate": 9.712779793278116e-05, + "loss": 0.9988, + "step": 2849 + }, + { + "epoch": 0.19310251372044177, + "grad_norm": 9.089055061340332, + "learning_rate": 9.712642891368336e-05, + "loss": 0.9368, + "step": 2850 + }, + { + "epoch": 0.19317026898841386, + "grad_norm": 7.568434238433838, + "learning_rate": 9.712505989458554e-05, + "loss": 0.9417, + "step": 2851 + }, + { + "epoch": 0.19323802425638592, + "grad_norm": 8.470823287963867, + "learning_rate": 9.712369087548772e-05, + "loss": 1.0271, + "step": 2852 + }, + { + "epoch": 0.193305779524358, + "grad_norm": 7.746623992919922, + "learning_rate": 9.71223218563899e-05, + "loss": 0.7641, + "step": 2853 + }, + { + "epoch": 0.1933735347923301, + "grad_norm": 7.44852352142334, + "learning_rate": 9.712095283729209e-05, + "loss": 0.9984, + "step": 2854 + }, + { + "epoch": 0.1934412900603022, + "grad_norm": 7.393777847290039, + "learning_rate": 9.711958381819427e-05, + "loss": 0.8515, + "step": 2855 + }, + { + "epoch": 0.19350904532827426, + "grad_norm": 8.247236251831055, + "learning_rate": 9.711821479909645e-05, + "loss": 0.9867, + "step": 2856 + }, + { + "epoch": 0.19357680059624635, + "grad_norm": 8.484920501708984, + "learning_rate": 9.711684577999863e-05, + "loss": 0.9099, + "step": 2857 + }, + { + "epoch": 0.19364455586421844, + "grad_norm": 9.85857105255127, + "learning_rate": 9.711547676090081e-05, + "loss": 1.0134, + "step": 2858 + }, + { + "epoch": 0.19371231113219053, + "grad_norm": 12.191691398620605, + "learning_rate": 9.7114107741803e-05, + "loss": 1.15, + "step": 2859 + }, + { + "epoch": 0.19378006640016263, + "grad_norm": 9.691742897033691, + "learning_rate": 9.711273872270519e-05, + "loss": 1.2415, + "step": 2860 + }, + { + "epoch": 0.1938478216681347, + "grad_norm": 11.395289421081543, + "learning_rate": 9.711136970360737e-05, + "loss": 1.3239, + "step": 2861 + }, + { + "epoch": 0.19391557693610678, + "grad_norm": 9.222856521606445, + "learning_rate": 9.711000068450955e-05, + "loss": 0.9087, + "step": 2862 + }, + { + "epoch": 0.19398333220407887, + "grad_norm": 8.062904357910156, + "learning_rate": 9.710863166541174e-05, + "loss": 1.093, + "step": 2863 + }, + { + "epoch": 0.19405108747205096, + "grad_norm": 8.160481452941895, + "learning_rate": 9.710726264631392e-05, + "loss": 0.842, + "step": 2864 + }, + { + "epoch": 0.19411884274002303, + "grad_norm": 8.165858268737793, + "learning_rate": 9.71058936272161e-05, + "loss": 0.8807, + "step": 2865 + }, + { + "epoch": 0.19418659800799512, + "grad_norm": 8.120240211486816, + "learning_rate": 9.710452460811828e-05, + "loss": 0.9733, + "step": 2866 + }, + { + "epoch": 0.1942543532759672, + "grad_norm": 11.363536834716797, + "learning_rate": 9.710315558902046e-05, + "loss": 1.1144, + "step": 2867 + }, + { + "epoch": 0.1943221085439393, + "grad_norm": 8.776150703430176, + "learning_rate": 9.710178656992266e-05, + "loss": 0.7561, + "step": 2868 + }, + { + "epoch": 0.19438986381191137, + "grad_norm": 9.393696784973145, + "learning_rate": 9.710041755082484e-05, + "loss": 0.8092, + "step": 2869 + }, + { + "epoch": 0.19445761907988346, + "grad_norm": 10.820277214050293, + "learning_rate": 9.709904853172702e-05, + "loss": 0.9905, + "step": 2870 + }, + { + "epoch": 0.19452537434785555, + "grad_norm": 8.041844367980957, + "learning_rate": 9.70976795126292e-05, + "loss": 0.8801, + "step": 2871 + }, + { + "epoch": 0.19459312961582764, + "grad_norm": 8.05355167388916, + "learning_rate": 9.709631049353138e-05, + "loss": 0.9946, + "step": 2872 + }, + { + "epoch": 0.1946608848837997, + "grad_norm": 9.46949577331543, + "learning_rate": 9.709494147443357e-05, + "loss": 1.0582, + "step": 2873 + }, + { + "epoch": 0.1947286401517718, + "grad_norm": 8.796204566955566, + "learning_rate": 9.709357245533575e-05, + "loss": 1.1542, + "step": 2874 + }, + { + "epoch": 0.1947963954197439, + "grad_norm": 10.45006275177002, + "learning_rate": 9.709220343623793e-05, + "loss": 1.2363, + "step": 2875 + }, + { + "epoch": 0.19486415068771598, + "grad_norm": 10.490102767944336, + "learning_rate": 9.709083441714011e-05, + "loss": 1.1977, + "step": 2876 + }, + { + "epoch": 0.19493190595568805, + "grad_norm": 9.048376083374023, + "learning_rate": 9.708946539804231e-05, + "loss": 0.8127, + "step": 2877 + }, + { + "epoch": 0.19499966122366014, + "grad_norm": 6.523111343383789, + "learning_rate": 9.708809637894449e-05, + "loss": 0.8182, + "step": 2878 + }, + { + "epoch": 0.19506741649163223, + "grad_norm": 7.170145511627197, + "learning_rate": 9.708672735984667e-05, + "loss": 0.9643, + "step": 2879 + }, + { + "epoch": 0.19513517175960432, + "grad_norm": 7.9981818199157715, + "learning_rate": 9.708535834074886e-05, + "loss": 1.3179, + "step": 2880 + }, + { + "epoch": 0.19520292702757638, + "grad_norm": 8.786405563354492, + "learning_rate": 9.708398932165104e-05, + "loss": 0.9217, + "step": 2881 + }, + { + "epoch": 0.19527068229554848, + "grad_norm": 6.7907867431640625, + "learning_rate": 9.708262030255322e-05, + "loss": 1.0381, + "step": 2882 + }, + { + "epoch": 0.19533843756352057, + "grad_norm": 7.1751556396484375, + "learning_rate": 9.708125128345542e-05, + "loss": 0.8334, + "step": 2883 + }, + { + "epoch": 0.19540619283149266, + "grad_norm": 8.236610412597656, + "learning_rate": 9.70798822643576e-05, + "loss": 1.2058, + "step": 2884 + }, + { + "epoch": 0.19547394809946472, + "grad_norm": 7.576273441314697, + "learning_rate": 9.707851324525978e-05, + "loss": 1.091, + "step": 2885 + }, + { + "epoch": 0.19554170336743681, + "grad_norm": 8.698029518127441, + "learning_rate": 9.707714422616197e-05, + "loss": 1.0262, + "step": 2886 + }, + { + "epoch": 0.1956094586354089, + "grad_norm": 8.583345413208008, + "learning_rate": 9.707577520706415e-05, + "loss": 1.0401, + "step": 2887 + }, + { + "epoch": 0.195677213903381, + "grad_norm": 7.242405891418457, + "learning_rate": 9.707440618796633e-05, + "loss": 0.785, + "step": 2888 + }, + { + "epoch": 0.19574496917135306, + "grad_norm": 8.4541654586792, + "learning_rate": 9.707303716886851e-05, + "loss": 1.0537, + "step": 2889 + }, + { + "epoch": 0.19581272443932515, + "grad_norm": 7.838657855987549, + "learning_rate": 9.707166814977069e-05, + "loss": 1.0062, + "step": 2890 + }, + { + "epoch": 0.19588047970729724, + "grad_norm": 6.812248229980469, + "learning_rate": 9.707029913067289e-05, + "loss": 0.885, + "step": 2891 + }, + { + "epoch": 0.19594823497526934, + "grad_norm": 8.789966583251953, + "learning_rate": 9.706893011157507e-05, + "loss": 1.0366, + "step": 2892 + }, + { + "epoch": 0.1960159902432414, + "grad_norm": 8.860052108764648, + "learning_rate": 9.706756109247725e-05, + "loss": 1.2247, + "step": 2893 + }, + { + "epoch": 0.1960837455112135, + "grad_norm": 11.235320091247559, + "learning_rate": 9.706619207337943e-05, + "loss": 1.1216, + "step": 2894 + }, + { + "epoch": 0.19615150077918558, + "grad_norm": 8.865259170532227, + "learning_rate": 9.706482305428162e-05, + "loss": 0.9467, + "step": 2895 + }, + { + "epoch": 0.19621925604715768, + "grad_norm": 8.345112800598145, + "learning_rate": 9.70634540351838e-05, + "loss": 0.9849, + "step": 2896 + }, + { + "epoch": 0.19628701131512974, + "grad_norm": 8.98128604888916, + "learning_rate": 9.706208501608598e-05, + "loss": 1.1421, + "step": 2897 + }, + { + "epoch": 0.19635476658310183, + "grad_norm": 7.786384582519531, + "learning_rate": 9.706071599698816e-05, + "loss": 1.0926, + "step": 2898 + }, + { + "epoch": 0.19642252185107392, + "grad_norm": 8.816730499267578, + "learning_rate": 9.705934697789034e-05, + "loss": 1.2626, + "step": 2899 + }, + { + "epoch": 0.19649027711904601, + "grad_norm": 7.831095218658447, + "learning_rate": 9.705797795879254e-05, + "loss": 0.9355, + "step": 2900 + }, + { + "epoch": 0.19655803238701808, + "grad_norm": 8.788371086120605, + "learning_rate": 9.705660893969472e-05, + "loss": 0.8883, + "step": 2901 + }, + { + "epoch": 0.19662578765499017, + "grad_norm": 8.88425064086914, + "learning_rate": 9.70552399205969e-05, + "loss": 1.1169, + "step": 2902 + }, + { + "epoch": 0.19669354292296226, + "grad_norm": 9.648268699645996, + "learning_rate": 9.705387090149908e-05, + "loss": 1.0628, + "step": 2903 + }, + { + "epoch": 0.19676129819093435, + "grad_norm": 8.19676685333252, + "learning_rate": 9.705250188240126e-05, + "loss": 0.9395, + "step": 2904 + }, + { + "epoch": 0.19682905345890642, + "grad_norm": 7.420725345611572, + "learning_rate": 9.705113286330345e-05, + "loss": 0.947, + "step": 2905 + }, + { + "epoch": 0.1968968087268785, + "grad_norm": 7.221796989440918, + "learning_rate": 9.704976384420563e-05, + "loss": 0.8491, + "step": 2906 + }, + { + "epoch": 0.1969645639948506, + "grad_norm": 9.932676315307617, + "learning_rate": 9.704839482510781e-05, + "loss": 1.117, + "step": 2907 + }, + { + "epoch": 0.1970323192628227, + "grad_norm": 11.816266059875488, + "learning_rate": 9.704702580600999e-05, + "loss": 1.4347, + "step": 2908 + }, + { + "epoch": 0.19710007453079476, + "grad_norm": 8.804407119750977, + "learning_rate": 9.704565678691219e-05, + "loss": 0.808, + "step": 2909 + }, + { + "epoch": 0.19716782979876685, + "grad_norm": 7.740353584289551, + "learning_rate": 9.704428776781437e-05, + "loss": 1.0732, + "step": 2910 + }, + { + "epoch": 0.19723558506673894, + "grad_norm": 12.040196418762207, + "learning_rate": 9.704291874871655e-05, + "loss": 0.947, + "step": 2911 + }, + { + "epoch": 0.19730334033471103, + "grad_norm": 7.727171421051025, + "learning_rate": 9.704154972961873e-05, + "loss": 0.8211, + "step": 2912 + }, + { + "epoch": 0.19737109560268312, + "grad_norm": 8.779428482055664, + "learning_rate": 9.704018071052091e-05, + "loss": 1.2204, + "step": 2913 + }, + { + "epoch": 0.19743885087065519, + "grad_norm": 7.907576084136963, + "learning_rate": 9.70388116914231e-05, + "loss": 1.0594, + "step": 2914 + }, + { + "epoch": 0.19750660613862728, + "grad_norm": 6.769292831420898, + "learning_rate": 9.703744267232528e-05, + "loss": 0.8743, + "step": 2915 + }, + { + "epoch": 0.19757436140659937, + "grad_norm": 8.966355323791504, + "learning_rate": 9.703607365322746e-05, + "loss": 1.1607, + "step": 2916 + }, + { + "epoch": 0.19764211667457146, + "grad_norm": 7.389810085296631, + "learning_rate": 9.703470463412964e-05, + "loss": 0.9728, + "step": 2917 + }, + { + "epoch": 0.19770987194254352, + "grad_norm": 10.877386093139648, + "learning_rate": 9.703333561503184e-05, + "loss": 1.0623, + "step": 2918 + }, + { + "epoch": 0.19777762721051562, + "grad_norm": 9.160116195678711, + "learning_rate": 9.703196659593402e-05, + "loss": 1.1374, + "step": 2919 + }, + { + "epoch": 0.1978453824784877, + "grad_norm": 7.6289167404174805, + "learning_rate": 9.70305975768362e-05, + "loss": 0.9498, + "step": 2920 + }, + { + "epoch": 0.1979131377464598, + "grad_norm": 8.663583755493164, + "learning_rate": 9.702922855773838e-05, + "loss": 1.1595, + "step": 2921 + }, + { + "epoch": 0.19798089301443186, + "grad_norm": 7.595486640930176, + "learning_rate": 9.702785953864056e-05, + "loss": 1.0623, + "step": 2922 + }, + { + "epoch": 0.19804864828240396, + "grad_norm": 8.680171966552734, + "learning_rate": 9.702649051954275e-05, + "loss": 1.0138, + "step": 2923 + }, + { + "epoch": 0.19811640355037605, + "grad_norm": 9.998015403747559, + "learning_rate": 9.702512150044493e-05, + "loss": 1.2006, + "step": 2924 + }, + { + "epoch": 0.19818415881834814, + "grad_norm": 7.801429271697998, + "learning_rate": 9.702375248134711e-05, + "loss": 0.8467, + "step": 2925 + }, + { + "epoch": 0.1982519140863202, + "grad_norm": 8.570688247680664, + "learning_rate": 9.70223834622493e-05, + "loss": 1.1439, + "step": 2926 + }, + { + "epoch": 0.1983196693542923, + "grad_norm": 7.835936069488525, + "learning_rate": 9.702101444315149e-05, + "loss": 0.9671, + "step": 2927 + }, + { + "epoch": 0.19838742462226439, + "grad_norm": 8.415708541870117, + "learning_rate": 9.701964542405367e-05, + "loss": 1.0381, + "step": 2928 + }, + { + "epoch": 0.19845517989023648, + "grad_norm": 10.370524406433105, + "learning_rate": 9.701827640495586e-05, + "loss": 1.151, + "step": 2929 + }, + { + "epoch": 0.19852293515820854, + "grad_norm": 8.228797912597656, + "learning_rate": 9.701690738585804e-05, + "loss": 1.093, + "step": 2930 + }, + { + "epoch": 0.19859069042618063, + "grad_norm": 8.070756912231445, + "learning_rate": 9.701553836676022e-05, + "loss": 1.0272, + "step": 2931 + }, + { + "epoch": 0.19865844569415272, + "grad_norm": 9.19532585144043, + "learning_rate": 9.701416934766242e-05, + "loss": 1.1195, + "step": 2932 + }, + { + "epoch": 0.19872620096212482, + "grad_norm": 10.692606925964355, + "learning_rate": 9.70128003285646e-05, + "loss": 1.3148, + "step": 2933 + }, + { + "epoch": 0.19879395623009688, + "grad_norm": 10.058424949645996, + "learning_rate": 9.701143130946678e-05, + "loss": 1.4236, + "step": 2934 + }, + { + "epoch": 0.19886171149806897, + "grad_norm": 6.197395324707031, + "learning_rate": 9.701006229036896e-05, + "loss": 0.8822, + "step": 2935 + }, + { + "epoch": 0.19892946676604106, + "grad_norm": 7.603270053863525, + "learning_rate": 9.700869327127114e-05, + "loss": 0.9655, + "step": 2936 + }, + { + "epoch": 0.19899722203401315, + "grad_norm": 8.192676544189453, + "learning_rate": 9.700732425217333e-05, + "loss": 0.9668, + "step": 2937 + }, + { + "epoch": 0.19906497730198522, + "grad_norm": 7.121623992919922, + "learning_rate": 9.700595523307551e-05, + "loss": 1.0037, + "step": 2938 + }, + { + "epoch": 0.1991327325699573, + "grad_norm": 6.8974127769470215, + "learning_rate": 9.700458621397769e-05, + "loss": 0.876, + "step": 2939 + }, + { + "epoch": 0.1992004878379294, + "grad_norm": 7.590656757354736, + "learning_rate": 9.700321719487987e-05, + "loss": 1.0346, + "step": 2940 + }, + { + "epoch": 0.1992682431059015, + "grad_norm": 8.530266761779785, + "learning_rate": 9.700184817578207e-05, + "loss": 1.0882, + "step": 2941 + }, + { + "epoch": 0.19933599837387356, + "grad_norm": 8.064129829406738, + "learning_rate": 9.700047915668425e-05, + "loss": 0.9949, + "step": 2942 + }, + { + "epoch": 0.19940375364184565, + "grad_norm": 7.23117208480835, + "learning_rate": 9.699911013758643e-05, + "loss": 0.7269, + "step": 2943 + }, + { + "epoch": 0.19947150890981774, + "grad_norm": 7.326268196105957, + "learning_rate": 9.69977411184886e-05, + "loss": 0.8542, + "step": 2944 + }, + { + "epoch": 0.19953926417778983, + "grad_norm": 7.708505153656006, + "learning_rate": 9.699637209939079e-05, + "loss": 0.8206, + "step": 2945 + }, + { + "epoch": 0.1996070194457619, + "grad_norm": 10.134513854980469, + "learning_rate": 9.699500308029298e-05, + "loss": 1.0079, + "step": 2946 + }, + { + "epoch": 0.199674774713734, + "grad_norm": 9.264663696289062, + "learning_rate": 9.699363406119516e-05, + "loss": 1.1702, + "step": 2947 + }, + { + "epoch": 0.19974252998170608, + "grad_norm": 8.894827842712402, + "learning_rate": 9.699226504209734e-05, + "loss": 0.9184, + "step": 2948 + }, + { + "epoch": 0.19981028524967817, + "grad_norm": 6.935434341430664, + "learning_rate": 9.699089602299952e-05, + "loss": 0.8554, + "step": 2949 + }, + { + "epoch": 0.19987804051765024, + "grad_norm": 9.57607364654541, + "learning_rate": 9.698952700390172e-05, + "loss": 1.0937, + "step": 2950 + }, + { + "epoch": 0.19994579578562233, + "grad_norm": 7.99752140045166, + "learning_rate": 9.69881579848039e-05, + "loss": 1.0797, + "step": 2951 + }, + { + "epoch": 0.20001355105359442, + "grad_norm": 8.067659378051758, + "learning_rate": 9.698678896570608e-05, + "loss": 1.0515, + "step": 2952 + }, + { + "epoch": 0.2000813063215665, + "grad_norm": 9.66697883605957, + "learning_rate": 9.698541994660826e-05, + "loss": 1.289, + "step": 2953 + }, + { + "epoch": 0.20014906158953857, + "grad_norm": 7.3660664558410645, + "learning_rate": 9.698405092751044e-05, + "loss": 0.7409, + "step": 2954 + }, + { + "epoch": 0.20021681685751067, + "grad_norm": 8.254073143005371, + "learning_rate": 9.698268190841263e-05, + "loss": 1.1095, + "step": 2955 + }, + { + "epoch": 0.20028457212548276, + "grad_norm": 8.221102714538574, + "learning_rate": 9.698131288931481e-05, + "loss": 1.0083, + "step": 2956 + }, + { + "epoch": 0.20035232739345485, + "grad_norm": 9.813411712646484, + "learning_rate": 9.697994387021699e-05, + "loss": 1.2888, + "step": 2957 + }, + { + "epoch": 0.2004200826614269, + "grad_norm": 10.87628173828125, + "learning_rate": 9.697857485111917e-05, + "loss": 1.16, + "step": 2958 + }, + { + "epoch": 0.200487837929399, + "grad_norm": 7.094732284545898, + "learning_rate": 9.697720583202135e-05, + "loss": 0.8959, + "step": 2959 + }, + { + "epoch": 0.2005555931973711, + "grad_norm": 9.814677238464355, + "learning_rate": 9.697583681292355e-05, + "loss": 1.1011, + "step": 2960 + }, + { + "epoch": 0.2006233484653432, + "grad_norm": 8.982966423034668, + "learning_rate": 9.697446779382573e-05, + "loss": 1.001, + "step": 2961 + }, + { + "epoch": 0.20069110373331525, + "grad_norm": 8.49453067779541, + "learning_rate": 9.697309877472791e-05, + "loss": 0.9848, + "step": 2962 + }, + { + "epoch": 0.20075885900128734, + "grad_norm": 7.239814758300781, + "learning_rate": 9.697172975563009e-05, + "loss": 0.8252, + "step": 2963 + }, + { + "epoch": 0.20082661426925943, + "grad_norm": 10.874746322631836, + "learning_rate": 9.697036073653228e-05, + "loss": 1.0683, + "step": 2964 + }, + { + "epoch": 0.20089436953723153, + "grad_norm": 7.679197311401367, + "learning_rate": 9.696899171743446e-05, + "loss": 1.2797, + "step": 2965 + }, + { + "epoch": 0.20096212480520362, + "grad_norm": 10.089177131652832, + "learning_rate": 9.696762269833664e-05, + "loss": 1.1905, + "step": 2966 + }, + { + "epoch": 0.20102988007317568, + "grad_norm": 8.182350158691406, + "learning_rate": 9.696625367923882e-05, + "loss": 0.9917, + "step": 2967 + }, + { + "epoch": 0.20109763534114777, + "grad_norm": 7.8756256103515625, + "learning_rate": 9.6964884660141e-05, + "loss": 1.0368, + "step": 2968 + }, + { + "epoch": 0.20116539060911987, + "grad_norm": 9.193910598754883, + "learning_rate": 9.69635156410432e-05, + "loss": 1.094, + "step": 2969 + }, + { + "epoch": 0.20123314587709196, + "grad_norm": 8.854869842529297, + "learning_rate": 9.696214662194538e-05, + "loss": 1.1344, + "step": 2970 + }, + { + "epoch": 0.20130090114506402, + "grad_norm": 10.419108390808105, + "learning_rate": 9.696077760284756e-05, + "loss": 0.9937, + "step": 2971 + }, + { + "epoch": 0.2013686564130361, + "grad_norm": 9.329347610473633, + "learning_rate": 9.695940858374975e-05, + "loss": 0.9454, + "step": 2972 + }, + { + "epoch": 0.2014364116810082, + "grad_norm": 8.23154067993164, + "learning_rate": 9.695803956465193e-05, + "loss": 0.8691, + "step": 2973 + }, + { + "epoch": 0.2015041669489803, + "grad_norm": 8.016939163208008, + "learning_rate": 9.695667054555411e-05, + "loss": 0.9182, + "step": 2974 + }, + { + "epoch": 0.20157192221695236, + "grad_norm": 9.717400550842285, + "learning_rate": 9.69553015264563e-05, + "loss": 1.1251, + "step": 2975 + }, + { + "epoch": 0.20163967748492445, + "grad_norm": 10.470111846923828, + "learning_rate": 9.695393250735849e-05, + "loss": 1.1634, + "step": 2976 + }, + { + "epoch": 0.20170743275289654, + "grad_norm": 8.540326118469238, + "learning_rate": 9.695256348826067e-05, + "loss": 0.9229, + "step": 2977 + }, + { + "epoch": 0.20177518802086863, + "grad_norm": 6.997597694396973, + "learning_rate": 9.695119446916286e-05, + "loss": 0.9911, + "step": 2978 + }, + { + "epoch": 0.2018429432888407, + "grad_norm": 7.23951530456543, + "learning_rate": 9.694982545006504e-05, + "loss": 0.8372, + "step": 2979 + }, + { + "epoch": 0.2019106985568128, + "grad_norm": 9.217951774597168, + "learning_rate": 9.694845643096722e-05, + "loss": 0.8154, + "step": 2980 + }, + { + "epoch": 0.20197845382478488, + "grad_norm": 8.128033638000488, + "learning_rate": 9.69470874118694e-05, + "loss": 0.9494, + "step": 2981 + }, + { + "epoch": 0.20204620909275697, + "grad_norm": 8.328935623168945, + "learning_rate": 9.694571839277158e-05, + "loss": 1.0537, + "step": 2982 + }, + { + "epoch": 0.20211396436072904, + "grad_norm": 8.877389907836914, + "learning_rate": 9.694434937367378e-05, + "loss": 0.8683, + "step": 2983 + }, + { + "epoch": 0.20218171962870113, + "grad_norm": 7.285436630249023, + "learning_rate": 9.694298035457596e-05, + "loss": 0.7514, + "step": 2984 + }, + { + "epoch": 0.20224947489667322, + "grad_norm": 9.209798812866211, + "learning_rate": 9.694161133547814e-05, + "loss": 1.0771, + "step": 2985 + }, + { + "epoch": 0.2023172301646453, + "grad_norm": 8.625777244567871, + "learning_rate": 9.694024231638032e-05, + "loss": 1.0085, + "step": 2986 + }, + { + "epoch": 0.20238498543261738, + "grad_norm": 8.50123405456543, + "learning_rate": 9.693887329728251e-05, + "loss": 0.969, + "step": 2987 + }, + { + "epoch": 0.20245274070058947, + "grad_norm": 7.314642429351807, + "learning_rate": 9.693750427818469e-05, + "loss": 0.8075, + "step": 2988 + }, + { + "epoch": 0.20252049596856156, + "grad_norm": 9.474241256713867, + "learning_rate": 9.693613525908687e-05, + "loss": 1.0989, + "step": 2989 + }, + { + "epoch": 0.20258825123653365, + "grad_norm": 7.3510637283325195, + "learning_rate": 9.693476623998905e-05, + "loss": 0.8044, + "step": 2990 + }, + { + "epoch": 0.20265600650450571, + "grad_norm": 7.203106880187988, + "learning_rate": 9.693339722089123e-05, + "loss": 1.0359, + "step": 2991 + }, + { + "epoch": 0.2027237617724778, + "grad_norm": 6.71024227142334, + "learning_rate": 9.693202820179343e-05, + "loss": 0.8687, + "step": 2992 + }, + { + "epoch": 0.2027915170404499, + "grad_norm": 8.327759742736816, + "learning_rate": 9.69306591826956e-05, + "loss": 1.0042, + "step": 2993 + }, + { + "epoch": 0.202859272308422, + "grad_norm": 8.682476997375488, + "learning_rate": 9.692929016359779e-05, + "loss": 1.0273, + "step": 2994 + }, + { + "epoch": 0.20292702757639405, + "grad_norm": 8.555792808532715, + "learning_rate": 9.692792114449997e-05, + "loss": 0.8712, + "step": 2995 + }, + { + "epoch": 0.20299478284436615, + "grad_norm": 11.036639213562012, + "learning_rate": 9.692655212540216e-05, + "loss": 1.1452, + "step": 2996 + }, + { + "epoch": 0.20306253811233824, + "grad_norm": 10.207952499389648, + "learning_rate": 9.692518310630434e-05, + "loss": 1.2267, + "step": 2997 + }, + { + "epoch": 0.20313029338031033, + "grad_norm": 8.715107917785645, + "learning_rate": 9.692381408720652e-05, + "loss": 1.151, + "step": 2998 + }, + { + "epoch": 0.2031980486482824, + "grad_norm": 8.72461986541748, + "learning_rate": 9.69224450681087e-05, + "loss": 0.9787, + "step": 2999 + }, + { + "epoch": 0.20326580391625448, + "grad_norm": 8.717243194580078, + "learning_rate": 9.692107604901088e-05, + "loss": 1.0734, + "step": 3000 + }, + { + "epoch": 0.20333355918422658, + "grad_norm": 7.039597511291504, + "learning_rate": 9.691970702991308e-05, + "loss": 0.9215, + "step": 3001 + }, + { + "epoch": 0.20340131445219867, + "grad_norm": 10.568238258361816, + "learning_rate": 9.691833801081526e-05, + "loss": 1.2248, + "step": 3002 + }, + { + "epoch": 0.20346906972017073, + "grad_norm": 9.515549659729004, + "learning_rate": 9.691696899171744e-05, + "loss": 1.1401, + "step": 3003 + }, + { + "epoch": 0.20353682498814282, + "grad_norm": 9.650483131408691, + "learning_rate": 9.691559997261962e-05, + "loss": 0.9839, + "step": 3004 + }, + { + "epoch": 0.20360458025611491, + "grad_norm": 6.817119598388672, + "learning_rate": 9.691423095352181e-05, + "loss": 0.8542, + "step": 3005 + }, + { + "epoch": 0.203672335524087, + "grad_norm": 7.616591930389404, + "learning_rate": 9.691286193442399e-05, + "loss": 0.9647, + "step": 3006 + }, + { + "epoch": 0.20374009079205907, + "grad_norm": 7.2600274085998535, + "learning_rate": 9.691149291532617e-05, + "loss": 0.962, + "step": 3007 + }, + { + "epoch": 0.20380784606003116, + "grad_norm": 9.714008331298828, + "learning_rate": 9.691012389622835e-05, + "loss": 1.1567, + "step": 3008 + }, + { + "epoch": 0.20387560132800325, + "grad_norm": 9.095394134521484, + "learning_rate": 9.690875487713053e-05, + "loss": 1.1315, + "step": 3009 + }, + { + "epoch": 0.20394335659597534, + "grad_norm": 9.683954238891602, + "learning_rate": 9.690738585803273e-05, + "loss": 1.4039, + "step": 3010 + }, + { + "epoch": 0.2040111118639474, + "grad_norm": 8.829015731811523, + "learning_rate": 9.69060168389349e-05, + "loss": 0.9176, + "step": 3011 + }, + { + "epoch": 0.2040788671319195, + "grad_norm": 7.835269927978516, + "learning_rate": 9.690464781983709e-05, + "loss": 1.0077, + "step": 3012 + }, + { + "epoch": 0.2041466223998916, + "grad_norm": 9.94642448425293, + "learning_rate": 9.690327880073927e-05, + "loss": 1.0815, + "step": 3013 + }, + { + "epoch": 0.20421437766786368, + "grad_norm": 8.184757232666016, + "learning_rate": 9.690190978164145e-05, + "loss": 0.84, + "step": 3014 + }, + { + "epoch": 0.20428213293583575, + "grad_norm": 9.060220718383789, + "learning_rate": 9.690054076254364e-05, + "loss": 0.907, + "step": 3015 + }, + { + "epoch": 0.20434988820380784, + "grad_norm": 6.848534107208252, + "learning_rate": 9.689917174344582e-05, + "loss": 0.7549, + "step": 3016 + }, + { + "epoch": 0.20441764347177993, + "grad_norm": 7.820966720581055, + "learning_rate": 9.6897802724348e-05, + "loss": 0.9899, + "step": 3017 + }, + { + "epoch": 0.20448539873975202, + "grad_norm": 10.148963928222656, + "learning_rate": 9.68964337052502e-05, + "loss": 1.2403, + "step": 3018 + }, + { + "epoch": 0.20455315400772411, + "grad_norm": 8.273184776306152, + "learning_rate": 9.689506468615238e-05, + "loss": 1.0002, + "step": 3019 + }, + { + "epoch": 0.20462090927569618, + "grad_norm": 7.618801593780518, + "learning_rate": 9.689369566705456e-05, + "loss": 0.8864, + "step": 3020 + }, + { + "epoch": 0.20468866454366827, + "grad_norm": 7.952611446380615, + "learning_rate": 9.689232664795675e-05, + "loss": 0.8002, + "step": 3021 + }, + { + "epoch": 0.20475641981164036, + "grad_norm": 9.938977241516113, + "learning_rate": 9.689095762885893e-05, + "loss": 0.8956, + "step": 3022 + }, + { + "epoch": 0.20482417507961245, + "grad_norm": 7.807236194610596, + "learning_rate": 9.688958860976111e-05, + "loss": 1.1777, + "step": 3023 + }, + { + "epoch": 0.20489193034758452, + "grad_norm": 7.7249369621276855, + "learning_rate": 9.68882195906633e-05, + "loss": 0.8881, + "step": 3024 + }, + { + "epoch": 0.2049596856155566, + "grad_norm": 7.747461795806885, + "learning_rate": 9.688685057156549e-05, + "loss": 0.9844, + "step": 3025 + }, + { + "epoch": 0.2050274408835287, + "grad_norm": 10.017412185668945, + "learning_rate": 9.688548155246767e-05, + "loss": 1.0815, + "step": 3026 + }, + { + "epoch": 0.2050951961515008, + "grad_norm": 6.54990816116333, + "learning_rate": 9.688411253336985e-05, + "loss": 0.9319, + "step": 3027 + }, + { + "epoch": 0.20516295141947286, + "grad_norm": 7.358734130859375, + "learning_rate": 9.688274351427204e-05, + "loss": 0.9313, + "step": 3028 + }, + { + "epoch": 0.20523070668744495, + "grad_norm": 7.608468055725098, + "learning_rate": 9.688137449517422e-05, + "loss": 1.0071, + "step": 3029 + }, + { + "epoch": 0.20529846195541704, + "grad_norm": 7.013155937194824, + "learning_rate": 9.68800054760764e-05, + "loss": 0.9419, + "step": 3030 + }, + { + "epoch": 0.20536621722338913, + "grad_norm": 9.907796859741211, + "learning_rate": 9.687863645697858e-05, + "loss": 1.2082, + "step": 3031 + }, + { + "epoch": 0.2054339724913612, + "grad_norm": 9.059138298034668, + "learning_rate": 9.687726743788076e-05, + "loss": 1.0712, + "step": 3032 + }, + { + "epoch": 0.20550172775933329, + "grad_norm": 10.519928932189941, + "learning_rate": 9.687589841878296e-05, + "loss": 1.1627, + "step": 3033 + }, + { + "epoch": 0.20556948302730538, + "grad_norm": 8.381184577941895, + "learning_rate": 9.687452939968514e-05, + "loss": 1.087, + "step": 3034 + }, + { + "epoch": 0.20563723829527747, + "grad_norm": 6.514460563659668, + "learning_rate": 9.687316038058732e-05, + "loss": 0.8657, + "step": 3035 + }, + { + "epoch": 0.20570499356324953, + "grad_norm": 10.034708023071289, + "learning_rate": 9.68717913614895e-05, + "loss": 1.1529, + "step": 3036 + }, + { + "epoch": 0.20577274883122162, + "grad_norm": 7.202263355255127, + "learning_rate": 9.687042234239168e-05, + "loss": 0.7923, + "step": 3037 + }, + { + "epoch": 0.20584050409919372, + "grad_norm": 9.7435302734375, + "learning_rate": 9.686905332329387e-05, + "loss": 1.5071, + "step": 3038 + }, + { + "epoch": 0.2059082593671658, + "grad_norm": 9.57016658782959, + "learning_rate": 9.686768430419605e-05, + "loss": 0.883, + "step": 3039 + }, + { + "epoch": 0.20597601463513787, + "grad_norm": 7.3575029373168945, + "learning_rate": 9.686631528509823e-05, + "loss": 0.9999, + "step": 3040 + }, + { + "epoch": 0.20604376990310996, + "grad_norm": 9.0224027633667, + "learning_rate": 9.686494626600041e-05, + "loss": 1.0121, + "step": 3041 + }, + { + "epoch": 0.20611152517108206, + "grad_norm": 11.173224449157715, + "learning_rate": 9.68635772469026e-05, + "loss": 1.0556, + "step": 3042 + }, + { + "epoch": 0.20617928043905415, + "grad_norm": 8.858287811279297, + "learning_rate": 9.686220822780479e-05, + "loss": 0.9377, + "step": 3043 + }, + { + "epoch": 0.2062470357070262, + "grad_norm": 7.3096795082092285, + "learning_rate": 9.686083920870697e-05, + "loss": 0.8808, + "step": 3044 + }, + { + "epoch": 0.2063147909749983, + "grad_norm": 8.700214385986328, + "learning_rate": 9.685947018960915e-05, + "loss": 1.2121, + "step": 3045 + }, + { + "epoch": 0.2063825462429704, + "grad_norm": 10.57944107055664, + "learning_rate": 9.685810117051133e-05, + "loss": 1.0931, + "step": 3046 + }, + { + "epoch": 0.20645030151094249, + "grad_norm": 8.765487670898438, + "learning_rate": 9.685673215141352e-05, + "loss": 1.0522, + "step": 3047 + }, + { + "epoch": 0.20651805677891455, + "grad_norm": 7.717139720916748, + "learning_rate": 9.68553631323157e-05, + "loss": 0.9492, + "step": 3048 + }, + { + "epoch": 0.20658581204688664, + "grad_norm": 9.301026344299316, + "learning_rate": 9.685399411321788e-05, + "loss": 1.0191, + "step": 3049 + }, + { + "epoch": 0.20665356731485873, + "grad_norm": 10.251668930053711, + "learning_rate": 9.685262509412006e-05, + "loss": 0.991, + "step": 3050 + }, + { + "epoch": 0.20672132258283082, + "grad_norm": 11.597551345825195, + "learning_rate": 9.685125607502226e-05, + "loss": 1.0169, + "step": 3051 + }, + { + "epoch": 0.2067890778508029, + "grad_norm": 10.293901443481445, + "learning_rate": 9.684988705592444e-05, + "loss": 1.0623, + "step": 3052 + }, + { + "epoch": 0.20685683311877498, + "grad_norm": 8.496854782104492, + "learning_rate": 9.684851803682662e-05, + "loss": 1.0466, + "step": 3053 + }, + { + "epoch": 0.20692458838674707, + "grad_norm": 6.76383638381958, + "learning_rate": 9.68471490177288e-05, + "loss": 0.8155, + "step": 3054 + }, + { + "epoch": 0.20699234365471916, + "grad_norm": 8.168519020080566, + "learning_rate": 9.684577999863098e-05, + "loss": 0.9663, + "step": 3055 + }, + { + "epoch": 0.20706009892269123, + "grad_norm": 8.0086030960083, + "learning_rate": 9.684441097953317e-05, + "loss": 0.889, + "step": 3056 + }, + { + "epoch": 0.20712785419066332, + "grad_norm": 8.347359657287598, + "learning_rate": 9.684304196043535e-05, + "loss": 0.938, + "step": 3057 + }, + { + "epoch": 0.2071956094586354, + "grad_norm": 8.0283203125, + "learning_rate": 9.684167294133753e-05, + "loss": 0.7633, + "step": 3058 + }, + { + "epoch": 0.2072633647266075, + "grad_norm": 8.509317398071289, + "learning_rate": 9.684030392223971e-05, + "loss": 1.1052, + "step": 3059 + }, + { + "epoch": 0.20733111999457957, + "grad_norm": 9.441505432128906, + "learning_rate": 9.683893490314189e-05, + "loss": 1.1177, + "step": 3060 + }, + { + "epoch": 0.20739887526255166, + "grad_norm": 8.131098747253418, + "learning_rate": 9.683756588404409e-05, + "loss": 0.9272, + "step": 3061 + }, + { + "epoch": 0.20746663053052375, + "grad_norm": 8.423643112182617, + "learning_rate": 9.683619686494627e-05, + "loss": 0.875, + "step": 3062 + }, + { + "epoch": 0.20753438579849584, + "grad_norm": 7.513223171234131, + "learning_rate": 9.683482784584845e-05, + "loss": 1.0659, + "step": 3063 + }, + { + "epoch": 0.2076021410664679, + "grad_norm": 7.5881171226501465, + "learning_rate": 9.683345882675063e-05, + "loss": 0.9466, + "step": 3064 + }, + { + "epoch": 0.20766989633444, + "grad_norm": 8.717775344848633, + "learning_rate": 9.683208980765282e-05, + "loss": 1.1019, + "step": 3065 + }, + { + "epoch": 0.2077376516024121, + "grad_norm": 11.917694091796875, + "learning_rate": 9.6830720788555e-05, + "loss": 1.1085, + "step": 3066 + }, + { + "epoch": 0.20780540687038418, + "grad_norm": 9.28741455078125, + "learning_rate": 9.682935176945718e-05, + "loss": 1.1458, + "step": 3067 + }, + { + "epoch": 0.20787316213835624, + "grad_norm": 6.746860980987549, + "learning_rate": 9.682798275035938e-05, + "loss": 0.929, + "step": 3068 + }, + { + "epoch": 0.20794091740632833, + "grad_norm": 8.48763370513916, + "learning_rate": 9.682661373126156e-05, + "loss": 1.0702, + "step": 3069 + }, + { + "epoch": 0.20800867267430043, + "grad_norm": 8.871308326721191, + "learning_rate": 9.682524471216374e-05, + "loss": 0.9199, + "step": 3070 + }, + { + "epoch": 0.20807642794227252, + "grad_norm": 8.275801658630371, + "learning_rate": 9.682387569306593e-05, + "loss": 1.0756, + "step": 3071 + }, + { + "epoch": 0.2081441832102446, + "grad_norm": 8.985222816467285, + "learning_rate": 9.682250667396811e-05, + "loss": 1.2364, + "step": 3072 + }, + { + "epoch": 0.20821193847821667, + "grad_norm": 7.749682426452637, + "learning_rate": 9.682113765487029e-05, + "loss": 0.9444, + "step": 3073 + }, + { + "epoch": 0.20827969374618877, + "grad_norm": 9.111614227294922, + "learning_rate": 9.681976863577248e-05, + "loss": 1.338, + "step": 3074 + }, + { + "epoch": 0.20834744901416086, + "grad_norm": 7.2874674797058105, + "learning_rate": 9.681839961667466e-05, + "loss": 0.838, + "step": 3075 + }, + { + "epoch": 0.20841520428213295, + "grad_norm": 6.873099327087402, + "learning_rate": 9.681703059757685e-05, + "loss": 0.9194, + "step": 3076 + }, + { + "epoch": 0.208482959550105, + "grad_norm": 7.564418792724609, + "learning_rate": 9.681566157847903e-05, + "loss": 1.1209, + "step": 3077 + }, + { + "epoch": 0.2085507148180771, + "grad_norm": 9.088560104370117, + "learning_rate": 9.68142925593812e-05, + "loss": 0.8425, + "step": 3078 + }, + { + "epoch": 0.2086184700860492, + "grad_norm": 7.182369709014893, + "learning_rate": 9.68129235402834e-05, + "loss": 1.0938, + "step": 3079 + }, + { + "epoch": 0.2086862253540213, + "grad_norm": 8.853677749633789, + "learning_rate": 9.681155452118558e-05, + "loss": 1.2611, + "step": 3080 + }, + { + "epoch": 0.20875398062199335, + "grad_norm": 8.56440258026123, + "learning_rate": 9.681018550208776e-05, + "loss": 1.1112, + "step": 3081 + }, + { + "epoch": 0.20882173588996544, + "grad_norm": 8.356021881103516, + "learning_rate": 9.680881648298994e-05, + "loss": 1.0044, + "step": 3082 + }, + { + "epoch": 0.20888949115793753, + "grad_norm": 9.083736419677734, + "learning_rate": 9.680744746389213e-05, + "loss": 1.2065, + "step": 3083 + }, + { + "epoch": 0.20895724642590963, + "grad_norm": 7.990222454071045, + "learning_rate": 9.680607844479432e-05, + "loss": 1.0179, + "step": 3084 + }, + { + "epoch": 0.2090250016938817, + "grad_norm": 8.381364822387695, + "learning_rate": 9.68047094256965e-05, + "loss": 0.8392, + "step": 3085 + }, + { + "epoch": 0.20909275696185378, + "grad_norm": 9.017950057983398, + "learning_rate": 9.680334040659868e-05, + "loss": 0.9757, + "step": 3086 + }, + { + "epoch": 0.20916051222982587, + "grad_norm": 8.525566101074219, + "learning_rate": 9.680197138750086e-05, + "loss": 1.1521, + "step": 3087 + }, + { + "epoch": 0.20922826749779797, + "grad_norm": 8.432148933410645, + "learning_rate": 9.680060236840305e-05, + "loss": 0.8559, + "step": 3088 + }, + { + "epoch": 0.20929602276577003, + "grad_norm": 9.985367774963379, + "learning_rate": 9.679923334930523e-05, + "loss": 1.2035, + "step": 3089 + }, + { + "epoch": 0.20936377803374212, + "grad_norm": 9.771974563598633, + "learning_rate": 9.679786433020741e-05, + "loss": 1.1614, + "step": 3090 + }, + { + "epoch": 0.2094315333017142, + "grad_norm": 10.646146774291992, + "learning_rate": 9.679649531110959e-05, + "loss": 1.2165, + "step": 3091 + }, + { + "epoch": 0.2094992885696863, + "grad_norm": 8.93340015411377, + "learning_rate": 9.679512629201177e-05, + "loss": 0.9883, + "step": 3092 + }, + { + "epoch": 0.20956704383765837, + "grad_norm": 8.498669624328613, + "learning_rate": 9.679375727291397e-05, + "loss": 1.0318, + "step": 3093 + }, + { + "epoch": 0.20963479910563046, + "grad_norm": 7.536258220672607, + "learning_rate": 9.679238825381615e-05, + "loss": 0.9949, + "step": 3094 + }, + { + "epoch": 0.20970255437360255, + "grad_norm": 8.314896583557129, + "learning_rate": 9.679101923471833e-05, + "loss": 1.0502, + "step": 3095 + }, + { + "epoch": 0.20977030964157464, + "grad_norm": 7.470542907714844, + "learning_rate": 9.67896502156205e-05, + "loss": 0.8871, + "step": 3096 + }, + { + "epoch": 0.2098380649095467, + "grad_norm": 8.951095581054688, + "learning_rate": 9.67882811965227e-05, + "loss": 0.9793, + "step": 3097 + }, + { + "epoch": 0.2099058201775188, + "grad_norm": 7.879035472869873, + "learning_rate": 9.678691217742488e-05, + "loss": 1.0193, + "step": 3098 + }, + { + "epoch": 0.2099735754454909, + "grad_norm": 8.890814781188965, + "learning_rate": 9.678554315832706e-05, + "loss": 0.9738, + "step": 3099 + }, + { + "epoch": 0.21004133071346298, + "grad_norm": 8.863816261291504, + "learning_rate": 9.678417413922924e-05, + "loss": 1.1213, + "step": 3100 + }, + { + "epoch": 0.21010908598143505, + "grad_norm": 9.59538745880127, + "learning_rate": 9.678280512013142e-05, + "loss": 1.0215, + "step": 3101 + }, + { + "epoch": 0.21017684124940714, + "grad_norm": 8.811614990234375, + "learning_rate": 9.678143610103362e-05, + "loss": 0.8703, + "step": 3102 + }, + { + "epoch": 0.21024459651737923, + "grad_norm": 7.274720191955566, + "learning_rate": 9.67800670819358e-05, + "loss": 0.898, + "step": 3103 + }, + { + "epoch": 0.21031235178535132, + "grad_norm": 11.239364624023438, + "learning_rate": 9.677869806283798e-05, + "loss": 0.9823, + "step": 3104 + }, + { + "epoch": 0.21038010705332338, + "grad_norm": 8.807086944580078, + "learning_rate": 9.677732904374016e-05, + "loss": 1.0074, + "step": 3105 + }, + { + "epoch": 0.21044786232129548, + "grad_norm": 9.065536499023438, + "learning_rate": 9.677596002464235e-05, + "loss": 1.1861, + "step": 3106 + }, + { + "epoch": 0.21051561758926757, + "grad_norm": 11.035104751586914, + "learning_rate": 9.677459100554453e-05, + "loss": 1.0067, + "step": 3107 + }, + { + "epoch": 0.21058337285723966, + "grad_norm": 8.010696411132812, + "learning_rate": 9.677322198644671e-05, + "loss": 1.0855, + "step": 3108 + }, + { + "epoch": 0.21065112812521172, + "grad_norm": 9.104195594787598, + "learning_rate": 9.677185296734889e-05, + "loss": 1.0497, + "step": 3109 + }, + { + "epoch": 0.21071888339318381, + "grad_norm": 8.731512069702148, + "learning_rate": 9.677048394825107e-05, + "loss": 1.1108, + "step": 3110 + }, + { + "epoch": 0.2107866386611559, + "grad_norm": 8.823514938354492, + "learning_rate": 9.676911492915327e-05, + "loss": 1.0271, + "step": 3111 + }, + { + "epoch": 0.210854393929128, + "grad_norm": 7.446425914764404, + "learning_rate": 9.676774591005545e-05, + "loss": 0.9182, + "step": 3112 + }, + { + "epoch": 0.21092214919710006, + "grad_norm": 10.75915241241455, + "learning_rate": 9.676637689095763e-05, + "loss": 0.93, + "step": 3113 + }, + { + "epoch": 0.21098990446507215, + "grad_norm": 10.065240859985352, + "learning_rate": 9.676500787185982e-05, + "loss": 1.0508, + "step": 3114 + }, + { + "epoch": 0.21105765973304424, + "grad_norm": 8.790117263793945, + "learning_rate": 9.6763638852762e-05, + "loss": 0.8971, + "step": 3115 + }, + { + "epoch": 0.21112541500101634, + "grad_norm": 8.286596298217773, + "learning_rate": 9.676226983366418e-05, + "loss": 0.9403, + "step": 3116 + }, + { + "epoch": 0.2111931702689884, + "grad_norm": 8.542399406433105, + "learning_rate": 9.676090081456637e-05, + "loss": 1.1638, + "step": 3117 + }, + { + "epoch": 0.2112609255369605, + "grad_norm": 9.461727142333984, + "learning_rate": 9.675953179546856e-05, + "loss": 1.0859, + "step": 3118 + }, + { + "epoch": 0.21132868080493258, + "grad_norm": 7.0917229652404785, + "learning_rate": 9.675816277637074e-05, + "loss": 0.9931, + "step": 3119 + }, + { + "epoch": 0.21139643607290468, + "grad_norm": 9.192744255065918, + "learning_rate": 9.675679375727293e-05, + "loss": 1.2138, + "step": 3120 + }, + { + "epoch": 0.21146419134087674, + "grad_norm": 7.744256973266602, + "learning_rate": 9.675542473817511e-05, + "loss": 1.3615, + "step": 3121 + }, + { + "epoch": 0.21153194660884883, + "grad_norm": 6.915426254272461, + "learning_rate": 9.675405571907729e-05, + "loss": 0.89, + "step": 3122 + }, + { + "epoch": 0.21159970187682092, + "grad_norm": 9.888227462768555, + "learning_rate": 9.675268669997947e-05, + "loss": 0.9569, + "step": 3123 + }, + { + "epoch": 0.21166745714479301, + "grad_norm": 7.541590690612793, + "learning_rate": 9.675131768088165e-05, + "loss": 0.9638, + "step": 3124 + }, + { + "epoch": 0.2117352124127651, + "grad_norm": 7.883132457733154, + "learning_rate": 9.674994866178384e-05, + "loss": 0.8901, + "step": 3125 + }, + { + "epoch": 0.21180296768073717, + "grad_norm": 8.493675231933594, + "learning_rate": 9.674857964268602e-05, + "loss": 0.8894, + "step": 3126 + }, + { + "epoch": 0.21187072294870926, + "grad_norm": 7.348284721374512, + "learning_rate": 9.67472106235882e-05, + "loss": 0.8721, + "step": 3127 + }, + { + "epoch": 0.21193847821668135, + "grad_norm": 9.094710350036621, + "learning_rate": 9.674584160449039e-05, + "loss": 0.9255, + "step": 3128 + }, + { + "epoch": 0.21200623348465344, + "grad_norm": 7.316446304321289, + "learning_rate": 9.674447258539258e-05, + "loss": 1.2577, + "step": 3129 + }, + { + "epoch": 0.2120739887526255, + "grad_norm": 8.105271339416504, + "learning_rate": 9.674310356629476e-05, + "loss": 0.8157, + "step": 3130 + }, + { + "epoch": 0.2121417440205976, + "grad_norm": 8.433457374572754, + "learning_rate": 9.674173454719694e-05, + "loss": 1.0398, + "step": 3131 + }, + { + "epoch": 0.2122094992885697, + "grad_norm": 7.787237644195557, + "learning_rate": 9.674036552809912e-05, + "loss": 0.7095, + "step": 3132 + }, + { + "epoch": 0.21227725455654178, + "grad_norm": 10.60180950164795, + "learning_rate": 9.67389965090013e-05, + "loss": 1.0482, + "step": 3133 + }, + { + "epoch": 0.21234500982451385, + "grad_norm": 8.428773880004883, + "learning_rate": 9.67376274899035e-05, + "loss": 0.8677, + "step": 3134 + }, + { + "epoch": 0.21241276509248594, + "grad_norm": 8.204195022583008, + "learning_rate": 9.673625847080568e-05, + "loss": 1.14, + "step": 3135 + }, + { + "epoch": 0.21248052036045803, + "grad_norm": 6.449087619781494, + "learning_rate": 9.673488945170786e-05, + "loss": 1.0181, + "step": 3136 + }, + { + "epoch": 0.21254827562843012, + "grad_norm": 9.588041305541992, + "learning_rate": 9.673352043261004e-05, + "loss": 0.9996, + "step": 3137 + }, + { + "epoch": 0.21261603089640219, + "grad_norm": 9.626228332519531, + "learning_rate": 9.673215141351223e-05, + "loss": 1.0501, + "step": 3138 + }, + { + "epoch": 0.21268378616437428, + "grad_norm": 9.036309242248535, + "learning_rate": 9.673078239441441e-05, + "loss": 1.093, + "step": 3139 + }, + { + "epoch": 0.21275154143234637, + "grad_norm": 9.415257453918457, + "learning_rate": 9.672941337531659e-05, + "loss": 1.0983, + "step": 3140 + }, + { + "epoch": 0.21281929670031846, + "grad_norm": 8.309000015258789, + "learning_rate": 9.672804435621877e-05, + "loss": 1.0803, + "step": 3141 + }, + { + "epoch": 0.21288705196829052, + "grad_norm": 7.420774459838867, + "learning_rate": 9.672667533712095e-05, + "loss": 1.1027, + "step": 3142 + }, + { + "epoch": 0.21295480723626262, + "grad_norm": 9.291664123535156, + "learning_rate": 9.672530631802314e-05, + "loss": 1.1617, + "step": 3143 + }, + { + "epoch": 0.2130225625042347, + "grad_norm": 7.656317234039307, + "learning_rate": 9.672393729892533e-05, + "loss": 0.9423, + "step": 3144 + }, + { + "epoch": 0.2130903177722068, + "grad_norm": 5.812994003295898, + "learning_rate": 9.67225682798275e-05, + "loss": 0.9187, + "step": 3145 + }, + { + "epoch": 0.21315807304017886, + "grad_norm": 9.492706298828125, + "learning_rate": 9.672119926072969e-05, + "loss": 1.1804, + "step": 3146 + }, + { + "epoch": 0.21322582830815096, + "grad_norm": 9.37720012664795, + "learning_rate": 9.671983024163187e-05, + "loss": 0.8885, + "step": 3147 + }, + { + "epoch": 0.21329358357612305, + "grad_norm": 9.183650016784668, + "learning_rate": 9.671846122253406e-05, + "loss": 0.8967, + "step": 3148 + }, + { + "epoch": 0.21336133884409514, + "grad_norm": 8.037003517150879, + "learning_rate": 9.671709220343624e-05, + "loss": 1.2605, + "step": 3149 + }, + { + "epoch": 0.2134290941120672, + "grad_norm": 8.694345474243164, + "learning_rate": 9.671572318433842e-05, + "loss": 1.003, + "step": 3150 + }, + { + "epoch": 0.2134968493800393, + "grad_norm": 6.429176330566406, + "learning_rate": 9.67143541652406e-05, + "loss": 1.1018, + "step": 3151 + }, + { + "epoch": 0.21356460464801139, + "grad_norm": 7.964774131774902, + "learning_rate": 9.67129851461428e-05, + "loss": 0.9121, + "step": 3152 + }, + { + "epoch": 0.21363235991598348, + "grad_norm": 7.5509033203125, + "learning_rate": 9.671161612704498e-05, + "loss": 0.9577, + "step": 3153 + }, + { + "epoch": 0.21370011518395554, + "grad_norm": 7.968616962432861, + "learning_rate": 9.671024710794716e-05, + "loss": 1.1337, + "step": 3154 + }, + { + "epoch": 0.21376787045192763, + "grad_norm": 7.340758800506592, + "learning_rate": 9.670887808884934e-05, + "loss": 0.8068, + "step": 3155 + }, + { + "epoch": 0.21383562571989972, + "grad_norm": 7.890449523925781, + "learning_rate": 9.670750906975152e-05, + "loss": 1.0263, + "step": 3156 + }, + { + "epoch": 0.21390338098787182, + "grad_norm": 10.908242225646973, + "learning_rate": 9.670614005065371e-05, + "loss": 1.0384, + "step": 3157 + }, + { + "epoch": 0.21397113625584388, + "grad_norm": 7.62697696685791, + "learning_rate": 9.670477103155589e-05, + "loss": 0.918, + "step": 3158 + }, + { + "epoch": 0.21403889152381597, + "grad_norm": 10.487192153930664, + "learning_rate": 9.670340201245807e-05, + "loss": 1.1135, + "step": 3159 + }, + { + "epoch": 0.21410664679178806, + "grad_norm": 7.076718807220459, + "learning_rate": 9.670203299336026e-05, + "loss": 0.9286, + "step": 3160 + }, + { + "epoch": 0.21417440205976015, + "grad_norm": 10.05949592590332, + "learning_rate": 9.670066397426245e-05, + "loss": 1.1317, + "step": 3161 + }, + { + "epoch": 0.21424215732773222, + "grad_norm": 8.656492233276367, + "learning_rate": 9.669929495516463e-05, + "loss": 0.9864, + "step": 3162 + }, + { + "epoch": 0.2143099125957043, + "grad_norm": 10.591787338256836, + "learning_rate": 9.669792593606682e-05, + "loss": 1.0801, + "step": 3163 + }, + { + "epoch": 0.2143776678636764, + "grad_norm": 8.19133472442627, + "learning_rate": 9.6696556916969e-05, + "loss": 0.9377, + "step": 3164 + }, + { + "epoch": 0.2144454231316485, + "grad_norm": 8.402669906616211, + "learning_rate": 9.669518789787118e-05, + "loss": 0.9283, + "step": 3165 + }, + { + "epoch": 0.21451317839962056, + "grad_norm": 9.713315963745117, + "learning_rate": 9.669381887877337e-05, + "loss": 1.1865, + "step": 3166 + }, + { + "epoch": 0.21458093366759265, + "grad_norm": 6.777700424194336, + "learning_rate": 9.669244985967555e-05, + "loss": 0.9094, + "step": 3167 + }, + { + "epoch": 0.21464868893556474, + "grad_norm": 9.381082534790039, + "learning_rate": 9.669108084057773e-05, + "loss": 0.7461, + "step": 3168 + }, + { + "epoch": 0.21471644420353683, + "grad_norm": 8.568790435791016, + "learning_rate": 9.668971182147992e-05, + "loss": 0.7339, + "step": 3169 + }, + { + "epoch": 0.2147841994715089, + "grad_norm": 9.257226943969727, + "learning_rate": 9.66883428023821e-05, + "loss": 1.1892, + "step": 3170 + }, + { + "epoch": 0.214851954739481, + "grad_norm": 8.544146537780762, + "learning_rate": 9.668697378328429e-05, + "loss": 1.0307, + "step": 3171 + }, + { + "epoch": 0.21491971000745308, + "grad_norm": 9.540971755981445, + "learning_rate": 9.668560476418647e-05, + "loss": 0.8464, + "step": 3172 + }, + { + "epoch": 0.21498746527542517, + "grad_norm": 9.180089950561523, + "learning_rate": 9.668423574508865e-05, + "loss": 0.951, + "step": 3173 + }, + { + "epoch": 0.21505522054339726, + "grad_norm": 10.706409454345703, + "learning_rate": 9.668286672599083e-05, + "loss": 1.141, + "step": 3174 + }, + { + "epoch": 0.21512297581136933, + "grad_norm": 9.421865463256836, + "learning_rate": 9.668149770689302e-05, + "loss": 1.0701, + "step": 3175 + }, + { + "epoch": 0.21519073107934142, + "grad_norm": 9.120182037353516, + "learning_rate": 9.66801286877952e-05, + "loss": 1.1618, + "step": 3176 + }, + { + "epoch": 0.2152584863473135, + "grad_norm": 7.879681587219238, + "learning_rate": 9.667875966869738e-05, + "loss": 0.8048, + "step": 3177 + }, + { + "epoch": 0.2153262416152856, + "grad_norm": 7.233819007873535, + "learning_rate": 9.667739064959957e-05, + "loss": 0.9718, + "step": 3178 + }, + { + "epoch": 0.21539399688325767, + "grad_norm": 7.79316520690918, + "learning_rate": 9.667602163050175e-05, + "loss": 1.0999, + "step": 3179 + }, + { + "epoch": 0.21546175215122976, + "grad_norm": 8.873052597045898, + "learning_rate": 9.667465261140394e-05, + "loss": 0.9444, + "step": 3180 + }, + { + "epoch": 0.21552950741920185, + "grad_norm": 7.386806488037109, + "learning_rate": 9.667328359230612e-05, + "loss": 0.8661, + "step": 3181 + }, + { + "epoch": 0.21559726268717394, + "grad_norm": 7.682179927825928, + "learning_rate": 9.66719145732083e-05, + "loss": 0.7904, + "step": 3182 + }, + { + "epoch": 0.215665017955146, + "grad_norm": 7.9192914962768555, + "learning_rate": 9.667054555411048e-05, + "loss": 1.0563, + "step": 3183 + }, + { + "epoch": 0.2157327732231181, + "grad_norm": 6.972530364990234, + "learning_rate": 9.666917653501267e-05, + "loss": 0.7482, + "step": 3184 + }, + { + "epoch": 0.2158005284910902, + "grad_norm": 9.844091415405273, + "learning_rate": 9.666780751591485e-05, + "loss": 1.0289, + "step": 3185 + }, + { + "epoch": 0.21586828375906228, + "grad_norm": 8.724159240722656, + "learning_rate": 9.666643849681704e-05, + "loss": 1.1419, + "step": 3186 + }, + { + "epoch": 0.21593603902703434, + "grad_norm": 8.859795570373535, + "learning_rate": 9.666506947771922e-05, + "loss": 0.9252, + "step": 3187 + }, + { + "epoch": 0.21600379429500643, + "grad_norm": 10.741375923156738, + "learning_rate": 9.66637004586214e-05, + "loss": 1.2328, + "step": 3188 + }, + { + "epoch": 0.21607154956297853, + "grad_norm": 8.468697547912598, + "learning_rate": 9.666233143952359e-05, + "loss": 1.1779, + "step": 3189 + }, + { + "epoch": 0.21613930483095062, + "grad_norm": 7.220376014709473, + "learning_rate": 9.666096242042577e-05, + "loss": 0.9271, + "step": 3190 + }, + { + "epoch": 0.21620706009892268, + "grad_norm": 8.3795804977417, + "learning_rate": 9.665959340132795e-05, + "loss": 1.0941, + "step": 3191 + }, + { + "epoch": 0.21627481536689477, + "grad_norm": 9.743870735168457, + "learning_rate": 9.665822438223013e-05, + "loss": 0.9396, + "step": 3192 + }, + { + "epoch": 0.21634257063486687, + "grad_norm": 8.927998542785645, + "learning_rate": 9.665685536313231e-05, + "loss": 1.0915, + "step": 3193 + }, + { + "epoch": 0.21641032590283896, + "grad_norm": 8.214877128601074, + "learning_rate": 9.66554863440345e-05, + "loss": 0.8767, + "step": 3194 + }, + { + "epoch": 0.21647808117081102, + "grad_norm": 9.605053901672363, + "learning_rate": 9.665411732493669e-05, + "loss": 1.1829, + "step": 3195 + }, + { + "epoch": 0.2165458364387831, + "grad_norm": 8.430122375488281, + "learning_rate": 9.665274830583887e-05, + "loss": 0.8234, + "step": 3196 + }, + { + "epoch": 0.2166135917067552, + "grad_norm": 8.868674278259277, + "learning_rate": 9.665137928674105e-05, + "loss": 1.0467, + "step": 3197 + }, + { + "epoch": 0.2166813469747273, + "grad_norm": 8.451126098632812, + "learning_rate": 9.665001026764324e-05, + "loss": 1.134, + "step": 3198 + }, + { + "epoch": 0.21674910224269936, + "grad_norm": 8.749373435974121, + "learning_rate": 9.664864124854542e-05, + "loss": 0.9825, + "step": 3199 + }, + { + "epoch": 0.21681685751067145, + "grad_norm": 8.478997230529785, + "learning_rate": 9.66472722294476e-05, + "loss": 0.9823, + "step": 3200 + }, + { + "epoch": 0.21688461277864354, + "grad_norm": 6.554266929626465, + "learning_rate": 9.664590321034978e-05, + "loss": 1.2293, + "step": 3201 + }, + { + "epoch": 0.21695236804661563, + "grad_norm": 7.415618896484375, + "learning_rate": 9.664453419125196e-05, + "loss": 0.9767, + "step": 3202 + }, + { + "epoch": 0.2170201233145877, + "grad_norm": 9.277578353881836, + "learning_rate": 9.664316517215416e-05, + "loss": 1.0042, + "step": 3203 + }, + { + "epoch": 0.2170878785825598, + "grad_norm": 7.1379241943359375, + "learning_rate": 9.664179615305634e-05, + "loss": 0.924, + "step": 3204 + }, + { + "epoch": 0.21715563385053188, + "grad_norm": 8.837613105773926, + "learning_rate": 9.664042713395852e-05, + "loss": 0.9889, + "step": 3205 + }, + { + "epoch": 0.21722338911850397, + "grad_norm": 8.118345260620117, + "learning_rate": 9.663905811486071e-05, + "loss": 1.2376, + "step": 3206 + }, + { + "epoch": 0.21729114438647604, + "grad_norm": 10.383713722229004, + "learning_rate": 9.663768909576289e-05, + "loss": 0.7988, + "step": 3207 + }, + { + "epoch": 0.21735889965444813, + "grad_norm": 7.0938262939453125, + "learning_rate": 9.663632007666507e-05, + "loss": 0.8589, + "step": 3208 + }, + { + "epoch": 0.21742665492242022, + "grad_norm": 7.732020854949951, + "learning_rate": 9.663495105756726e-05, + "loss": 1.0837, + "step": 3209 + }, + { + "epoch": 0.2174944101903923, + "grad_norm": 7.718788146972656, + "learning_rate": 9.663358203846944e-05, + "loss": 0.8949, + "step": 3210 + }, + { + "epoch": 0.21756216545836438, + "grad_norm": 9.187674522399902, + "learning_rate": 9.663221301937162e-05, + "loss": 0.9443, + "step": 3211 + }, + { + "epoch": 0.21762992072633647, + "grad_norm": 8.596949577331543, + "learning_rate": 9.663084400027382e-05, + "loss": 1.0855, + "step": 3212 + }, + { + "epoch": 0.21769767599430856, + "grad_norm": 7.854368209838867, + "learning_rate": 9.6629474981176e-05, + "loss": 0.9203, + "step": 3213 + }, + { + "epoch": 0.21776543126228065, + "grad_norm": 9.96597957611084, + "learning_rate": 9.662810596207818e-05, + "loss": 0.836, + "step": 3214 + }, + { + "epoch": 0.21783318653025271, + "grad_norm": 7.254636764526367, + "learning_rate": 9.662673694298036e-05, + "loss": 0.7926, + "step": 3215 + }, + { + "epoch": 0.2179009417982248, + "grad_norm": 9.079703330993652, + "learning_rate": 9.662536792388255e-05, + "loss": 1.2072, + "step": 3216 + }, + { + "epoch": 0.2179686970661969, + "grad_norm": 8.051166534423828, + "learning_rate": 9.662399890478473e-05, + "loss": 0.9719, + "step": 3217 + }, + { + "epoch": 0.218036452334169, + "grad_norm": 9.189994812011719, + "learning_rate": 9.662262988568691e-05, + "loss": 0.9878, + "step": 3218 + }, + { + "epoch": 0.21810420760214105, + "grad_norm": 9.806111335754395, + "learning_rate": 9.66212608665891e-05, + "loss": 1.1621, + "step": 3219 + }, + { + "epoch": 0.21817196287011315, + "grad_norm": 7.833159923553467, + "learning_rate": 9.661989184749128e-05, + "loss": 0.9811, + "step": 3220 + }, + { + "epoch": 0.21823971813808524, + "grad_norm": 7.537334442138672, + "learning_rate": 9.661852282839347e-05, + "loss": 0.9916, + "step": 3221 + }, + { + "epoch": 0.21830747340605733, + "grad_norm": 9.828896522521973, + "learning_rate": 9.661715380929565e-05, + "loss": 1.09, + "step": 3222 + }, + { + "epoch": 0.2183752286740294, + "grad_norm": 7.99267578125, + "learning_rate": 9.661578479019783e-05, + "loss": 1.0852, + "step": 3223 + }, + { + "epoch": 0.21844298394200148, + "grad_norm": 8.085976600646973, + "learning_rate": 9.661441577110001e-05, + "loss": 1.1762, + "step": 3224 + }, + { + "epoch": 0.21851073920997358, + "grad_norm": 6.523219585418701, + "learning_rate": 9.661304675200219e-05, + "loss": 1.0814, + "step": 3225 + }, + { + "epoch": 0.21857849447794567, + "grad_norm": 9.796062469482422, + "learning_rate": 9.661167773290438e-05, + "loss": 1.1447, + "step": 3226 + }, + { + "epoch": 0.21864624974591776, + "grad_norm": 8.570687294006348, + "learning_rate": 9.661030871380656e-05, + "loss": 1.0552, + "step": 3227 + }, + { + "epoch": 0.21871400501388982, + "grad_norm": 9.70113468170166, + "learning_rate": 9.660893969470874e-05, + "loss": 1.0379, + "step": 3228 + }, + { + "epoch": 0.21878176028186191, + "grad_norm": 8.195375442504883, + "learning_rate": 9.660757067561093e-05, + "loss": 0.9348, + "step": 3229 + }, + { + "epoch": 0.218849515549834, + "grad_norm": 7.33494758605957, + "learning_rate": 9.660620165651312e-05, + "loss": 0.8977, + "step": 3230 + }, + { + "epoch": 0.2189172708178061, + "grad_norm": 7.543430328369141, + "learning_rate": 9.66048326374153e-05, + "loss": 1.023, + "step": 3231 + }, + { + "epoch": 0.21898502608577816, + "grad_norm": 8.45494556427002, + "learning_rate": 9.660346361831748e-05, + "loss": 1.2995, + "step": 3232 + }, + { + "epoch": 0.21905278135375025, + "grad_norm": 8.197903633117676, + "learning_rate": 9.660209459921966e-05, + "loss": 0.9651, + "step": 3233 + }, + { + "epoch": 0.21912053662172234, + "grad_norm": 8.392546653747559, + "learning_rate": 9.660072558012184e-05, + "loss": 0.9858, + "step": 3234 + }, + { + "epoch": 0.21918829188969444, + "grad_norm": 6.499540328979492, + "learning_rate": 9.659935656102403e-05, + "loss": 0.9679, + "step": 3235 + }, + { + "epoch": 0.2192560471576665, + "grad_norm": 7.784618377685547, + "learning_rate": 9.659798754192621e-05, + "loss": 0.9316, + "step": 3236 + }, + { + "epoch": 0.2193238024256386, + "grad_norm": 9.014801979064941, + "learning_rate": 9.65966185228284e-05, + "loss": 0.9261, + "step": 3237 + }, + { + "epoch": 0.21939155769361068, + "grad_norm": 8.238675117492676, + "learning_rate": 9.659524950373058e-05, + "loss": 1.0156, + "step": 3238 + }, + { + "epoch": 0.21945931296158278, + "grad_norm": 9.447864532470703, + "learning_rate": 9.659388048463277e-05, + "loss": 1.2186, + "step": 3239 + }, + { + "epoch": 0.21952706822955484, + "grad_norm": 6.894035816192627, + "learning_rate": 9.659251146553495e-05, + "loss": 0.9157, + "step": 3240 + }, + { + "epoch": 0.21959482349752693, + "grad_norm": 7.24024772644043, + "learning_rate": 9.659114244643713e-05, + "loss": 1.1564, + "step": 3241 + }, + { + "epoch": 0.21966257876549902, + "grad_norm": 9.135485649108887, + "learning_rate": 9.658977342733931e-05, + "loss": 1.2002, + "step": 3242 + }, + { + "epoch": 0.21973033403347111, + "grad_norm": 6.119181156158447, + "learning_rate": 9.658840440824149e-05, + "loss": 0.788, + "step": 3243 + }, + { + "epoch": 0.21979808930144318, + "grad_norm": 8.550108909606934, + "learning_rate": 9.658703538914368e-05, + "loss": 0.9251, + "step": 3244 + }, + { + "epoch": 0.21986584456941527, + "grad_norm": 9.546792984008789, + "learning_rate": 9.658566637004586e-05, + "loss": 1.2231, + "step": 3245 + }, + { + "epoch": 0.21993359983738736, + "grad_norm": 9.451306343078613, + "learning_rate": 9.658429735094805e-05, + "loss": 0.8991, + "step": 3246 + }, + { + "epoch": 0.22000135510535945, + "grad_norm": 9.914873123168945, + "learning_rate": 9.658292833185023e-05, + "loss": 1.3102, + "step": 3247 + }, + { + "epoch": 0.22006911037333152, + "grad_norm": 10.038002967834473, + "learning_rate": 9.65815593127524e-05, + "loss": 1.254, + "step": 3248 + }, + { + "epoch": 0.2201368656413036, + "grad_norm": 7.590802192687988, + "learning_rate": 9.65801902936546e-05, + "loss": 1.0032, + "step": 3249 + }, + { + "epoch": 0.2202046209092757, + "grad_norm": 8.632338523864746, + "learning_rate": 9.657882127455678e-05, + "loss": 0.8922, + "step": 3250 + }, + { + "epoch": 0.2202723761772478, + "grad_norm": 6.483764171600342, + "learning_rate": 9.657745225545896e-05, + "loss": 0.9167, + "step": 3251 + }, + { + "epoch": 0.22034013144521986, + "grad_norm": 8.28577709197998, + "learning_rate": 9.657608323636115e-05, + "loss": 1.0413, + "step": 3252 + }, + { + "epoch": 0.22040788671319195, + "grad_norm": 9.218433380126953, + "learning_rate": 9.657471421726333e-05, + "loss": 1.1681, + "step": 3253 + }, + { + "epoch": 0.22047564198116404, + "grad_norm": 6.653755187988281, + "learning_rate": 9.657334519816552e-05, + "loss": 0.9645, + "step": 3254 + }, + { + "epoch": 0.22054339724913613, + "grad_norm": 8.784723281860352, + "learning_rate": 9.657197617906771e-05, + "loss": 0.7489, + "step": 3255 + }, + { + "epoch": 0.2206111525171082, + "grad_norm": 9.041324615478516, + "learning_rate": 9.657060715996989e-05, + "loss": 1.0478, + "step": 3256 + }, + { + "epoch": 0.22067890778508029, + "grad_norm": 7.811746597290039, + "learning_rate": 9.656923814087207e-05, + "loss": 0.9511, + "step": 3257 + }, + { + "epoch": 0.22074666305305238, + "grad_norm": 9.811897277832031, + "learning_rate": 9.656786912177426e-05, + "loss": 1.4235, + "step": 3258 + }, + { + "epoch": 0.22081441832102447, + "grad_norm": 7.313972473144531, + "learning_rate": 9.656650010267644e-05, + "loss": 1.0075, + "step": 3259 + }, + { + "epoch": 0.22088217358899653, + "grad_norm": 9.360795021057129, + "learning_rate": 9.656513108357862e-05, + "loss": 0.9634, + "step": 3260 + }, + { + "epoch": 0.22094992885696862, + "grad_norm": 9.322724342346191, + "learning_rate": 9.65637620644808e-05, + "loss": 1.0669, + "step": 3261 + }, + { + "epoch": 0.22101768412494072, + "grad_norm": 10.269414901733398, + "learning_rate": 9.6562393045383e-05, + "loss": 1.1288, + "step": 3262 + }, + { + "epoch": 0.2210854393929128, + "grad_norm": 7.672987937927246, + "learning_rate": 9.656102402628518e-05, + "loss": 1.0517, + "step": 3263 + }, + { + "epoch": 0.22115319466088487, + "grad_norm": 8.725695610046387, + "learning_rate": 9.655965500718736e-05, + "loss": 1.0987, + "step": 3264 + }, + { + "epoch": 0.22122094992885696, + "grad_norm": 8.577162742614746, + "learning_rate": 9.655828598808954e-05, + "loss": 0.821, + "step": 3265 + }, + { + "epoch": 0.22128870519682906, + "grad_norm": 8.497530937194824, + "learning_rate": 9.655691696899172e-05, + "loss": 0.9171, + "step": 3266 + }, + { + "epoch": 0.22135646046480115, + "grad_norm": 9.665203094482422, + "learning_rate": 9.655554794989391e-05, + "loss": 1.105, + "step": 3267 + }, + { + "epoch": 0.2214242157327732, + "grad_norm": 9.42651081085205, + "learning_rate": 9.65541789307961e-05, + "loss": 0.9526, + "step": 3268 + }, + { + "epoch": 0.2214919710007453, + "grad_norm": 7.461474418640137, + "learning_rate": 9.655280991169827e-05, + "loss": 0.9554, + "step": 3269 + }, + { + "epoch": 0.2215597262687174, + "grad_norm": 10.56189250946045, + "learning_rate": 9.655144089260045e-05, + "loss": 1.1504, + "step": 3270 + }, + { + "epoch": 0.22162748153668949, + "grad_norm": 8.245696067810059, + "learning_rate": 9.655007187350265e-05, + "loss": 1.0352, + "step": 3271 + }, + { + "epoch": 0.22169523680466155, + "grad_norm": 9.182881355285645, + "learning_rate": 9.654870285440483e-05, + "loss": 1.1153, + "step": 3272 + }, + { + "epoch": 0.22176299207263364, + "grad_norm": 8.101743698120117, + "learning_rate": 9.654733383530701e-05, + "loss": 1.1399, + "step": 3273 + }, + { + "epoch": 0.22183074734060573, + "grad_norm": 9.072972297668457, + "learning_rate": 9.654596481620919e-05, + "loss": 1.1508, + "step": 3274 + }, + { + "epoch": 0.22189850260857782, + "grad_norm": 8.480910301208496, + "learning_rate": 9.654459579711137e-05, + "loss": 0.9659, + "step": 3275 + }, + { + "epoch": 0.2219662578765499, + "grad_norm": 7.095332622528076, + "learning_rate": 9.654322677801356e-05, + "loss": 0.9417, + "step": 3276 + }, + { + "epoch": 0.22203401314452198, + "grad_norm": 8.183112144470215, + "learning_rate": 9.654185775891574e-05, + "loss": 0.9307, + "step": 3277 + }, + { + "epoch": 0.22210176841249407, + "grad_norm": 8.60648250579834, + "learning_rate": 9.654048873981792e-05, + "loss": 1.141, + "step": 3278 + }, + { + "epoch": 0.22216952368046616, + "grad_norm": 6.992738246917725, + "learning_rate": 9.65391197207201e-05, + "loss": 0.9429, + "step": 3279 + }, + { + "epoch": 0.22223727894843825, + "grad_norm": 6.998913764953613, + "learning_rate": 9.653775070162229e-05, + "loss": 1.0159, + "step": 3280 + }, + { + "epoch": 0.22230503421641032, + "grad_norm": 7.922050952911377, + "learning_rate": 9.653638168252448e-05, + "loss": 1.1349, + "step": 3281 + }, + { + "epoch": 0.2223727894843824, + "grad_norm": 9.171984672546387, + "learning_rate": 9.653501266342666e-05, + "loss": 1.149, + "step": 3282 + }, + { + "epoch": 0.2224405447523545, + "grad_norm": 9.88930606842041, + "learning_rate": 9.653364364432884e-05, + "loss": 1.1036, + "step": 3283 + }, + { + "epoch": 0.2225083000203266, + "grad_norm": 10.978727340698242, + "learning_rate": 9.653227462523102e-05, + "loss": 1.0345, + "step": 3284 + }, + { + "epoch": 0.22257605528829866, + "grad_norm": 8.712138175964355, + "learning_rate": 9.653090560613321e-05, + "loss": 1.1404, + "step": 3285 + }, + { + "epoch": 0.22264381055627075, + "grad_norm": 8.79491901397705, + "learning_rate": 9.65295365870354e-05, + "loss": 1.4334, + "step": 3286 + }, + { + "epoch": 0.22271156582424284, + "grad_norm": 9.580904960632324, + "learning_rate": 9.652816756793757e-05, + "loss": 1.0693, + "step": 3287 + }, + { + "epoch": 0.22277932109221493, + "grad_norm": 7.335433006286621, + "learning_rate": 9.652679854883976e-05, + "loss": 1.0559, + "step": 3288 + }, + { + "epoch": 0.222847076360187, + "grad_norm": 8.534089088439941, + "learning_rate": 9.652542952974194e-05, + "loss": 0.9835, + "step": 3289 + }, + { + "epoch": 0.2229148316281591, + "grad_norm": 6.626898288726807, + "learning_rate": 9.652406051064413e-05, + "loss": 0.8056, + "step": 3290 + }, + { + "epoch": 0.22298258689613118, + "grad_norm": 7.774187088012695, + "learning_rate": 9.652269149154631e-05, + "loss": 0.8032, + "step": 3291 + }, + { + "epoch": 0.22305034216410327, + "grad_norm": 8.029982566833496, + "learning_rate": 9.652132247244849e-05, + "loss": 1.0536, + "step": 3292 + }, + { + "epoch": 0.22311809743207534, + "grad_norm": 8.701712608337402, + "learning_rate": 9.651995345335067e-05, + "loss": 0.9422, + "step": 3293 + }, + { + "epoch": 0.22318585270004743, + "grad_norm": 9.17507553100586, + "learning_rate": 9.651858443425286e-05, + "loss": 1.0094, + "step": 3294 + }, + { + "epoch": 0.22325360796801952, + "grad_norm": 8.464656829833984, + "learning_rate": 9.651721541515504e-05, + "loss": 0.9546, + "step": 3295 + }, + { + "epoch": 0.2233213632359916, + "grad_norm": 6.839638710021973, + "learning_rate": 9.651584639605722e-05, + "loss": 0.8296, + "step": 3296 + }, + { + "epoch": 0.22338911850396367, + "grad_norm": 7.773430824279785, + "learning_rate": 9.65144773769594e-05, + "loss": 1.1344, + "step": 3297 + }, + { + "epoch": 0.22345687377193577, + "grad_norm": 9.725523948669434, + "learning_rate": 9.65131083578616e-05, + "loss": 0.9814, + "step": 3298 + }, + { + "epoch": 0.22352462903990786, + "grad_norm": 8.995366096496582, + "learning_rate": 9.651173933876378e-05, + "loss": 1.2263, + "step": 3299 + }, + { + "epoch": 0.22359238430787995, + "grad_norm": 8.735838890075684, + "learning_rate": 9.651037031966596e-05, + "loss": 1.1707, + "step": 3300 + }, + { + "epoch": 0.223660139575852, + "grad_norm": 7.177740097045898, + "learning_rate": 9.650900130056815e-05, + "loss": 0.9491, + "step": 3301 + }, + { + "epoch": 0.2237278948438241, + "grad_norm": 8.393296241760254, + "learning_rate": 9.650763228147033e-05, + "loss": 0.9765, + "step": 3302 + }, + { + "epoch": 0.2237956501117962, + "grad_norm": 9.773447036743164, + "learning_rate": 9.650626326237251e-05, + "loss": 1.0409, + "step": 3303 + }, + { + "epoch": 0.2238634053797683, + "grad_norm": 6.951483249664307, + "learning_rate": 9.650489424327471e-05, + "loss": 0.9848, + "step": 3304 + }, + { + "epoch": 0.22393116064774035, + "grad_norm": 8.579717636108398, + "learning_rate": 9.650352522417689e-05, + "loss": 0.9722, + "step": 3305 + }, + { + "epoch": 0.22399891591571244, + "grad_norm": 7.798871040344238, + "learning_rate": 9.650215620507907e-05, + "loss": 0.8046, + "step": 3306 + }, + { + "epoch": 0.22406667118368453, + "grad_norm": 9.504851341247559, + "learning_rate": 9.650078718598125e-05, + "loss": 1.1972, + "step": 3307 + }, + { + "epoch": 0.22413442645165663, + "grad_norm": 8.384056091308594, + "learning_rate": 9.649941816688344e-05, + "loss": 1.0654, + "step": 3308 + }, + { + "epoch": 0.2242021817196287, + "grad_norm": 10.469207763671875, + "learning_rate": 9.649804914778562e-05, + "loss": 1.0694, + "step": 3309 + }, + { + "epoch": 0.22426993698760078, + "grad_norm": 8.5567045211792, + "learning_rate": 9.64966801286878e-05, + "loss": 1.0371, + "step": 3310 + }, + { + "epoch": 0.22433769225557287, + "grad_norm": 9.427155494689941, + "learning_rate": 9.649531110958998e-05, + "loss": 1.0185, + "step": 3311 + }, + { + "epoch": 0.22440544752354497, + "grad_norm": 8.238412857055664, + "learning_rate": 9.649394209049216e-05, + "loss": 0.9744, + "step": 3312 + }, + { + "epoch": 0.22447320279151703, + "grad_norm": 9.36423110961914, + "learning_rate": 9.649257307139436e-05, + "loss": 0.9184, + "step": 3313 + }, + { + "epoch": 0.22454095805948912, + "grad_norm": 7.485356330871582, + "learning_rate": 9.649120405229654e-05, + "loss": 0.9948, + "step": 3314 + }, + { + "epoch": 0.2246087133274612, + "grad_norm": 7.247208118438721, + "learning_rate": 9.648983503319872e-05, + "loss": 0.8592, + "step": 3315 + }, + { + "epoch": 0.2246764685954333, + "grad_norm": 9.266085624694824, + "learning_rate": 9.64884660141009e-05, + "loss": 1.0314, + "step": 3316 + }, + { + "epoch": 0.22474422386340537, + "grad_norm": 11.147274017333984, + "learning_rate": 9.64870969950031e-05, + "loss": 1.0403, + "step": 3317 + }, + { + "epoch": 0.22481197913137746, + "grad_norm": 10.961670875549316, + "learning_rate": 9.648572797590527e-05, + "loss": 1.3147, + "step": 3318 + }, + { + "epoch": 0.22487973439934955, + "grad_norm": 7.002868175506592, + "learning_rate": 9.648435895680745e-05, + "loss": 0.9044, + "step": 3319 + }, + { + "epoch": 0.22494748966732164, + "grad_norm": 7.886247634887695, + "learning_rate": 9.648298993770963e-05, + "loss": 0.8901, + "step": 3320 + }, + { + "epoch": 0.2250152449352937, + "grad_norm": 8.933539390563965, + "learning_rate": 9.648162091861181e-05, + "loss": 0.8758, + "step": 3321 + }, + { + "epoch": 0.2250830002032658, + "grad_norm": 7.223681449890137, + "learning_rate": 9.648025189951401e-05, + "loss": 1.1502, + "step": 3322 + }, + { + "epoch": 0.2251507554712379, + "grad_norm": 8.808623313903809, + "learning_rate": 9.647888288041619e-05, + "loss": 1.0706, + "step": 3323 + }, + { + "epoch": 0.22521851073920998, + "grad_norm": 9.557942390441895, + "learning_rate": 9.647751386131837e-05, + "loss": 0.8113, + "step": 3324 + }, + { + "epoch": 0.22528626600718205, + "grad_norm": 9.855717658996582, + "learning_rate": 9.647614484222055e-05, + "loss": 1.251, + "step": 3325 + }, + { + "epoch": 0.22535402127515414, + "grad_norm": 9.588946342468262, + "learning_rate": 9.647477582312273e-05, + "loss": 1.3029, + "step": 3326 + }, + { + "epoch": 0.22542177654312623, + "grad_norm": 9.277341842651367, + "learning_rate": 9.647340680402492e-05, + "loss": 0.9708, + "step": 3327 + }, + { + "epoch": 0.22548953181109832, + "grad_norm": 9.607316970825195, + "learning_rate": 9.64720377849271e-05, + "loss": 0.9614, + "step": 3328 + }, + { + "epoch": 0.22555728707907038, + "grad_norm": 7.949220657348633, + "learning_rate": 9.647066876582928e-05, + "loss": 1.0066, + "step": 3329 + }, + { + "epoch": 0.22562504234704248, + "grad_norm": 8.383744239807129, + "learning_rate": 9.646929974673146e-05, + "loss": 0.797, + "step": 3330 + }, + { + "epoch": 0.22569279761501457, + "grad_norm": 7.203914642333984, + "learning_rate": 9.646793072763366e-05, + "loss": 0.9625, + "step": 3331 + }, + { + "epoch": 0.22576055288298666, + "grad_norm": 8.95102596282959, + "learning_rate": 9.646656170853584e-05, + "loss": 1.0728, + "step": 3332 + }, + { + "epoch": 0.22582830815095875, + "grad_norm": 8.044751167297363, + "learning_rate": 9.646519268943802e-05, + "loss": 1.2155, + "step": 3333 + }, + { + "epoch": 0.22589606341893081, + "grad_norm": 9.164902687072754, + "learning_rate": 9.64638236703402e-05, + "loss": 1.2923, + "step": 3334 + }, + { + "epoch": 0.2259638186869029, + "grad_norm": 6.792164325714111, + "learning_rate": 9.646245465124238e-05, + "loss": 0.9956, + "step": 3335 + }, + { + "epoch": 0.226031573954875, + "grad_norm": 7.76467752456665, + "learning_rate": 9.646108563214457e-05, + "loss": 1.0057, + "step": 3336 + }, + { + "epoch": 0.2260993292228471, + "grad_norm": 8.541545867919922, + "learning_rate": 9.645971661304675e-05, + "loss": 1.1013, + "step": 3337 + }, + { + "epoch": 0.22616708449081915, + "grad_norm": 7.8545050621032715, + "learning_rate": 9.645834759394893e-05, + "loss": 0.8565, + "step": 3338 + }, + { + "epoch": 0.22623483975879125, + "grad_norm": 9.322896003723145, + "learning_rate": 9.645697857485112e-05, + "loss": 1.0509, + "step": 3339 + }, + { + "epoch": 0.22630259502676334, + "grad_norm": 9.540191650390625, + "learning_rate": 9.645560955575331e-05, + "loss": 1.1757, + "step": 3340 + }, + { + "epoch": 0.22637035029473543, + "grad_norm": 8.97028923034668, + "learning_rate": 9.645424053665549e-05, + "loss": 1.0342, + "step": 3341 + }, + { + "epoch": 0.2264381055627075, + "grad_norm": 6.444105625152588, + "learning_rate": 9.645287151755767e-05, + "loss": 0.8377, + "step": 3342 + }, + { + "epoch": 0.22650586083067958, + "grad_norm": 9.237889289855957, + "learning_rate": 9.645150249845985e-05, + "loss": 1.0468, + "step": 3343 + }, + { + "epoch": 0.22657361609865168, + "grad_norm": 9.567046165466309, + "learning_rate": 9.645013347936203e-05, + "loss": 1.3875, + "step": 3344 + }, + { + "epoch": 0.22664137136662377, + "grad_norm": 8.302481651306152, + "learning_rate": 9.644876446026422e-05, + "loss": 1.0455, + "step": 3345 + }, + { + "epoch": 0.22670912663459583, + "grad_norm": 9.124307632446289, + "learning_rate": 9.64473954411664e-05, + "loss": 0.9828, + "step": 3346 + }, + { + "epoch": 0.22677688190256792, + "grad_norm": 7.700011730194092, + "learning_rate": 9.644602642206858e-05, + "loss": 0.8475, + "step": 3347 + }, + { + "epoch": 0.22684463717054001, + "grad_norm": 6.1064276695251465, + "learning_rate": 9.644465740297078e-05, + "loss": 0.7779, + "step": 3348 + }, + { + "epoch": 0.2269123924385121, + "grad_norm": 8.924312591552734, + "learning_rate": 9.644328838387296e-05, + "loss": 1.3825, + "step": 3349 + }, + { + "epoch": 0.22698014770648417, + "grad_norm": 8.169050216674805, + "learning_rate": 9.644191936477514e-05, + "loss": 1.1198, + "step": 3350 + }, + { + "epoch": 0.22704790297445626, + "grad_norm": 8.368382453918457, + "learning_rate": 9.644055034567733e-05, + "loss": 1.0729, + "step": 3351 + }, + { + "epoch": 0.22711565824242835, + "grad_norm": 10.206897735595703, + "learning_rate": 9.643918132657951e-05, + "loss": 0.9022, + "step": 3352 + }, + { + "epoch": 0.22718341351040044, + "grad_norm": 6.288288116455078, + "learning_rate": 9.64378123074817e-05, + "loss": 0.7868, + "step": 3353 + }, + { + "epoch": 0.2272511687783725, + "grad_norm": 8.446578025817871, + "learning_rate": 9.643644328838389e-05, + "loss": 1.0426, + "step": 3354 + }, + { + "epoch": 0.2273189240463446, + "grad_norm": 8.425249099731445, + "learning_rate": 9.643507426928607e-05, + "loss": 0.823, + "step": 3355 + }, + { + "epoch": 0.2273866793143167, + "grad_norm": 6.610576629638672, + "learning_rate": 9.643370525018825e-05, + "loss": 0.9251, + "step": 3356 + }, + { + "epoch": 0.22745443458228878, + "grad_norm": 8.365503311157227, + "learning_rate": 9.643233623109043e-05, + "loss": 1.0258, + "step": 3357 + }, + { + "epoch": 0.22752218985026085, + "grad_norm": 8.654791831970215, + "learning_rate": 9.643096721199261e-05, + "loss": 0.8781, + "step": 3358 + }, + { + "epoch": 0.22758994511823294, + "grad_norm": 6.612964153289795, + "learning_rate": 9.64295981928948e-05, + "loss": 0.7472, + "step": 3359 + }, + { + "epoch": 0.22765770038620503, + "grad_norm": 9.4874267578125, + "learning_rate": 9.642822917379698e-05, + "loss": 0.994, + "step": 3360 + }, + { + "epoch": 0.22772545565417712, + "grad_norm": 9.7284574508667, + "learning_rate": 9.642686015469916e-05, + "loss": 0.9293, + "step": 3361 + }, + { + "epoch": 0.2277932109221492, + "grad_norm": 8.787792205810547, + "learning_rate": 9.642549113560134e-05, + "loss": 1.1368, + "step": 3362 + }, + { + "epoch": 0.22786096619012128, + "grad_norm": 9.218424797058105, + "learning_rate": 9.642412211650354e-05, + "loss": 1.2437, + "step": 3363 + }, + { + "epoch": 0.22792872145809337, + "grad_norm": 7.9437127113342285, + "learning_rate": 9.642275309740572e-05, + "loss": 0.925, + "step": 3364 + }, + { + "epoch": 0.22799647672606546, + "grad_norm": 10.552806854248047, + "learning_rate": 9.64213840783079e-05, + "loss": 0.936, + "step": 3365 + }, + { + "epoch": 0.22806423199403753, + "grad_norm": 6.572626113891602, + "learning_rate": 9.642001505921008e-05, + "loss": 0.8968, + "step": 3366 + }, + { + "epoch": 0.22813198726200962, + "grad_norm": 8.689478874206543, + "learning_rate": 9.641864604011226e-05, + "loss": 0.8605, + "step": 3367 + }, + { + "epoch": 0.2281997425299817, + "grad_norm": 10.450199127197266, + "learning_rate": 9.641727702101445e-05, + "loss": 1.0871, + "step": 3368 + }, + { + "epoch": 0.2282674977979538, + "grad_norm": 9.260807991027832, + "learning_rate": 9.641590800191663e-05, + "loss": 0.8469, + "step": 3369 + }, + { + "epoch": 0.22833525306592586, + "grad_norm": 13.626687049865723, + "learning_rate": 9.641453898281881e-05, + "loss": 1.0615, + "step": 3370 + }, + { + "epoch": 0.22840300833389796, + "grad_norm": 7.419554710388184, + "learning_rate": 9.6413169963721e-05, + "loss": 0.9351, + "step": 3371 + }, + { + "epoch": 0.22847076360187005, + "grad_norm": 9.33206558227539, + "learning_rate": 9.641180094462319e-05, + "loss": 1.1826, + "step": 3372 + }, + { + "epoch": 0.22853851886984214, + "grad_norm": 7.148665904998779, + "learning_rate": 9.641043192552537e-05, + "loss": 1.1121, + "step": 3373 + }, + { + "epoch": 0.2286062741378142, + "grad_norm": 6.784035682678223, + "learning_rate": 9.640906290642755e-05, + "loss": 0.9649, + "step": 3374 + }, + { + "epoch": 0.2286740294057863, + "grad_norm": 7.357334613800049, + "learning_rate": 9.640769388732973e-05, + "loss": 0.9005, + "step": 3375 + }, + { + "epoch": 0.22874178467375839, + "grad_norm": 9.157288551330566, + "learning_rate": 9.640632486823191e-05, + "loss": 0.9585, + "step": 3376 + }, + { + "epoch": 0.22880953994173048, + "grad_norm": 9.068450927734375, + "learning_rate": 9.64049558491341e-05, + "loss": 0.9248, + "step": 3377 + }, + { + "epoch": 0.22887729520970254, + "grad_norm": 9.446860313415527, + "learning_rate": 9.640358683003628e-05, + "loss": 1.0016, + "step": 3378 + }, + { + "epoch": 0.22894505047767463, + "grad_norm": 7.63693904876709, + "learning_rate": 9.640221781093846e-05, + "loss": 1.0158, + "step": 3379 + }, + { + "epoch": 0.22901280574564672, + "grad_norm": 7.569469451904297, + "learning_rate": 9.640084879184064e-05, + "loss": 0.9149, + "step": 3380 + }, + { + "epoch": 0.22908056101361882, + "grad_norm": 8.837080955505371, + "learning_rate": 9.639947977274282e-05, + "loss": 0.9379, + "step": 3381 + }, + { + "epoch": 0.22914831628159088, + "grad_norm": 9.141901969909668, + "learning_rate": 9.639811075364502e-05, + "loss": 0.9121, + "step": 3382 + }, + { + "epoch": 0.22921607154956297, + "grad_norm": 7.68120813369751, + "learning_rate": 9.63967417345472e-05, + "loss": 1.0235, + "step": 3383 + }, + { + "epoch": 0.22928382681753506, + "grad_norm": 8.82229995727539, + "learning_rate": 9.639537271544938e-05, + "loss": 1.1573, + "step": 3384 + }, + { + "epoch": 0.22935158208550716, + "grad_norm": 9.490239143371582, + "learning_rate": 9.639400369635156e-05, + "loss": 0.8776, + "step": 3385 + }, + { + "epoch": 0.22941933735347925, + "grad_norm": 10.257567405700684, + "learning_rate": 9.639263467725375e-05, + "loss": 1.0696, + "step": 3386 + }, + { + "epoch": 0.2294870926214513, + "grad_norm": 7.793918609619141, + "learning_rate": 9.639126565815593e-05, + "loss": 1.0892, + "step": 3387 + }, + { + "epoch": 0.2295548478894234, + "grad_norm": 8.983718872070312, + "learning_rate": 9.638989663905811e-05, + "loss": 0.908, + "step": 3388 + }, + { + "epoch": 0.2296226031573955, + "grad_norm": 8.31851577758789, + "learning_rate": 9.63885276199603e-05, + "loss": 1.0168, + "step": 3389 + }, + { + "epoch": 0.22969035842536759, + "grad_norm": 9.343503952026367, + "learning_rate": 9.638715860086248e-05, + "loss": 1.1174, + "step": 3390 + }, + { + "epoch": 0.22975811369333965, + "grad_norm": 7.617075443267822, + "learning_rate": 9.638578958176467e-05, + "loss": 0.9076, + "step": 3391 + }, + { + "epoch": 0.22982586896131174, + "grad_norm": 8.102355003356934, + "learning_rate": 9.638442056266685e-05, + "loss": 1.0476, + "step": 3392 + }, + { + "epoch": 0.22989362422928383, + "grad_norm": 9.366684913635254, + "learning_rate": 9.638305154356903e-05, + "loss": 1.1892, + "step": 3393 + }, + { + "epoch": 0.22996137949725592, + "grad_norm": 8.204293251037598, + "learning_rate": 9.638168252447122e-05, + "loss": 0.7731, + "step": 3394 + }, + { + "epoch": 0.230029134765228, + "grad_norm": 9.68787670135498, + "learning_rate": 9.63803135053734e-05, + "loss": 1.1126, + "step": 3395 + }, + { + "epoch": 0.23009689003320008, + "grad_norm": 6.6209330558776855, + "learning_rate": 9.637894448627558e-05, + "loss": 0.9006, + "step": 3396 + }, + { + "epoch": 0.23016464530117217, + "grad_norm": 7.963149547576904, + "learning_rate": 9.637757546717778e-05, + "loss": 0.8275, + "step": 3397 + }, + { + "epoch": 0.23023240056914426, + "grad_norm": 8.74716567993164, + "learning_rate": 9.637620644807996e-05, + "loss": 1.0592, + "step": 3398 + }, + { + "epoch": 0.23030015583711633, + "grad_norm": 8.819621086120605, + "learning_rate": 9.637483742898214e-05, + "loss": 0.8799, + "step": 3399 + }, + { + "epoch": 0.23036791110508842, + "grad_norm": 8.538536071777344, + "learning_rate": 9.637346840988433e-05, + "loss": 1.1498, + "step": 3400 + }, + { + "epoch": 0.2304356663730605, + "grad_norm": 8.787203788757324, + "learning_rate": 9.637209939078651e-05, + "loss": 0.8149, + "step": 3401 + }, + { + "epoch": 0.2305034216410326, + "grad_norm": 8.85105037689209, + "learning_rate": 9.63707303716887e-05, + "loss": 0.8525, + "step": 3402 + }, + { + "epoch": 0.23057117690900467, + "grad_norm": 9.773297309875488, + "learning_rate": 9.636936135259087e-05, + "loss": 0.9448, + "step": 3403 + }, + { + "epoch": 0.23063893217697676, + "grad_norm": 7.804196357727051, + "learning_rate": 9.636799233349307e-05, + "loss": 0.9549, + "step": 3404 + }, + { + "epoch": 0.23070668744494885, + "grad_norm": 8.398775100708008, + "learning_rate": 9.636662331439525e-05, + "loss": 0.9111, + "step": 3405 + }, + { + "epoch": 0.23077444271292094, + "grad_norm": 8.960867881774902, + "learning_rate": 9.636525429529743e-05, + "loss": 1.1533, + "step": 3406 + }, + { + "epoch": 0.230842197980893, + "grad_norm": 7.513526439666748, + "learning_rate": 9.636388527619961e-05, + "loss": 1.113, + "step": 3407 + }, + { + "epoch": 0.2309099532488651, + "grad_norm": 8.891728401184082, + "learning_rate": 9.636251625710179e-05, + "loss": 0.8727, + "step": 3408 + }, + { + "epoch": 0.2309777085168372, + "grad_norm": 6.972326755523682, + "learning_rate": 9.636114723800398e-05, + "loss": 0.8085, + "step": 3409 + }, + { + "epoch": 0.23104546378480928, + "grad_norm": 9.195303916931152, + "learning_rate": 9.635977821890616e-05, + "loss": 1.0276, + "step": 3410 + }, + { + "epoch": 0.23111321905278134, + "grad_norm": 9.36483383178711, + "learning_rate": 9.635840919980834e-05, + "loss": 1.1369, + "step": 3411 + }, + { + "epoch": 0.23118097432075344, + "grad_norm": 7.282619953155518, + "learning_rate": 9.635704018071052e-05, + "loss": 0.9214, + "step": 3412 + }, + { + "epoch": 0.23124872958872553, + "grad_norm": 6.289726257324219, + "learning_rate": 9.63556711616127e-05, + "loss": 0.8556, + "step": 3413 + }, + { + "epoch": 0.23131648485669762, + "grad_norm": 8.646320343017578, + "learning_rate": 9.63543021425149e-05, + "loss": 1.0833, + "step": 3414 + }, + { + "epoch": 0.23138424012466968, + "grad_norm": 8.372559547424316, + "learning_rate": 9.635293312341708e-05, + "loss": 0.9944, + "step": 3415 + }, + { + "epoch": 0.23145199539264177, + "grad_norm": 8.002387046813965, + "learning_rate": 9.635156410431926e-05, + "loss": 1.015, + "step": 3416 + }, + { + "epoch": 0.23151975066061387, + "grad_norm": 8.4190092086792, + "learning_rate": 9.635019508522144e-05, + "loss": 0.9766, + "step": 3417 + }, + { + "epoch": 0.23158750592858596, + "grad_norm": 8.759613037109375, + "learning_rate": 9.634882606612363e-05, + "loss": 1.2348, + "step": 3418 + }, + { + "epoch": 0.23165526119655802, + "grad_norm": 8.439151763916016, + "learning_rate": 9.634745704702581e-05, + "loss": 0.8778, + "step": 3419 + }, + { + "epoch": 0.2317230164645301, + "grad_norm": 8.05675220489502, + "learning_rate": 9.6346088027928e-05, + "loss": 0.9525, + "step": 3420 + }, + { + "epoch": 0.2317907717325022, + "grad_norm": 9.196937561035156, + "learning_rate": 9.634471900883017e-05, + "loss": 1.0663, + "step": 3421 + }, + { + "epoch": 0.2318585270004743, + "grad_norm": 7.622048854827881, + "learning_rate": 9.634334998973235e-05, + "loss": 0.9522, + "step": 3422 + }, + { + "epoch": 0.23192628226844636, + "grad_norm": 10.235919952392578, + "learning_rate": 9.634198097063455e-05, + "loss": 1.1084, + "step": 3423 + }, + { + "epoch": 0.23199403753641845, + "grad_norm": 7.920719623565674, + "learning_rate": 9.634061195153673e-05, + "loss": 0.9443, + "step": 3424 + }, + { + "epoch": 0.23206179280439054, + "grad_norm": 7.440132141113281, + "learning_rate": 9.633924293243891e-05, + "loss": 1.0757, + "step": 3425 + }, + { + "epoch": 0.23212954807236263, + "grad_norm": 7.0108771324157715, + "learning_rate": 9.633787391334109e-05, + "loss": 0.8043, + "step": 3426 + }, + { + "epoch": 0.2321973033403347, + "grad_norm": 6.620635032653809, + "learning_rate": 9.633650489424328e-05, + "loss": 0.999, + "step": 3427 + }, + { + "epoch": 0.2322650586083068, + "grad_norm": 7.27518367767334, + "learning_rate": 9.633513587514546e-05, + "loss": 0.888, + "step": 3428 + }, + { + "epoch": 0.23233281387627888, + "grad_norm": 7.081945419311523, + "learning_rate": 9.633376685604764e-05, + "loss": 0.8837, + "step": 3429 + }, + { + "epoch": 0.23240056914425097, + "grad_norm": 7.225597858428955, + "learning_rate": 9.633239783694982e-05, + "loss": 0.7774, + "step": 3430 + }, + { + "epoch": 0.23246832441222304, + "grad_norm": 7.0490217208862305, + "learning_rate": 9.6331028817852e-05, + "loss": 1.0654, + "step": 3431 + }, + { + "epoch": 0.23253607968019513, + "grad_norm": 8.641639709472656, + "learning_rate": 9.63296597987542e-05, + "loss": 1.1071, + "step": 3432 + }, + { + "epoch": 0.23260383494816722, + "grad_norm": 7.442448139190674, + "learning_rate": 9.632829077965638e-05, + "loss": 0.9392, + "step": 3433 + }, + { + "epoch": 0.2326715902161393, + "grad_norm": 6.6959452629089355, + "learning_rate": 9.632692176055856e-05, + "loss": 0.8334, + "step": 3434 + }, + { + "epoch": 0.23273934548411138, + "grad_norm": 9.67419719696045, + "learning_rate": 9.632555274146074e-05, + "loss": 1.1948, + "step": 3435 + }, + { + "epoch": 0.23280710075208347, + "grad_norm": 7.384359836578369, + "learning_rate": 9.632418372236292e-05, + "loss": 0.7992, + "step": 3436 + }, + { + "epoch": 0.23287485602005556, + "grad_norm": 6.566294193267822, + "learning_rate": 9.632281470326511e-05, + "loss": 0.9393, + "step": 3437 + }, + { + "epoch": 0.23294261128802765, + "grad_norm": 7.842831134796143, + "learning_rate": 9.63214456841673e-05, + "loss": 1.1372, + "step": 3438 + }, + { + "epoch": 0.23301036655599974, + "grad_norm": 9.487961769104004, + "learning_rate": 9.632007666506947e-05, + "loss": 1.0381, + "step": 3439 + }, + { + "epoch": 0.2330781218239718, + "grad_norm": 8.06712532043457, + "learning_rate": 9.631870764597167e-05, + "loss": 1.4106, + "step": 3440 + }, + { + "epoch": 0.2331458770919439, + "grad_norm": 7.026076793670654, + "learning_rate": 9.631733862687385e-05, + "loss": 1.0662, + "step": 3441 + }, + { + "epoch": 0.233213632359916, + "grad_norm": 10.926567077636719, + "learning_rate": 9.631596960777603e-05, + "loss": 1.2932, + "step": 3442 + }, + { + "epoch": 0.23328138762788808, + "grad_norm": 8.380082130432129, + "learning_rate": 9.631460058867822e-05, + "loss": 1.0784, + "step": 3443 + }, + { + "epoch": 0.23334914289586015, + "grad_norm": 8.768819808959961, + "learning_rate": 9.63132315695804e-05, + "loss": 1.1501, + "step": 3444 + }, + { + "epoch": 0.23341689816383224, + "grad_norm": 8.467490196228027, + "learning_rate": 9.631186255048258e-05, + "loss": 0.7555, + "step": 3445 + }, + { + "epoch": 0.23348465343180433, + "grad_norm": 6.0409770011901855, + "learning_rate": 9.631049353138478e-05, + "loss": 0.9099, + "step": 3446 + }, + { + "epoch": 0.23355240869977642, + "grad_norm": 7.160163402557373, + "learning_rate": 9.630912451228696e-05, + "loss": 0.8386, + "step": 3447 + }, + { + "epoch": 0.23362016396774848, + "grad_norm": 9.102558135986328, + "learning_rate": 9.630775549318914e-05, + "loss": 0.9259, + "step": 3448 + }, + { + "epoch": 0.23368791923572058, + "grad_norm": 7.071728706359863, + "learning_rate": 9.630638647409132e-05, + "loss": 1.0426, + "step": 3449 + }, + { + "epoch": 0.23375567450369267, + "grad_norm": 8.358567237854004, + "learning_rate": 9.630501745499351e-05, + "loss": 0.9806, + "step": 3450 + }, + { + "epoch": 0.23382342977166476, + "grad_norm": 12.588993072509766, + "learning_rate": 9.630364843589569e-05, + "loss": 1.0796, + "step": 3451 + }, + { + "epoch": 0.23389118503963682, + "grad_norm": 8.665871620178223, + "learning_rate": 9.630227941679787e-05, + "loss": 1.0224, + "step": 3452 + }, + { + "epoch": 0.23395894030760891, + "grad_norm": 9.288796424865723, + "learning_rate": 9.630091039770005e-05, + "loss": 0.9329, + "step": 3453 + }, + { + "epoch": 0.234026695575581, + "grad_norm": 7.319186210632324, + "learning_rate": 9.629954137860223e-05, + "loss": 1.057, + "step": 3454 + }, + { + "epoch": 0.2340944508435531, + "grad_norm": 8.640669822692871, + "learning_rate": 9.629817235950443e-05, + "loss": 0.9756, + "step": 3455 + }, + { + "epoch": 0.23416220611152516, + "grad_norm": 10.73513126373291, + "learning_rate": 9.629680334040661e-05, + "loss": 0.958, + "step": 3456 + }, + { + "epoch": 0.23422996137949725, + "grad_norm": 7.40097713470459, + "learning_rate": 9.629543432130879e-05, + "loss": 0.9062, + "step": 3457 + }, + { + "epoch": 0.23429771664746935, + "grad_norm": 9.506468772888184, + "learning_rate": 9.629406530221097e-05, + "loss": 1.0083, + "step": 3458 + }, + { + "epoch": 0.23436547191544144, + "grad_norm": 7.291172504425049, + "learning_rate": 9.629269628311315e-05, + "loss": 0.9966, + "step": 3459 + }, + { + "epoch": 0.2344332271834135, + "grad_norm": 8.962395668029785, + "learning_rate": 9.629132726401534e-05, + "loss": 0.8504, + "step": 3460 + }, + { + "epoch": 0.2345009824513856, + "grad_norm": 10.29174518585205, + "learning_rate": 9.628995824491752e-05, + "loss": 1.0807, + "step": 3461 + }, + { + "epoch": 0.23456873771935768, + "grad_norm": 9.461160659790039, + "learning_rate": 9.62885892258197e-05, + "loss": 0.996, + "step": 3462 + }, + { + "epoch": 0.23463649298732978, + "grad_norm": 7.13394021987915, + "learning_rate": 9.628722020672188e-05, + "loss": 0.7163, + "step": 3463 + }, + { + "epoch": 0.23470424825530184, + "grad_norm": 9.379724502563477, + "learning_rate": 9.628585118762408e-05, + "loss": 0.9571, + "step": 3464 + }, + { + "epoch": 0.23477200352327393, + "grad_norm": 9.72339916229248, + "learning_rate": 9.628448216852626e-05, + "loss": 1.2559, + "step": 3465 + }, + { + "epoch": 0.23483975879124602, + "grad_norm": 7.353830337524414, + "learning_rate": 9.628311314942844e-05, + "loss": 1.0563, + "step": 3466 + }, + { + "epoch": 0.23490751405921811, + "grad_norm": 8.423667907714844, + "learning_rate": 9.628174413033062e-05, + "loss": 1.0799, + "step": 3467 + }, + { + "epoch": 0.23497526932719018, + "grad_norm": 6.426609516143799, + "learning_rate": 9.62803751112328e-05, + "loss": 0.9417, + "step": 3468 + }, + { + "epoch": 0.23504302459516227, + "grad_norm": 10.882522583007812, + "learning_rate": 9.627900609213499e-05, + "loss": 1.1498, + "step": 3469 + }, + { + "epoch": 0.23511077986313436, + "grad_norm": 7.768298625946045, + "learning_rate": 9.627763707303717e-05, + "loss": 0.7389, + "step": 3470 + }, + { + "epoch": 0.23517853513110645, + "grad_norm": 9.002137184143066, + "learning_rate": 9.627626805393935e-05, + "loss": 1.2338, + "step": 3471 + }, + { + "epoch": 0.23524629039907852, + "grad_norm": 8.671159744262695, + "learning_rate": 9.627489903484153e-05, + "loss": 1.1114, + "step": 3472 + }, + { + "epoch": 0.2353140456670506, + "grad_norm": 5.836034774780273, + "learning_rate": 9.627353001574373e-05, + "loss": 0.7075, + "step": 3473 + }, + { + "epoch": 0.2353818009350227, + "grad_norm": 8.145447731018066, + "learning_rate": 9.627216099664591e-05, + "loss": 0.9504, + "step": 3474 + }, + { + "epoch": 0.2354495562029948, + "grad_norm": 7.3333563804626465, + "learning_rate": 9.627079197754809e-05, + "loss": 0.8443, + "step": 3475 + }, + { + "epoch": 0.23551731147096686, + "grad_norm": 8.191438674926758, + "learning_rate": 9.626942295845027e-05, + "loss": 0.8163, + "step": 3476 + }, + { + "epoch": 0.23558506673893895, + "grad_norm": 10.042975425720215, + "learning_rate": 9.626805393935245e-05, + "loss": 1.0715, + "step": 3477 + }, + { + "epoch": 0.23565282200691104, + "grad_norm": 7.854464054107666, + "learning_rate": 9.626668492025464e-05, + "loss": 0.9263, + "step": 3478 + }, + { + "epoch": 0.23572057727488313, + "grad_norm": 9.286520004272461, + "learning_rate": 9.626531590115682e-05, + "loss": 1.0172, + "step": 3479 + }, + { + "epoch": 0.2357883325428552, + "grad_norm": 7.321778297424316, + "learning_rate": 9.6263946882059e-05, + "loss": 1.0661, + "step": 3480 + }, + { + "epoch": 0.2358560878108273, + "grad_norm": 7.432394981384277, + "learning_rate": 9.626257786296118e-05, + "loss": 1.1682, + "step": 3481 + }, + { + "epoch": 0.23592384307879938, + "grad_norm": 10.230997085571289, + "learning_rate": 9.626120884386338e-05, + "loss": 1.0498, + "step": 3482 + }, + { + "epoch": 0.23599159834677147, + "grad_norm": 8.530696868896484, + "learning_rate": 9.625983982476556e-05, + "loss": 1.0598, + "step": 3483 + }, + { + "epoch": 0.23605935361474353, + "grad_norm": 7.35584020614624, + "learning_rate": 9.625847080566774e-05, + "loss": 1.0234, + "step": 3484 + }, + { + "epoch": 0.23612710888271563, + "grad_norm": 8.030210494995117, + "learning_rate": 9.625710178656992e-05, + "loss": 1.0241, + "step": 3485 + }, + { + "epoch": 0.23619486415068772, + "grad_norm": 7.508440971374512, + "learning_rate": 9.625573276747211e-05, + "loss": 0.9466, + "step": 3486 + }, + { + "epoch": 0.2362626194186598, + "grad_norm": 7.4618401527404785, + "learning_rate": 9.62543637483743e-05, + "loss": 0.88, + "step": 3487 + }, + { + "epoch": 0.23633037468663187, + "grad_norm": 7.738553524017334, + "learning_rate": 9.625299472927647e-05, + "loss": 1.0146, + "step": 3488 + }, + { + "epoch": 0.23639812995460396, + "grad_norm": 6.782055854797363, + "learning_rate": 9.625162571017867e-05, + "loss": 0.8196, + "step": 3489 + }, + { + "epoch": 0.23646588522257606, + "grad_norm": 7.495883464813232, + "learning_rate": 9.625025669108085e-05, + "loss": 1.0025, + "step": 3490 + }, + { + "epoch": 0.23653364049054815, + "grad_norm": 7.224970817565918, + "learning_rate": 9.624888767198303e-05, + "loss": 1.0069, + "step": 3491 + }, + { + "epoch": 0.23660139575852024, + "grad_norm": 8.686829566955566, + "learning_rate": 9.624751865288522e-05, + "loss": 0.8927, + "step": 3492 + }, + { + "epoch": 0.2366691510264923, + "grad_norm": 10.551370620727539, + "learning_rate": 9.62461496337874e-05, + "loss": 1.0973, + "step": 3493 + }, + { + "epoch": 0.2367369062944644, + "grad_norm": 8.38442611694336, + "learning_rate": 9.624478061468958e-05, + "loss": 0.8145, + "step": 3494 + }, + { + "epoch": 0.23680466156243649, + "grad_norm": 7.099575996398926, + "learning_rate": 9.624341159559176e-05, + "loss": 0.8304, + "step": 3495 + }, + { + "epoch": 0.23687241683040858, + "grad_norm": 7.905203342437744, + "learning_rate": 9.624204257649396e-05, + "loss": 1.1708, + "step": 3496 + }, + { + "epoch": 0.23694017209838064, + "grad_norm": 8.08218765258789, + "learning_rate": 9.624067355739614e-05, + "loss": 0.8367, + "step": 3497 + }, + { + "epoch": 0.23700792736635273, + "grad_norm": 7.766988277435303, + "learning_rate": 9.623930453829832e-05, + "loss": 1.1126, + "step": 3498 + }, + { + "epoch": 0.23707568263432482, + "grad_norm": 8.3229398727417, + "learning_rate": 9.62379355192005e-05, + "loss": 0.9798, + "step": 3499 + }, + { + "epoch": 0.23714343790229692, + "grad_norm": 7.113401889801025, + "learning_rate": 9.623656650010268e-05, + "loss": 0.7711, + "step": 3500 + }, + { + "epoch": 0.23721119317026898, + "grad_norm": 7.286252975463867, + "learning_rate": 9.623519748100487e-05, + "loss": 0.8018, + "step": 3501 + }, + { + "epoch": 0.23727894843824107, + "grad_norm": 8.724101066589355, + "learning_rate": 9.623382846190705e-05, + "loss": 0.9077, + "step": 3502 + }, + { + "epoch": 0.23734670370621316, + "grad_norm": 9.201606750488281, + "learning_rate": 9.623245944280923e-05, + "loss": 1.2277, + "step": 3503 + }, + { + "epoch": 0.23741445897418526, + "grad_norm": 6.827572822570801, + "learning_rate": 9.623109042371141e-05, + "loss": 0.9262, + "step": 3504 + }, + { + "epoch": 0.23748221424215732, + "grad_norm": 7.41288423538208, + "learning_rate": 9.622972140461361e-05, + "loss": 0.762, + "step": 3505 + }, + { + "epoch": 0.2375499695101294, + "grad_norm": 9.649503707885742, + "learning_rate": 9.622835238551579e-05, + "loss": 1.0371, + "step": 3506 + }, + { + "epoch": 0.2376177247781015, + "grad_norm": 9.630754470825195, + "learning_rate": 9.622698336641797e-05, + "loss": 0.9994, + "step": 3507 + }, + { + "epoch": 0.2376854800460736, + "grad_norm": 8.949817657470703, + "learning_rate": 9.622561434732015e-05, + "loss": 0.8555, + "step": 3508 + }, + { + "epoch": 0.23775323531404566, + "grad_norm": 8.096979141235352, + "learning_rate": 9.622424532822233e-05, + "loss": 0.9939, + "step": 3509 + }, + { + "epoch": 0.23782099058201775, + "grad_norm": 8.714512825012207, + "learning_rate": 9.622287630912452e-05, + "loss": 0.9533, + "step": 3510 + }, + { + "epoch": 0.23788874584998984, + "grad_norm": 8.663968086242676, + "learning_rate": 9.62215072900267e-05, + "loss": 0.9974, + "step": 3511 + }, + { + "epoch": 0.23795650111796193, + "grad_norm": 8.704265594482422, + "learning_rate": 9.622013827092888e-05, + "loss": 1.2524, + "step": 3512 + }, + { + "epoch": 0.238024256385934, + "grad_norm": 6.585339069366455, + "learning_rate": 9.621876925183106e-05, + "loss": 0.9021, + "step": 3513 + }, + { + "epoch": 0.2380920116539061, + "grad_norm": 7.992496013641357, + "learning_rate": 9.621740023273324e-05, + "loss": 0.9345, + "step": 3514 + }, + { + "epoch": 0.23815976692187818, + "grad_norm": 7.076025009155273, + "learning_rate": 9.621603121363544e-05, + "loss": 1.0487, + "step": 3515 + }, + { + "epoch": 0.23822752218985027, + "grad_norm": 8.833822250366211, + "learning_rate": 9.621466219453762e-05, + "loss": 0.9612, + "step": 3516 + }, + { + "epoch": 0.23829527745782234, + "grad_norm": 7.8553385734558105, + "learning_rate": 9.62132931754398e-05, + "loss": 1.0452, + "step": 3517 + }, + { + "epoch": 0.23836303272579443, + "grad_norm": 8.44243049621582, + "learning_rate": 9.621192415634198e-05, + "loss": 0.6874, + "step": 3518 + }, + { + "epoch": 0.23843078799376652, + "grad_norm": 8.1638822555542, + "learning_rate": 9.621055513724417e-05, + "loss": 0.9239, + "step": 3519 + }, + { + "epoch": 0.2384985432617386, + "grad_norm": 8.315443992614746, + "learning_rate": 9.620918611814635e-05, + "loss": 1.1256, + "step": 3520 + }, + { + "epoch": 0.23856629852971067, + "grad_norm": 10.451863288879395, + "learning_rate": 9.620781709904853e-05, + "loss": 1.1453, + "step": 3521 + }, + { + "epoch": 0.23863405379768277, + "grad_norm": 9.117147445678711, + "learning_rate": 9.620644807995071e-05, + "loss": 1.0751, + "step": 3522 + }, + { + "epoch": 0.23870180906565486, + "grad_norm": 7.746578216552734, + "learning_rate": 9.62050790608529e-05, + "loss": 0.867, + "step": 3523 + }, + { + "epoch": 0.23876956433362695, + "grad_norm": 8.366421699523926, + "learning_rate": 9.620371004175509e-05, + "loss": 1.0863, + "step": 3524 + }, + { + "epoch": 0.238837319601599, + "grad_norm": 9.951937675476074, + "learning_rate": 9.620234102265727e-05, + "loss": 1.0431, + "step": 3525 + }, + { + "epoch": 0.2389050748695711, + "grad_norm": 9.077424049377441, + "learning_rate": 9.620097200355945e-05, + "loss": 1.0433, + "step": 3526 + }, + { + "epoch": 0.2389728301375432, + "grad_norm": 6.919139385223389, + "learning_rate": 9.619960298446163e-05, + "loss": 0.8655, + "step": 3527 + }, + { + "epoch": 0.2390405854055153, + "grad_norm": 7.729475975036621, + "learning_rate": 9.619823396536382e-05, + "loss": 1.0943, + "step": 3528 + }, + { + "epoch": 0.23910834067348735, + "grad_norm": 9.782391548156738, + "learning_rate": 9.6196864946266e-05, + "loss": 1.1362, + "step": 3529 + }, + { + "epoch": 0.23917609594145944, + "grad_norm": 8.685064315795898, + "learning_rate": 9.619549592716818e-05, + "loss": 0.9885, + "step": 3530 + }, + { + "epoch": 0.23924385120943154, + "grad_norm": 6.963294982910156, + "learning_rate": 9.619412690807036e-05, + "loss": 0.9061, + "step": 3531 + }, + { + "epoch": 0.23931160647740363, + "grad_norm": 8.584861755371094, + "learning_rate": 9.619275788897256e-05, + "loss": 0.8527, + "step": 3532 + }, + { + "epoch": 0.2393793617453757, + "grad_norm": 9.253739356994629, + "learning_rate": 9.619138886987474e-05, + "loss": 1.0643, + "step": 3533 + }, + { + "epoch": 0.23944711701334778, + "grad_norm": 7.659543037414551, + "learning_rate": 9.619001985077692e-05, + "loss": 0.9553, + "step": 3534 + }, + { + "epoch": 0.23951487228131987, + "grad_norm": 8.769670486450195, + "learning_rate": 9.618865083167911e-05, + "loss": 1.0576, + "step": 3535 + }, + { + "epoch": 0.23958262754929197, + "grad_norm": 7.416141033172607, + "learning_rate": 9.618728181258129e-05, + "loss": 1.0064, + "step": 3536 + }, + { + "epoch": 0.23965038281726403, + "grad_norm": 7.91802978515625, + "learning_rate": 9.618591279348347e-05, + "loss": 1.0095, + "step": 3537 + }, + { + "epoch": 0.23971813808523612, + "grad_norm": 6.665622234344482, + "learning_rate": 9.618454377438567e-05, + "loss": 0.8099, + "step": 3538 + }, + { + "epoch": 0.2397858933532082, + "grad_norm": 7.3240203857421875, + "learning_rate": 9.618317475528785e-05, + "loss": 0.9496, + "step": 3539 + }, + { + "epoch": 0.2398536486211803, + "grad_norm": 7.268299102783203, + "learning_rate": 9.618180573619003e-05, + "loss": 1.0504, + "step": 3540 + }, + { + "epoch": 0.23992140388915237, + "grad_norm": 8.710535049438477, + "learning_rate": 9.618043671709221e-05, + "loss": 0.9228, + "step": 3541 + }, + { + "epoch": 0.23998915915712446, + "grad_norm": 8.5223970413208, + "learning_rate": 9.61790676979944e-05, + "loss": 1.0383, + "step": 3542 + }, + { + "epoch": 0.24005691442509655, + "grad_norm": 7.635293960571289, + "learning_rate": 9.617769867889658e-05, + "loss": 0.99, + "step": 3543 + }, + { + "epoch": 0.24012466969306864, + "grad_norm": 9.41180419921875, + "learning_rate": 9.617632965979876e-05, + "loss": 1.1214, + "step": 3544 + }, + { + "epoch": 0.24019242496104073, + "grad_norm": 9.31615924835205, + "learning_rate": 9.617496064070094e-05, + "loss": 0.9369, + "step": 3545 + }, + { + "epoch": 0.2402601802290128, + "grad_norm": 9.427112579345703, + "learning_rate": 9.617359162160312e-05, + "loss": 1.1829, + "step": 3546 + }, + { + "epoch": 0.2403279354969849, + "grad_norm": 7.825446605682373, + "learning_rate": 9.617222260250532e-05, + "loss": 0.9459, + "step": 3547 + }, + { + "epoch": 0.24039569076495698, + "grad_norm": 7.8191633224487305, + "learning_rate": 9.61708535834075e-05, + "loss": 1.1144, + "step": 3548 + }, + { + "epoch": 0.24046344603292907, + "grad_norm": 8.224778175354004, + "learning_rate": 9.616948456430968e-05, + "loss": 0.8544, + "step": 3549 + }, + { + "epoch": 0.24053120130090114, + "grad_norm": 9.618694305419922, + "learning_rate": 9.616811554521186e-05, + "loss": 1.2254, + "step": 3550 + }, + { + "epoch": 0.24059895656887323, + "grad_norm": 7.786314964294434, + "learning_rate": 9.616674652611405e-05, + "loss": 0.8867, + "step": 3551 + }, + { + "epoch": 0.24066671183684532, + "grad_norm": 9.397835731506348, + "learning_rate": 9.616537750701623e-05, + "loss": 1.0505, + "step": 3552 + }, + { + "epoch": 0.2407344671048174, + "grad_norm": 7.701049327850342, + "learning_rate": 9.616400848791841e-05, + "loss": 0.897, + "step": 3553 + }, + { + "epoch": 0.24080222237278948, + "grad_norm": 7.573019504547119, + "learning_rate": 9.616263946882059e-05, + "loss": 0.9972, + "step": 3554 + }, + { + "epoch": 0.24086997764076157, + "grad_norm": 8.337100982666016, + "learning_rate": 9.616127044972277e-05, + "loss": 1.0835, + "step": 3555 + }, + { + "epoch": 0.24093773290873366, + "grad_norm": 8.702056884765625, + "learning_rate": 9.615990143062497e-05, + "loss": 0.7688, + "step": 3556 + }, + { + "epoch": 0.24100548817670575, + "grad_norm": 8.482346534729004, + "learning_rate": 9.615853241152715e-05, + "loss": 1.1972, + "step": 3557 + }, + { + "epoch": 0.24107324344467781, + "grad_norm": 7.193674087524414, + "learning_rate": 9.615716339242933e-05, + "loss": 1.0389, + "step": 3558 + }, + { + "epoch": 0.2411409987126499, + "grad_norm": 8.803317070007324, + "learning_rate": 9.615579437333151e-05, + "loss": 1.0429, + "step": 3559 + }, + { + "epoch": 0.241208753980622, + "grad_norm": 9.055732727050781, + "learning_rate": 9.61544253542337e-05, + "loss": 1.1161, + "step": 3560 + }, + { + "epoch": 0.2412765092485941, + "grad_norm": 10.80001449584961, + "learning_rate": 9.615305633513588e-05, + "loss": 0.9628, + "step": 3561 + }, + { + "epoch": 0.24134426451656615, + "grad_norm": 7.707313060760498, + "learning_rate": 9.615168731603806e-05, + "loss": 0.8806, + "step": 3562 + }, + { + "epoch": 0.24141201978453825, + "grad_norm": 8.823626518249512, + "learning_rate": 9.615031829694024e-05, + "loss": 0.8771, + "step": 3563 + }, + { + "epoch": 0.24147977505251034, + "grad_norm": 7.984725475311279, + "learning_rate": 9.614894927784242e-05, + "loss": 0.9456, + "step": 3564 + }, + { + "epoch": 0.24154753032048243, + "grad_norm": 7.335816860198975, + "learning_rate": 9.614758025874462e-05, + "loss": 1.0323, + "step": 3565 + }, + { + "epoch": 0.2416152855884545, + "grad_norm": 5.959085941314697, + "learning_rate": 9.61462112396468e-05, + "loss": 0.8049, + "step": 3566 + }, + { + "epoch": 0.24168304085642658, + "grad_norm": 8.775632858276367, + "learning_rate": 9.614484222054898e-05, + "loss": 0.8875, + "step": 3567 + }, + { + "epoch": 0.24175079612439868, + "grad_norm": 6.589362621307373, + "learning_rate": 9.614347320145116e-05, + "loss": 1.0551, + "step": 3568 + }, + { + "epoch": 0.24181855139237077, + "grad_norm": 7.49434232711792, + "learning_rate": 9.614210418235334e-05, + "loss": 0.7662, + "step": 3569 + }, + { + "epoch": 0.24188630666034283, + "grad_norm": 7.759862899780273, + "learning_rate": 9.614073516325553e-05, + "loss": 0.7364, + "step": 3570 + }, + { + "epoch": 0.24195406192831492, + "grad_norm": 7.9510273933410645, + "learning_rate": 9.613936614415771e-05, + "loss": 0.9277, + "step": 3571 + }, + { + "epoch": 0.24202181719628701, + "grad_norm": 8.308568954467773, + "learning_rate": 9.61379971250599e-05, + "loss": 1.1689, + "step": 3572 + }, + { + "epoch": 0.2420895724642591, + "grad_norm": 9.44938850402832, + "learning_rate": 9.613662810596207e-05, + "loss": 1.0937, + "step": 3573 + }, + { + "epoch": 0.24215732773223117, + "grad_norm": 11.66707706451416, + "learning_rate": 9.613525908686427e-05, + "loss": 0.77, + "step": 3574 + }, + { + "epoch": 0.24222508300020326, + "grad_norm": 9.25683879852295, + "learning_rate": 9.613389006776645e-05, + "loss": 0.9661, + "step": 3575 + }, + { + "epoch": 0.24229283826817535, + "grad_norm": 7.289797782897949, + "learning_rate": 9.613252104866863e-05, + "loss": 0.8597, + "step": 3576 + }, + { + "epoch": 0.24236059353614745, + "grad_norm": 6.669293403625488, + "learning_rate": 9.613115202957081e-05, + "loss": 0.9134, + "step": 3577 + }, + { + "epoch": 0.2424283488041195, + "grad_norm": 8.785436630249023, + "learning_rate": 9.612978301047299e-05, + "loss": 0.7546, + "step": 3578 + }, + { + "epoch": 0.2424961040720916, + "grad_norm": 7.386310577392578, + "learning_rate": 9.612841399137518e-05, + "loss": 1.0635, + "step": 3579 + }, + { + "epoch": 0.2425638593400637, + "grad_norm": 8.23388957977295, + "learning_rate": 9.612704497227736e-05, + "loss": 0.8598, + "step": 3580 + }, + { + "epoch": 0.24263161460803578, + "grad_norm": 8.498323440551758, + "learning_rate": 9.612567595317954e-05, + "loss": 1.083, + "step": 3581 + }, + { + "epoch": 0.24269936987600785, + "grad_norm": 7.849715709686279, + "learning_rate": 9.612430693408174e-05, + "loss": 0.7673, + "step": 3582 + }, + { + "epoch": 0.24276712514397994, + "grad_norm": 8.113242149353027, + "learning_rate": 9.612293791498392e-05, + "loss": 1.1003, + "step": 3583 + }, + { + "epoch": 0.24283488041195203, + "grad_norm": 6.983048915863037, + "learning_rate": 9.61215688958861e-05, + "loss": 0.8668, + "step": 3584 + }, + { + "epoch": 0.24290263567992412, + "grad_norm": 8.389126777648926, + "learning_rate": 9.612019987678829e-05, + "loss": 1.0008, + "step": 3585 + }, + { + "epoch": 0.2429703909478962, + "grad_norm": 7.593414783477783, + "learning_rate": 9.611883085769047e-05, + "loss": 0.9829, + "step": 3586 + }, + { + "epoch": 0.24303814621586828, + "grad_norm": 7.836172103881836, + "learning_rate": 9.611746183859265e-05, + "loss": 0.8128, + "step": 3587 + }, + { + "epoch": 0.24310590148384037, + "grad_norm": 8.981040954589844, + "learning_rate": 9.611609281949485e-05, + "loss": 1.1249, + "step": 3588 + }, + { + "epoch": 0.24317365675181246, + "grad_norm": 6.724935054779053, + "learning_rate": 9.611472380039703e-05, + "loss": 0.7699, + "step": 3589 + }, + { + "epoch": 0.24324141201978453, + "grad_norm": 9.456436157226562, + "learning_rate": 9.611335478129921e-05, + "loss": 1.0085, + "step": 3590 + }, + { + "epoch": 0.24330916728775662, + "grad_norm": 11.044548988342285, + "learning_rate": 9.611198576220139e-05, + "loss": 1.0671, + "step": 3591 + }, + { + "epoch": 0.2433769225557287, + "grad_norm": 10.590521812438965, + "learning_rate": 9.611061674310358e-05, + "loss": 1.0051, + "step": 3592 + }, + { + "epoch": 0.2434446778237008, + "grad_norm": 8.595380783081055, + "learning_rate": 9.610924772400576e-05, + "loss": 0.9655, + "step": 3593 + }, + { + "epoch": 0.24351243309167286, + "grad_norm": 7.794788837432861, + "learning_rate": 9.610787870490794e-05, + "loss": 0.9551, + "step": 3594 + }, + { + "epoch": 0.24358018835964496, + "grad_norm": 7.609074115753174, + "learning_rate": 9.610650968581012e-05, + "loss": 0.9856, + "step": 3595 + }, + { + "epoch": 0.24364794362761705, + "grad_norm": 6.909607410430908, + "learning_rate": 9.61051406667123e-05, + "loss": 1.0195, + "step": 3596 + }, + { + "epoch": 0.24371569889558914, + "grad_norm": 7.958381652832031, + "learning_rate": 9.61037716476145e-05, + "loss": 0.8263, + "step": 3597 + }, + { + "epoch": 0.24378345416356123, + "grad_norm": 7.38173246383667, + "learning_rate": 9.610240262851668e-05, + "loss": 0.8551, + "step": 3598 + }, + { + "epoch": 0.2438512094315333, + "grad_norm": 7.1448822021484375, + "learning_rate": 9.610103360941886e-05, + "loss": 0.8189, + "step": 3599 + }, + { + "epoch": 0.2439189646995054, + "grad_norm": 7.44658088684082, + "learning_rate": 9.609966459032104e-05, + "loss": 0.8056, + "step": 3600 + }, + { + "epoch": 0.24398671996747748, + "grad_norm": 7.480542182922363, + "learning_rate": 9.609829557122322e-05, + "loss": 0.8591, + "step": 3601 + }, + { + "epoch": 0.24405447523544957, + "grad_norm": 9.895995140075684, + "learning_rate": 9.609692655212541e-05, + "loss": 1.1256, + "step": 3602 + }, + { + "epoch": 0.24412223050342163, + "grad_norm": 7.009078025817871, + "learning_rate": 9.609555753302759e-05, + "loss": 0.9045, + "step": 3603 + }, + { + "epoch": 0.24418998577139372, + "grad_norm": 8.714953422546387, + "learning_rate": 9.609418851392977e-05, + "loss": 1.1207, + "step": 3604 + }, + { + "epoch": 0.24425774103936582, + "grad_norm": 7.239734172821045, + "learning_rate": 9.609281949483195e-05, + "loss": 0.7811, + "step": 3605 + }, + { + "epoch": 0.2443254963073379, + "grad_norm": 10.486507415771484, + "learning_rate": 9.609145047573415e-05, + "loss": 1.2213, + "step": 3606 + }, + { + "epoch": 0.24439325157530997, + "grad_norm": 12.031790733337402, + "learning_rate": 9.609008145663633e-05, + "loss": 1.0316, + "step": 3607 + }, + { + "epoch": 0.24446100684328206, + "grad_norm": 7.607183456420898, + "learning_rate": 9.608871243753851e-05, + "loss": 0.7754, + "step": 3608 + }, + { + "epoch": 0.24452876211125416, + "grad_norm": 9.313577651977539, + "learning_rate": 9.608734341844069e-05, + "loss": 0.9907, + "step": 3609 + }, + { + "epoch": 0.24459651737922625, + "grad_norm": 7.672274589538574, + "learning_rate": 9.608597439934287e-05, + "loss": 0.8428, + "step": 3610 + }, + { + "epoch": 0.2446642726471983, + "grad_norm": 8.259462356567383, + "learning_rate": 9.608460538024506e-05, + "loss": 0.8863, + "step": 3611 + }, + { + "epoch": 0.2447320279151704, + "grad_norm": 8.50256061553955, + "learning_rate": 9.608323636114724e-05, + "loss": 0.7771, + "step": 3612 + }, + { + "epoch": 0.2447997831831425, + "grad_norm": 6.8893818855285645, + "learning_rate": 9.608186734204942e-05, + "loss": 0.9791, + "step": 3613 + }, + { + "epoch": 0.24486753845111459, + "grad_norm": 8.954825401306152, + "learning_rate": 9.60804983229516e-05, + "loss": 1.0712, + "step": 3614 + }, + { + "epoch": 0.24493529371908665, + "grad_norm": 8.83995532989502, + "learning_rate": 9.60791293038538e-05, + "loss": 1.1982, + "step": 3615 + }, + { + "epoch": 0.24500304898705874, + "grad_norm": 8.96689224243164, + "learning_rate": 9.607776028475598e-05, + "loss": 0.9816, + "step": 3616 + }, + { + "epoch": 0.24507080425503083, + "grad_norm": 15.17086124420166, + "learning_rate": 9.607639126565816e-05, + "loss": 1.1416, + "step": 3617 + }, + { + "epoch": 0.24513855952300292, + "grad_norm": 9.527314186096191, + "learning_rate": 9.607502224656034e-05, + "loss": 1.1699, + "step": 3618 + }, + { + "epoch": 0.245206314790975, + "grad_norm": 7.378002166748047, + "learning_rate": 9.607365322746252e-05, + "loss": 1.1303, + "step": 3619 + }, + { + "epoch": 0.24527407005894708, + "grad_norm": 7.204291343688965, + "learning_rate": 9.607228420836471e-05, + "loss": 0.8078, + "step": 3620 + }, + { + "epoch": 0.24534182532691917, + "grad_norm": 8.181205749511719, + "learning_rate": 9.607091518926689e-05, + "loss": 0.9221, + "step": 3621 + }, + { + "epoch": 0.24540958059489126, + "grad_norm": 8.479545593261719, + "learning_rate": 9.606954617016907e-05, + "loss": 0.8133, + "step": 3622 + }, + { + "epoch": 0.24547733586286333, + "grad_norm": 7.591360092163086, + "learning_rate": 9.606817715107125e-05, + "loss": 1.0632, + "step": 3623 + }, + { + "epoch": 0.24554509113083542, + "grad_norm": 8.558969497680664, + "learning_rate": 9.606680813197343e-05, + "loss": 1.0755, + "step": 3624 + }, + { + "epoch": 0.2456128463988075, + "grad_norm": 8.02037525177002, + "learning_rate": 9.606543911287563e-05, + "loss": 1.0164, + "step": 3625 + }, + { + "epoch": 0.2456806016667796, + "grad_norm": 7.49207878112793, + "learning_rate": 9.606407009377781e-05, + "loss": 1.1325, + "step": 3626 + }, + { + "epoch": 0.24574835693475167, + "grad_norm": 7.376079559326172, + "learning_rate": 9.606270107467999e-05, + "loss": 0.7917, + "step": 3627 + }, + { + "epoch": 0.24581611220272376, + "grad_norm": 7.331247329711914, + "learning_rate": 9.606133205558218e-05, + "loss": 1.2186, + "step": 3628 + }, + { + "epoch": 0.24588386747069585, + "grad_norm": 7.393257141113281, + "learning_rate": 9.605996303648436e-05, + "loss": 0.7155, + "step": 3629 + }, + { + "epoch": 0.24595162273866794, + "grad_norm": 9.15472412109375, + "learning_rate": 9.605859401738654e-05, + "loss": 0.9562, + "step": 3630 + }, + { + "epoch": 0.24601937800664, + "grad_norm": 6.846646785736084, + "learning_rate": 9.605722499828874e-05, + "loss": 1.109, + "step": 3631 + }, + { + "epoch": 0.2460871332746121, + "grad_norm": 8.57854175567627, + "learning_rate": 9.605585597919092e-05, + "loss": 0.8706, + "step": 3632 + }, + { + "epoch": 0.2461548885425842, + "grad_norm": 7.089768886566162, + "learning_rate": 9.60544869600931e-05, + "loss": 0.8776, + "step": 3633 + }, + { + "epoch": 0.24622264381055628, + "grad_norm": 7.549044132232666, + "learning_rate": 9.605311794099529e-05, + "loss": 0.9956, + "step": 3634 + }, + { + "epoch": 0.24629039907852834, + "grad_norm": 6.839412689208984, + "learning_rate": 9.605174892189747e-05, + "loss": 0.8722, + "step": 3635 + }, + { + "epoch": 0.24635815434650044, + "grad_norm": 7.378058910369873, + "learning_rate": 9.605037990279965e-05, + "loss": 0.8663, + "step": 3636 + }, + { + "epoch": 0.24642590961447253, + "grad_norm": 7.893070220947266, + "learning_rate": 9.604901088370183e-05, + "loss": 0.9213, + "step": 3637 + }, + { + "epoch": 0.24649366488244462, + "grad_norm": 7.3345232009887695, + "learning_rate": 9.604764186460403e-05, + "loss": 0.9827, + "step": 3638 + }, + { + "epoch": 0.24656142015041668, + "grad_norm": 6.120781421661377, + "learning_rate": 9.60462728455062e-05, + "loss": 0.6371, + "step": 3639 + }, + { + "epoch": 0.24662917541838877, + "grad_norm": 8.695615768432617, + "learning_rate": 9.604490382640839e-05, + "loss": 0.9769, + "step": 3640 + }, + { + "epoch": 0.24669693068636087, + "grad_norm": 8.469325065612793, + "learning_rate": 9.604353480731057e-05, + "loss": 0.9267, + "step": 3641 + }, + { + "epoch": 0.24676468595433296, + "grad_norm": 9.191173553466797, + "learning_rate": 9.604216578821275e-05, + "loss": 1.1818, + "step": 3642 + }, + { + "epoch": 0.24683244122230502, + "grad_norm": 9.337483406066895, + "learning_rate": 9.604079676911494e-05, + "loss": 0.9904, + "step": 3643 + }, + { + "epoch": 0.2469001964902771, + "grad_norm": 7.597773551940918, + "learning_rate": 9.603942775001712e-05, + "loss": 0.8928, + "step": 3644 + }, + { + "epoch": 0.2469679517582492, + "grad_norm": 8.155903816223145, + "learning_rate": 9.60380587309193e-05, + "loss": 0.9511, + "step": 3645 + }, + { + "epoch": 0.2470357070262213, + "grad_norm": 7.695154666900635, + "learning_rate": 9.603668971182148e-05, + "loss": 1.0724, + "step": 3646 + }, + { + "epoch": 0.24710346229419336, + "grad_norm": 7.492908000946045, + "learning_rate": 9.603532069272366e-05, + "loss": 0.9133, + "step": 3647 + }, + { + "epoch": 0.24717121756216545, + "grad_norm": 8.190613746643066, + "learning_rate": 9.603395167362586e-05, + "loss": 0.8972, + "step": 3648 + }, + { + "epoch": 0.24723897283013754, + "grad_norm": 7.9199347496032715, + "learning_rate": 9.603258265452804e-05, + "loss": 1.0501, + "step": 3649 + }, + { + "epoch": 0.24730672809810963, + "grad_norm": 8.386896133422852, + "learning_rate": 9.603121363543022e-05, + "loss": 0.998, + "step": 3650 + }, + { + "epoch": 0.24737448336608173, + "grad_norm": 6.536781311035156, + "learning_rate": 9.60298446163324e-05, + "loss": 0.8379, + "step": 3651 + }, + { + "epoch": 0.2474422386340538, + "grad_norm": 6.918766021728516, + "learning_rate": 9.602847559723459e-05, + "loss": 0.9397, + "step": 3652 + }, + { + "epoch": 0.24750999390202588, + "grad_norm": 7.00775146484375, + "learning_rate": 9.602710657813677e-05, + "loss": 1.0079, + "step": 3653 + }, + { + "epoch": 0.24757774916999797, + "grad_norm": 7.693192958831787, + "learning_rate": 9.602573755903895e-05, + "loss": 1.0657, + "step": 3654 + }, + { + "epoch": 0.24764550443797007, + "grad_norm": 10.667771339416504, + "learning_rate": 9.602436853994113e-05, + "loss": 1.0657, + "step": 3655 + }, + { + "epoch": 0.24771325970594213, + "grad_norm": 8.622758865356445, + "learning_rate": 9.602299952084331e-05, + "loss": 0.9792, + "step": 3656 + }, + { + "epoch": 0.24778101497391422, + "grad_norm": 8.834444046020508, + "learning_rate": 9.602163050174551e-05, + "loss": 1.1014, + "step": 3657 + }, + { + "epoch": 0.2478487702418863, + "grad_norm": 7.717538356781006, + "learning_rate": 9.602026148264769e-05, + "loss": 0.8781, + "step": 3658 + }, + { + "epoch": 0.2479165255098584, + "grad_norm": 7.797954559326172, + "learning_rate": 9.601889246354987e-05, + "loss": 0.9772, + "step": 3659 + }, + { + "epoch": 0.24798428077783047, + "grad_norm": 7.376112937927246, + "learning_rate": 9.601752344445205e-05, + "loss": 1.0468, + "step": 3660 + }, + { + "epoch": 0.24805203604580256, + "grad_norm": 7.7266387939453125, + "learning_rate": 9.601615442535424e-05, + "loss": 1.007, + "step": 3661 + }, + { + "epoch": 0.24811979131377465, + "grad_norm": 9.196928977966309, + "learning_rate": 9.601478540625642e-05, + "loss": 1.1327, + "step": 3662 + }, + { + "epoch": 0.24818754658174674, + "grad_norm": 7.892288684844971, + "learning_rate": 9.60134163871586e-05, + "loss": 0.7511, + "step": 3663 + }, + { + "epoch": 0.2482553018497188, + "grad_norm": 8.854056358337402, + "learning_rate": 9.601204736806078e-05, + "loss": 0.9546, + "step": 3664 + }, + { + "epoch": 0.2483230571176909, + "grad_norm": 7.985452651977539, + "learning_rate": 9.601067834896296e-05, + "loss": 1.2186, + "step": 3665 + }, + { + "epoch": 0.248390812385663, + "grad_norm": 8.336162567138672, + "learning_rate": 9.600930932986516e-05, + "loss": 0.864, + "step": 3666 + }, + { + "epoch": 0.24845856765363508, + "grad_norm": 10.857757568359375, + "learning_rate": 9.600794031076734e-05, + "loss": 1.1948, + "step": 3667 + }, + { + "epoch": 0.24852632292160715, + "grad_norm": 8.168721199035645, + "learning_rate": 9.600657129166952e-05, + "loss": 1.1151, + "step": 3668 + }, + { + "epoch": 0.24859407818957924, + "grad_norm": 7.509332656860352, + "learning_rate": 9.60052022725717e-05, + "loss": 0.9215, + "step": 3669 + }, + { + "epoch": 0.24866183345755133, + "grad_norm": 10.4354829788208, + "learning_rate": 9.600383325347389e-05, + "loss": 1.0696, + "step": 3670 + }, + { + "epoch": 0.24872958872552342, + "grad_norm": 6.930381774902344, + "learning_rate": 9.600246423437607e-05, + "loss": 0.9229, + "step": 3671 + }, + { + "epoch": 0.24879734399349548, + "grad_norm": 6.608088970184326, + "learning_rate": 9.600109521527825e-05, + "loss": 0.928, + "step": 3672 + }, + { + "epoch": 0.24886509926146758, + "grad_norm": 7.4217095375061035, + "learning_rate": 9.599972619618043e-05, + "loss": 0.9608, + "step": 3673 + }, + { + "epoch": 0.24893285452943967, + "grad_norm": 7.46991491317749, + "learning_rate": 9.599835717708263e-05, + "loss": 0.8928, + "step": 3674 + }, + { + "epoch": 0.24900060979741176, + "grad_norm": 8.76001262664795, + "learning_rate": 9.599698815798481e-05, + "loss": 1.191, + "step": 3675 + }, + { + "epoch": 0.24906836506538382, + "grad_norm": 8.395779609680176, + "learning_rate": 9.599561913888699e-05, + "loss": 1.1479, + "step": 3676 + }, + { + "epoch": 0.24913612033335591, + "grad_norm": 8.00460147857666, + "learning_rate": 9.599425011978918e-05, + "loss": 1.1005, + "step": 3677 + }, + { + "epoch": 0.249203875601328, + "grad_norm": 7.817287445068359, + "learning_rate": 9.599288110069136e-05, + "loss": 1.1568, + "step": 3678 + }, + { + "epoch": 0.2492716308693001, + "grad_norm": 9.60706615447998, + "learning_rate": 9.599151208159354e-05, + "loss": 0.9734, + "step": 3679 + }, + { + "epoch": 0.24933938613727216, + "grad_norm": 8.347691535949707, + "learning_rate": 9.599014306249574e-05, + "loss": 1.2208, + "step": 3680 + }, + { + "epoch": 0.24940714140524425, + "grad_norm": 8.106613159179688, + "learning_rate": 9.598877404339792e-05, + "loss": 0.9253, + "step": 3681 + }, + { + "epoch": 0.24947489667321635, + "grad_norm": 8.099063873291016, + "learning_rate": 9.59874050243001e-05, + "loss": 1.0516, + "step": 3682 + }, + { + "epoch": 0.24954265194118844, + "grad_norm": 7.839589595794678, + "learning_rate": 9.598603600520228e-05, + "loss": 0.9764, + "step": 3683 + }, + { + "epoch": 0.2496104072091605, + "grad_norm": 8.679840087890625, + "learning_rate": 9.598466698610447e-05, + "loss": 1.0407, + "step": 3684 + }, + { + "epoch": 0.2496781624771326, + "grad_norm": 6.854926109313965, + "learning_rate": 9.598329796700665e-05, + "loss": 0.9457, + "step": 3685 + }, + { + "epoch": 0.24974591774510468, + "grad_norm": 9.133661270141602, + "learning_rate": 9.598192894790883e-05, + "loss": 1.0391, + "step": 3686 + }, + { + "epoch": 0.24981367301307678, + "grad_norm": 7.744726657867432, + "learning_rate": 9.598055992881101e-05, + "loss": 1.0256, + "step": 3687 + }, + { + "epoch": 0.24988142828104884, + "grad_norm": 8.9452543258667, + "learning_rate": 9.597919090971319e-05, + "loss": 0.9649, + "step": 3688 + }, + { + "epoch": 0.24994918354902093, + "grad_norm": 8.396431922912598, + "learning_rate": 9.597782189061539e-05, + "loss": 0.8189, + "step": 3689 + }, + { + "epoch": 0.24994918354902093, + "eval_loss": 0.9518795609474182, + "eval_noise_accuracy": 0.0, + "eval_runtime": 1533.2421, + "eval_samples_per_second": 3.352, + "eval_steps_per_second": 0.21, + "eval_wer": 88.77438705594233, + "step": 3689 + }, + { + "epoch": 0.250016938816993, + "grad_norm": 7.252607822418213, + "learning_rate": 9.597645287151757e-05, + "loss": 1.0734, + "step": 3690 + }, + { + "epoch": 0.2500846940849651, + "grad_norm": 8.768789291381836, + "learning_rate": 9.597508385241975e-05, + "loss": 0.9763, + "step": 3691 + }, + { + "epoch": 0.2501524493529372, + "grad_norm": 6.9475321769714355, + "learning_rate": 9.597371483332193e-05, + "loss": 0.8121, + "step": 3692 + }, + { + "epoch": 0.25022020462090927, + "grad_norm": 7.9007248878479, + "learning_rate": 9.597234581422412e-05, + "loss": 0.9668, + "step": 3693 + }, + { + "epoch": 0.25028795988888136, + "grad_norm": 6.524989604949951, + "learning_rate": 9.59709767951263e-05, + "loss": 0.9151, + "step": 3694 + }, + { + "epoch": 0.25035571515685345, + "grad_norm": 7.83770751953125, + "learning_rate": 9.596960777602848e-05, + "loss": 0.8338, + "step": 3695 + }, + { + "epoch": 0.25042347042482554, + "grad_norm": 8.619128227233887, + "learning_rate": 9.596823875693066e-05, + "loss": 1.0394, + "step": 3696 + }, + { + "epoch": 0.25049122569279764, + "grad_norm": 7.238592624664307, + "learning_rate": 9.596686973783284e-05, + "loss": 0.8773, + "step": 3697 + }, + { + "epoch": 0.2505589809607697, + "grad_norm": 10.257181167602539, + "learning_rate": 9.596550071873504e-05, + "loss": 1.0137, + "step": 3698 + }, + { + "epoch": 0.25062673622874176, + "grad_norm": 9.302980422973633, + "learning_rate": 9.596413169963722e-05, + "loss": 0.8317, + "step": 3699 + }, + { + "epoch": 0.25069449149671386, + "grad_norm": 7.485314846038818, + "learning_rate": 9.59627626805394e-05, + "loss": 0.9272, + "step": 3700 + }, + { + "epoch": 0.25076224676468595, + "grad_norm": 7.870807647705078, + "learning_rate": 9.596139366144158e-05, + "loss": 1.009, + "step": 3701 + }, + { + "epoch": 0.25083000203265804, + "grad_norm": 8.245805740356445, + "learning_rate": 9.596002464234376e-05, + "loss": 0.8616, + "step": 3702 + }, + { + "epoch": 0.25089775730063013, + "grad_norm": 10.324470520019531, + "learning_rate": 9.595865562324595e-05, + "loss": 1.1567, + "step": 3703 + }, + { + "epoch": 0.2509655125686022, + "grad_norm": 7.588774681091309, + "learning_rate": 9.595728660414813e-05, + "loss": 1.0025, + "step": 3704 + }, + { + "epoch": 0.2510332678365743, + "grad_norm": 8.32935905456543, + "learning_rate": 9.595591758505031e-05, + "loss": 1.1826, + "step": 3705 + }, + { + "epoch": 0.2511010231045464, + "grad_norm": 7.146906852722168, + "learning_rate": 9.595454856595249e-05, + "loss": 0.8108, + "step": 3706 + }, + { + "epoch": 0.25116877837251844, + "grad_norm": 8.91852855682373, + "learning_rate": 9.595317954685469e-05, + "loss": 1.1377, + "step": 3707 + }, + { + "epoch": 0.25123653364049053, + "grad_norm": 7.726437568664551, + "learning_rate": 9.595181052775687e-05, + "loss": 0.9971, + "step": 3708 + }, + { + "epoch": 0.2513042889084626, + "grad_norm": 11.102527618408203, + "learning_rate": 9.595044150865905e-05, + "loss": 1.1075, + "step": 3709 + }, + { + "epoch": 0.2513720441764347, + "grad_norm": 6.124303340911865, + "learning_rate": 9.594907248956123e-05, + "loss": 0.8523, + "step": 3710 + }, + { + "epoch": 0.2514397994444068, + "grad_norm": 8.56926441192627, + "learning_rate": 9.594770347046341e-05, + "loss": 1.0801, + "step": 3711 + }, + { + "epoch": 0.2515075547123789, + "grad_norm": 6.994394779205322, + "learning_rate": 9.59463344513656e-05, + "loss": 0.8559, + "step": 3712 + }, + { + "epoch": 0.251575309980351, + "grad_norm": 7.428825378417969, + "learning_rate": 9.594496543226778e-05, + "loss": 0.8861, + "step": 3713 + }, + { + "epoch": 0.2516430652483231, + "grad_norm": 8.7849760055542, + "learning_rate": 9.594359641316996e-05, + "loss": 0.9083, + "step": 3714 + }, + { + "epoch": 0.2517108205162951, + "grad_norm": 7.608119964599609, + "learning_rate": 9.594222739407214e-05, + "loss": 0.8529, + "step": 3715 + }, + { + "epoch": 0.2517785757842672, + "grad_norm": 7.076242446899414, + "learning_rate": 9.594085837497434e-05, + "loss": 0.8817, + "step": 3716 + }, + { + "epoch": 0.2518463310522393, + "grad_norm": 8.78627872467041, + "learning_rate": 9.593948935587652e-05, + "loss": 1.1752, + "step": 3717 + }, + { + "epoch": 0.2519140863202114, + "grad_norm": 8.589457511901855, + "learning_rate": 9.59381203367787e-05, + "loss": 0.9099, + "step": 3718 + }, + { + "epoch": 0.2519818415881835, + "grad_norm": 8.67271900177002, + "learning_rate": 9.593675131768088e-05, + "loss": 1.0304, + "step": 3719 + }, + { + "epoch": 0.2520495968561556, + "grad_norm": 9.180156707763672, + "learning_rate": 9.593538229858307e-05, + "loss": 1.0815, + "step": 3720 + }, + { + "epoch": 0.25211735212412767, + "grad_norm": 7.973734378814697, + "learning_rate": 9.593401327948525e-05, + "loss": 1.0816, + "step": 3721 + }, + { + "epoch": 0.25218510739209976, + "grad_norm": 7.272556781768799, + "learning_rate": 9.593264426038743e-05, + "loss": 0.9027, + "step": 3722 + }, + { + "epoch": 0.2522528626600718, + "grad_norm": 8.033550262451172, + "learning_rate": 9.593127524128963e-05, + "loss": 1.0772, + "step": 3723 + }, + { + "epoch": 0.2523206179280439, + "grad_norm": 7.861289978027344, + "learning_rate": 9.59299062221918e-05, + "loss": 0.8774, + "step": 3724 + }, + { + "epoch": 0.252388373196016, + "grad_norm": 7.043121337890625, + "learning_rate": 9.592853720309399e-05, + "loss": 1.0165, + "step": 3725 + }, + { + "epoch": 0.25245612846398807, + "grad_norm": 7.830938816070557, + "learning_rate": 9.592716818399618e-05, + "loss": 1.0379, + "step": 3726 + }, + { + "epoch": 0.25252388373196016, + "grad_norm": 8.138580322265625, + "learning_rate": 9.592579916489836e-05, + "loss": 0.954, + "step": 3727 + }, + { + "epoch": 0.25259163899993226, + "grad_norm": 6.810534477233887, + "learning_rate": 9.592443014580054e-05, + "loss": 0.7475, + "step": 3728 + }, + { + "epoch": 0.25265939426790435, + "grad_norm": 6.66425895690918, + "learning_rate": 9.592306112670272e-05, + "loss": 0.8203, + "step": 3729 + }, + { + "epoch": 0.25272714953587644, + "grad_norm": 7.962128162384033, + "learning_rate": 9.592169210760492e-05, + "loss": 0.9923, + "step": 3730 + }, + { + "epoch": 0.2527949048038485, + "grad_norm": 8.372082710266113, + "learning_rate": 9.59203230885071e-05, + "loss": 0.809, + "step": 3731 + }, + { + "epoch": 0.25286266007182057, + "grad_norm": 8.343878746032715, + "learning_rate": 9.591895406940928e-05, + "loss": 0.9373, + "step": 3732 + }, + { + "epoch": 0.25293041533979266, + "grad_norm": 9.935523986816406, + "learning_rate": 9.591758505031146e-05, + "loss": 1.0037, + "step": 3733 + }, + { + "epoch": 0.25299817060776475, + "grad_norm": 8.378336906433105, + "learning_rate": 9.591621603121364e-05, + "loss": 0.9834, + "step": 3734 + }, + { + "epoch": 0.25306592587573684, + "grad_norm": 8.581600189208984, + "learning_rate": 9.591484701211583e-05, + "loss": 1.1192, + "step": 3735 + }, + { + "epoch": 0.25313368114370893, + "grad_norm": 8.007279396057129, + "learning_rate": 9.591347799301801e-05, + "loss": 1.0261, + "step": 3736 + }, + { + "epoch": 0.253201436411681, + "grad_norm": 7.40525484085083, + "learning_rate": 9.591210897392019e-05, + "loss": 0.9273, + "step": 3737 + }, + { + "epoch": 0.2532691916796531, + "grad_norm": 7.228291988372803, + "learning_rate": 9.591073995482237e-05, + "loss": 1.0219, + "step": 3738 + }, + { + "epoch": 0.25333694694762515, + "grad_norm": 6.714911460876465, + "learning_rate": 9.590937093572457e-05, + "loss": 0.9134, + "step": 3739 + }, + { + "epoch": 0.25340470221559724, + "grad_norm": 6.150938510894775, + "learning_rate": 9.590800191662675e-05, + "loss": 0.8577, + "step": 3740 + }, + { + "epoch": 0.25347245748356934, + "grad_norm": 10.693168640136719, + "learning_rate": 9.590663289752893e-05, + "loss": 1.2867, + "step": 3741 + }, + { + "epoch": 0.2535402127515414, + "grad_norm": 8.114953994750977, + "learning_rate": 9.590526387843111e-05, + "loss": 1.0602, + "step": 3742 + }, + { + "epoch": 0.2536079680195135, + "grad_norm": 8.472567558288574, + "learning_rate": 9.590389485933329e-05, + "loss": 0.9992, + "step": 3743 + }, + { + "epoch": 0.2536757232874856, + "grad_norm": 8.681047439575195, + "learning_rate": 9.590252584023548e-05, + "loss": 1.003, + "step": 3744 + }, + { + "epoch": 0.2537434785554577, + "grad_norm": 8.753557205200195, + "learning_rate": 9.590115682113766e-05, + "loss": 0.9447, + "step": 3745 + }, + { + "epoch": 0.2538112338234298, + "grad_norm": 6.80125093460083, + "learning_rate": 9.589978780203984e-05, + "loss": 0.8522, + "step": 3746 + }, + { + "epoch": 0.25387898909140183, + "grad_norm": 8.829830169677734, + "learning_rate": 9.589841878294202e-05, + "loss": 0.9131, + "step": 3747 + }, + { + "epoch": 0.2539467443593739, + "grad_norm": 7.068274021148682, + "learning_rate": 9.589704976384422e-05, + "loss": 0.8348, + "step": 3748 + }, + { + "epoch": 0.254014499627346, + "grad_norm": 8.19235897064209, + "learning_rate": 9.58956807447464e-05, + "loss": 0.8975, + "step": 3749 + }, + { + "epoch": 0.2540822548953181, + "grad_norm": 8.1896333694458, + "learning_rate": 9.589431172564858e-05, + "loss": 1.1285, + "step": 3750 + }, + { + "epoch": 0.2541500101632902, + "grad_norm": 7.581019878387451, + "learning_rate": 9.589294270655076e-05, + "loss": 0.8838, + "step": 3751 + }, + { + "epoch": 0.2542177654312623, + "grad_norm": 6.806415557861328, + "learning_rate": 9.589157368745294e-05, + "loss": 0.9144, + "step": 3752 + }, + { + "epoch": 0.2542855206992344, + "grad_norm": 6.077991485595703, + "learning_rate": 9.589020466835513e-05, + "loss": 0.7032, + "step": 3753 + }, + { + "epoch": 0.25435327596720647, + "grad_norm": 9.278702735900879, + "learning_rate": 9.588883564925731e-05, + "loss": 1.1806, + "step": 3754 + }, + { + "epoch": 0.2544210312351785, + "grad_norm": 7.136252403259277, + "learning_rate": 9.588746663015949e-05, + "loss": 0.9589, + "step": 3755 + }, + { + "epoch": 0.2544887865031506, + "grad_norm": 6.349377632141113, + "learning_rate": 9.588609761106167e-05, + "loss": 0.9972, + "step": 3756 + }, + { + "epoch": 0.2545565417711227, + "grad_norm": 10.120612144470215, + "learning_rate": 9.588472859196385e-05, + "loss": 1.124, + "step": 3757 + }, + { + "epoch": 0.2546242970390948, + "grad_norm": 6.840261459350586, + "learning_rate": 9.588335957286605e-05, + "loss": 0.8897, + "step": 3758 + }, + { + "epoch": 0.2546920523070669, + "grad_norm": 6.069836616516113, + "learning_rate": 9.588199055376823e-05, + "loss": 0.7794, + "step": 3759 + }, + { + "epoch": 0.25475980757503897, + "grad_norm": 6.336123943328857, + "learning_rate": 9.588062153467041e-05, + "loss": 0.8936, + "step": 3760 + }, + { + "epoch": 0.25482756284301106, + "grad_norm": 7.337663173675537, + "learning_rate": 9.587925251557259e-05, + "loss": 0.9241, + "step": 3761 + }, + { + "epoch": 0.25489531811098315, + "grad_norm": 9.220332145690918, + "learning_rate": 9.587788349647478e-05, + "loss": 1.2426, + "step": 3762 + }, + { + "epoch": 0.25496307337895524, + "grad_norm": 6.4654951095581055, + "learning_rate": 9.587651447737696e-05, + "loss": 0.7795, + "step": 3763 + }, + { + "epoch": 0.2550308286469273, + "grad_norm": 8.458954811096191, + "learning_rate": 9.587514545827914e-05, + "loss": 1.0493, + "step": 3764 + }, + { + "epoch": 0.25509858391489937, + "grad_norm": 6.73598575592041, + "learning_rate": 9.587377643918132e-05, + "loss": 0.9777, + "step": 3765 + }, + { + "epoch": 0.25516633918287146, + "grad_norm": 8.673493385314941, + "learning_rate": 9.587240742008352e-05, + "loss": 1.1108, + "step": 3766 + }, + { + "epoch": 0.25523409445084355, + "grad_norm": 7.328574180603027, + "learning_rate": 9.58710384009857e-05, + "loss": 0.9502, + "step": 3767 + }, + { + "epoch": 0.25530184971881564, + "grad_norm": 7.9878692626953125, + "learning_rate": 9.586966938188788e-05, + "loss": 1.0191, + "step": 3768 + }, + { + "epoch": 0.25536960498678773, + "grad_norm": 7.247650623321533, + "learning_rate": 9.586830036279007e-05, + "loss": 0.9356, + "step": 3769 + }, + { + "epoch": 0.2554373602547598, + "grad_norm": 7.29271125793457, + "learning_rate": 9.586693134369225e-05, + "loss": 0.9877, + "step": 3770 + }, + { + "epoch": 0.2555051155227319, + "grad_norm": 7.802029132843018, + "learning_rate": 9.586556232459443e-05, + "loss": 0.8923, + "step": 3771 + }, + { + "epoch": 0.25557287079070395, + "grad_norm": 8.129645347595215, + "learning_rate": 9.586419330549663e-05, + "loss": 0.9451, + "step": 3772 + }, + { + "epoch": 0.25564062605867605, + "grad_norm": 7.033285140991211, + "learning_rate": 9.58628242863988e-05, + "loss": 0.8529, + "step": 3773 + }, + { + "epoch": 0.25570838132664814, + "grad_norm": 7.490065574645996, + "learning_rate": 9.586145526730099e-05, + "loss": 0.8912, + "step": 3774 + }, + { + "epoch": 0.25577613659462023, + "grad_norm": 8.730104446411133, + "learning_rate": 9.586008624820317e-05, + "loss": 1.2685, + "step": 3775 + }, + { + "epoch": 0.2558438918625923, + "grad_norm": 9.958065032958984, + "learning_rate": 9.585871722910536e-05, + "loss": 1.1792, + "step": 3776 + }, + { + "epoch": 0.2559116471305644, + "grad_norm": 6.82180118560791, + "learning_rate": 9.585734821000754e-05, + "loss": 0.8734, + "step": 3777 + }, + { + "epoch": 0.2559794023985365, + "grad_norm": 9.444950103759766, + "learning_rate": 9.585597919090972e-05, + "loss": 1.0368, + "step": 3778 + }, + { + "epoch": 0.2560471576665086, + "grad_norm": 9.437919616699219, + "learning_rate": 9.58546101718119e-05, + "loss": 1.2728, + "step": 3779 + }, + { + "epoch": 0.25611491293448063, + "grad_norm": 8.925026893615723, + "learning_rate": 9.585324115271408e-05, + "loss": 0.944, + "step": 3780 + }, + { + "epoch": 0.2561826682024527, + "grad_norm": 8.421260833740234, + "learning_rate": 9.585187213361628e-05, + "loss": 0.9913, + "step": 3781 + }, + { + "epoch": 0.2562504234704248, + "grad_norm": 8.5851469039917, + "learning_rate": 9.585050311451846e-05, + "loss": 0.9885, + "step": 3782 + }, + { + "epoch": 0.2563181787383969, + "grad_norm": 7.179548263549805, + "learning_rate": 9.584913409542064e-05, + "loss": 1.0647, + "step": 3783 + }, + { + "epoch": 0.256385934006369, + "grad_norm": 8.60708999633789, + "learning_rate": 9.584776507632282e-05, + "loss": 1.0475, + "step": 3784 + }, + { + "epoch": 0.2564536892743411, + "grad_norm": 8.462443351745605, + "learning_rate": 9.584639605722501e-05, + "loss": 1.1381, + "step": 3785 + }, + { + "epoch": 0.2565214445423132, + "grad_norm": 7.044154644012451, + "learning_rate": 9.584502703812719e-05, + "loss": 0.8223, + "step": 3786 + }, + { + "epoch": 0.2565891998102853, + "grad_norm": 8.842753410339355, + "learning_rate": 9.584365801902937e-05, + "loss": 1.1205, + "step": 3787 + }, + { + "epoch": 0.2566569550782573, + "grad_norm": 7.294439792633057, + "learning_rate": 9.584228899993155e-05, + "loss": 0.9563, + "step": 3788 + }, + { + "epoch": 0.2567247103462294, + "grad_norm": 7.842654228210449, + "learning_rate": 9.584091998083373e-05, + "loss": 1.2798, + "step": 3789 + }, + { + "epoch": 0.2567924656142015, + "grad_norm": 5.479234218597412, + "learning_rate": 9.583955096173593e-05, + "loss": 0.7835, + "step": 3790 + }, + { + "epoch": 0.2568602208821736, + "grad_norm": 7.670284271240234, + "learning_rate": 9.58381819426381e-05, + "loss": 0.8263, + "step": 3791 + }, + { + "epoch": 0.2569279761501457, + "grad_norm": 8.564105987548828, + "learning_rate": 9.583681292354029e-05, + "loss": 1.032, + "step": 3792 + }, + { + "epoch": 0.25699573141811777, + "grad_norm": 8.706098556518555, + "learning_rate": 9.583544390444247e-05, + "loss": 1.0166, + "step": 3793 + }, + { + "epoch": 0.25706348668608986, + "grad_norm": 6.727125644683838, + "learning_rate": 9.583407488534466e-05, + "loss": 0.756, + "step": 3794 + }, + { + "epoch": 0.25713124195406195, + "grad_norm": 8.941418647766113, + "learning_rate": 9.583270586624684e-05, + "loss": 1.0223, + "step": 3795 + }, + { + "epoch": 0.257198997222034, + "grad_norm": 7.35167932510376, + "learning_rate": 9.583133684714902e-05, + "loss": 0.9117, + "step": 3796 + }, + { + "epoch": 0.2572667524900061, + "grad_norm": 9.245199203491211, + "learning_rate": 9.58299678280512e-05, + "loss": 1.0131, + "step": 3797 + }, + { + "epoch": 0.25733450775797817, + "grad_norm": 10.583107948303223, + "learning_rate": 9.582859880895338e-05, + "loss": 1.0452, + "step": 3798 + }, + { + "epoch": 0.25740226302595026, + "grad_norm": 7.672145843505859, + "learning_rate": 9.582722978985558e-05, + "loss": 0.9961, + "step": 3799 + }, + { + "epoch": 0.25747001829392235, + "grad_norm": 6.5851640701293945, + "learning_rate": 9.582586077075776e-05, + "loss": 0.9217, + "step": 3800 + }, + { + "epoch": 0.25753777356189445, + "grad_norm": 9.444985389709473, + "learning_rate": 9.582449175165994e-05, + "loss": 1.1704, + "step": 3801 + }, + { + "epoch": 0.25760552882986654, + "grad_norm": 7.136216163635254, + "learning_rate": 9.582312273256212e-05, + "loss": 0.9597, + "step": 3802 + }, + { + "epoch": 0.25767328409783863, + "grad_norm": 9.74889087677002, + "learning_rate": 9.582175371346431e-05, + "loss": 0.9866, + "step": 3803 + }, + { + "epoch": 0.25774103936581066, + "grad_norm": 7.692512512207031, + "learning_rate": 9.582038469436649e-05, + "loss": 1.0412, + "step": 3804 + }, + { + "epoch": 0.25780879463378276, + "grad_norm": 7.882124423980713, + "learning_rate": 9.581901567526867e-05, + "loss": 1.1129, + "step": 3805 + }, + { + "epoch": 0.25787654990175485, + "grad_norm": 8.117066383361816, + "learning_rate": 9.581764665617085e-05, + "loss": 0.968, + "step": 3806 + }, + { + "epoch": 0.25794430516972694, + "grad_norm": 8.883692741394043, + "learning_rate": 9.581627763707303e-05, + "loss": 1.0859, + "step": 3807 + }, + { + "epoch": 0.25801206043769903, + "grad_norm": 7.985278129577637, + "learning_rate": 9.581490861797523e-05, + "loss": 0.9511, + "step": 3808 + }, + { + "epoch": 0.2580798157056711, + "grad_norm": 10.487812995910645, + "learning_rate": 9.58135395988774e-05, + "loss": 1.2395, + "step": 3809 + }, + { + "epoch": 0.2581475709736432, + "grad_norm": 7.5507707595825195, + "learning_rate": 9.581217057977959e-05, + "loss": 0.9959, + "step": 3810 + }, + { + "epoch": 0.2582153262416153, + "grad_norm": 9.950063705444336, + "learning_rate": 9.581080156068177e-05, + "loss": 0.9453, + "step": 3811 + }, + { + "epoch": 0.2582830815095874, + "grad_norm": 7.301966190338135, + "learning_rate": 9.580943254158396e-05, + "loss": 0.8468, + "step": 3812 + }, + { + "epoch": 0.25835083677755943, + "grad_norm": 6.434390544891357, + "learning_rate": 9.580806352248614e-05, + "loss": 1.0513, + "step": 3813 + }, + { + "epoch": 0.2584185920455315, + "grad_norm": 8.878791809082031, + "learning_rate": 9.580669450338832e-05, + "loss": 1.1264, + "step": 3814 + }, + { + "epoch": 0.2584863473135036, + "grad_norm": 8.107111930847168, + "learning_rate": 9.580532548429052e-05, + "loss": 1.3037, + "step": 3815 + }, + { + "epoch": 0.2585541025814757, + "grad_norm": 7.54078483581543, + "learning_rate": 9.58039564651927e-05, + "loss": 0.917, + "step": 3816 + }, + { + "epoch": 0.2586218578494478, + "grad_norm": 9.369047164916992, + "learning_rate": 9.580258744609488e-05, + "loss": 0.8761, + "step": 3817 + }, + { + "epoch": 0.2586896131174199, + "grad_norm": 8.927732467651367, + "learning_rate": 9.580121842699707e-05, + "loss": 1.0591, + "step": 3818 + }, + { + "epoch": 0.258757368385392, + "grad_norm": 9.261579513549805, + "learning_rate": 9.579984940789925e-05, + "loss": 1.0671, + "step": 3819 + }, + { + "epoch": 0.2588251236533641, + "grad_norm": 7.396904468536377, + "learning_rate": 9.579848038880143e-05, + "loss": 0.8242, + "step": 3820 + }, + { + "epoch": 0.2588928789213361, + "grad_norm": 6.665501594543457, + "learning_rate": 9.579711136970361e-05, + "loss": 0.7686, + "step": 3821 + }, + { + "epoch": 0.2589606341893082, + "grad_norm": 7.0808281898498535, + "learning_rate": 9.57957423506058e-05, + "loss": 0.9757, + "step": 3822 + }, + { + "epoch": 0.2590283894572803, + "grad_norm": 7.734886646270752, + "learning_rate": 9.579437333150799e-05, + "loss": 1.0365, + "step": 3823 + }, + { + "epoch": 0.2590961447252524, + "grad_norm": 7.383622169494629, + "learning_rate": 9.579300431241017e-05, + "loss": 0.8458, + "step": 3824 + }, + { + "epoch": 0.2591638999932245, + "grad_norm": 7.758030891418457, + "learning_rate": 9.579163529331235e-05, + "loss": 0.9454, + "step": 3825 + }, + { + "epoch": 0.25923165526119657, + "grad_norm": 9.456972122192383, + "learning_rate": 9.579026627421454e-05, + "loss": 1.065, + "step": 3826 + }, + { + "epoch": 0.25929941052916866, + "grad_norm": 8.046957969665527, + "learning_rate": 9.578889725511672e-05, + "loss": 0.7021, + "step": 3827 + }, + { + "epoch": 0.25936716579714075, + "grad_norm": 9.415145874023438, + "learning_rate": 9.57875282360189e-05, + "loss": 1.2344, + "step": 3828 + }, + { + "epoch": 0.2594349210651128, + "grad_norm": 6.525821685791016, + "learning_rate": 9.578615921692108e-05, + "loss": 0.8005, + "step": 3829 + }, + { + "epoch": 0.2595026763330849, + "grad_norm": 6.948854923248291, + "learning_rate": 9.578479019782326e-05, + "loss": 0.8313, + "step": 3830 + }, + { + "epoch": 0.259570431601057, + "grad_norm": 6.991540431976318, + "learning_rate": 9.578342117872546e-05, + "loss": 0.945, + "step": 3831 + }, + { + "epoch": 0.25963818686902906, + "grad_norm": 7.0269551277160645, + "learning_rate": 9.578205215962764e-05, + "loss": 0.8927, + "step": 3832 + }, + { + "epoch": 0.25970594213700116, + "grad_norm": 7.773914813995361, + "learning_rate": 9.578068314052982e-05, + "loss": 0.8469, + "step": 3833 + }, + { + "epoch": 0.25977369740497325, + "grad_norm": 9.6503267288208, + "learning_rate": 9.5779314121432e-05, + "loss": 1.1192, + "step": 3834 + }, + { + "epoch": 0.25984145267294534, + "grad_norm": 6.582554340362549, + "learning_rate": 9.577794510233418e-05, + "loss": 0.8996, + "step": 3835 + }, + { + "epoch": 0.25990920794091743, + "grad_norm": 10.097637176513672, + "learning_rate": 9.577657608323637e-05, + "loss": 1.211, + "step": 3836 + }, + { + "epoch": 0.25997696320888947, + "grad_norm": 8.63124942779541, + "learning_rate": 9.577520706413855e-05, + "loss": 1.0353, + "step": 3837 + }, + { + "epoch": 0.26004471847686156, + "grad_norm": 8.282122611999512, + "learning_rate": 9.577383804504073e-05, + "loss": 0.9677, + "step": 3838 + }, + { + "epoch": 0.26011247374483365, + "grad_norm": 8.298484802246094, + "learning_rate": 9.577246902594291e-05, + "loss": 0.909, + "step": 3839 + }, + { + "epoch": 0.26018022901280574, + "grad_norm": 7.91752290725708, + "learning_rate": 9.57711000068451e-05, + "loss": 1.1106, + "step": 3840 + }, + { + "epoch": 0.26024798428077783, + "grad_norm": 8.073543548583984, + "learning_rate": 9.576973098774729e-05, + "loss": 0.9777, + "step": 3841 + }, + { + "epoch": 0.2603157395487499, + "grad_norm": 8.225390434265137, + "learning_rate": 9.576836196864947e-05, + "loss": 1.0302, + "step": 3842 + }, + { + "epoch": 0.260383494816722, + "grad_norm": 7.086613655090332, + "learning_rate": 9.576699294955165e-05, + "loss": 1.0061, + "step": 3843 + }, + { + "epoch": 0.2604512500846941, + "grad_norm": 6.9043965339660645, + "learning_rate": 9.576562393045383e-05, + "loss": 0.8707, + "step": 3844 + }, + { + "epoch": 0.26051900535266614, + "grad_norm": 9.083130836486816, + "learning_rate": 9.576425491135602e-05, + "loss": 1.0567, + "step": 3845 + }, + { + "epoch": 0.26058676062063824, + "grad_norm": 6.962080478668213, + "learning_rate": 9.57628858922582e-05, + "loss": 0.8592, + "step": 3846 + }, + { + "epoch": 0.2606545158886103, + "grad_norm": 7.196011066436768, + "learning_rate": 9.576151687316038e-05, + "loss": 0.8318, + "step": 3847 + }, + { + "epoch": 0.2607222711565824, + "grad_norm": 7.421074867248535, + "learning_rate": 9.576014785406256e-05, + "loss": 1.0162, + "step": 3848 + }, + { + "epoch": 0.2607900264245545, + "grad_norm": 7.065299034118652, + "learning_rate": 9.575877883496476e-05, + "loss": 0.9999, + "step": 3849 + }, + { + "epoch": 0.2608577816925266, + "grad_norm": 7.442328929901123, + "learning_rate": 9.575740981586694e-05, + "loss": 0.8118, + "step": 3850 + }, + { + "epoch": 0.2609255369604987, + "grad_norm": 6.888897895812988, + "learning_rate": 9.575604079676912e-05, + "loss": 1.0343, + "step": 3851 + }, + { + "epoch": 0.2609932922284708, + "grad_norm": 7.788427829742432, + "learning_rate": 9.57546717776713e-05, + "loss": 0.9931, + "step": 3852 + }, + { + "epoch": 0.2610610474964428, + "grad_norm": 7.247363567352295, + "learning_rate": 9.575330275857348e-05, + "loss": 0.8403, + "step": 3853 + }, + { + "epoch": 0.2611288027644149, + "grad_norm": 7.305066108703613, + "learning_rate": 9.575193373947567e-05, + "loss": 1.0222, + "step": 3854 + }, + { + "epoch": 0.261196558032387, + "grad_norm": 5.8723249435424805, + "learning_rate": 9.575056472037785e-05, + "loss": 0.7461, + "step": 3855 + }, + { + "epoch": 0.2612643133003591, + "grad_norm": 8.933609008789062, + "learning_rate": 9.574919570128003e-05, + "loss": 1.1537, + "step": 3856 + }, + { + "epoch": 0.2613320685683312, + "grad_norm": 7.3203125, + "learning_rate": 9.574782668218221e-05, + "loss": 1.0277, + "step": 3857 + }, + { + "epoch": 0.2613998238363033, + "grad_norm": 7.455322742462158, + "learning_rate": 9.57464576630844e-05, + "loss": 0.8136, + "step": 3858 + }, + { + "epoch": 0.26146757910427537, + "grad_norm": 8.074299812316895, + "learning_rate": 9.574508864398659e-05, + "loss": 0.9597, + "step": 3859 + }, + { + "epoch": 0.26153533437224746, + "grad_norm": 8.732856750488281, + "learning_rate": 9.574371962488877e-05, + "loss": 1.2009, + "step": 3860 + }, + { + "epoch": 0.2616030896402195, + "grad_norm": 7.179652690887451, + "learning_rate": 9.574235060579095e-05, + "loss": 0.8365, + "step": 3861 + }, + { + "epoch": 0.2616708449081916, + "grad_norm": 8.975394248962402, + "learning_rate": 9.574098158669314e-05, + "loss": 0.9099, + "step": 3862 + }, + { + "epoch": 0.2617386001761637, + "grad_norm": 9.083860397338867, + "learning_rate": 9.573961256759532e-05, + "loss": 1.0307, + "step": 3863 + }, + { + "epoch": 0.2618063554441358, + "grad_norm": 7.449617385864258, + "learning_rate": 9.57382435484975e-05, + "loss": 1.1442, + "step": 3864 + }, + { + "epoch": 0.26187411071210787, + "grad_norm": 8.299210548400879, + "learning_rate": 9.57368745293997e-05, + "loss": 1.0119, + "step": 3865 + }, + { + "epoch": 0.26194186598007996, + "grad_norm": 6.847742080688477, + "learning_rate": 9.573550551030188e-05, + "loss": 0.9058, + "step": 3866 + }, + { + "epoch": 0.26200962124805205, + "grad_norm": 7.057496070861816, + "learning_rate": 9.573413649120406e-05, + "loss": 1.0022, + "step": 3867 + }, + { + "epoch": 0.26207737651602414, + "grad_norm": 7.9798359870910645, + "learning_rate": 9.573276747210625e-05, + "loss": 0.9555, + "step": 3868 + }, + { + "epoch": 0.26214513178399623, + "grad_norm": 8.119134902954102, + "learning_rate": 9.573139845300843e-05, + "loss": 1.1039, + "step": 3869 + }, + { + "epoch": 0.26221288705196827, + "grad_norm": 6.998579502105713, + "learning_rate": 9.573002943391061e-05, + "loss": 1.0448, + "step": 3870 + }, + { + "epoch": 0.26228064231994036, + "grad_norm": 6.59659481048584, + "learning_rate": 9.572866041481279e-05, + "loss": 0.8946, + "step": 3871 + }, + { + "epoch": 0.26234839758791245, + "grad_norm": 8.110078811645508, + "learning_rate": 9.572729139571498e-05, + "loss": 0.8568, + "step": 3872 + }, + { + "epoch": 0.26241615285588454, + "grad_norm": 9.192879676818848, + "learning_rate": 9.572592237661717e-05, + "loss": 0.9517, + "step": 3873 + }, + { + "epoch": 0.26248390812385664, + "grad_norm": 7.378695964813232, + "learning_rate": 9.572455335751935e-05, + "loss": 1.0917, + "step": 3874 + }, + { + "epoch": 0.2625516633918287, + "grad_norm": 10.016194343566895, + "learning_rate": 9.572318433842153e-05, + "loss": 0.9754, + "step": 3875 + }, + { + "epoch": 0.2626194186598008, + "grad_norm": 7.392832279205322, + "learning_rate": 9.57218153193237e-05, + "loss": 0.8258, + "step": 3876 + }, + { + "epoch": 0.2626871739277729, + "grad_norm": 6.59785795211792, + "learning_rate": 9.57204463002259e-05, + "loss": 0.9656, + "step": 3877 + }, + { + "epoch": 0.26275492919574495, + "grad_norm": 7.799993991851807, + "learning_rate": 9.571907728112808e-05, + "loss": 0.9776, + "step": 3878 + }, + { + "epoch": 0.26282268446371704, + "grad_norm": 9.275368690490723, + "learning_rate": 9.571770826203026e-05, + "loss": 0.8827, + "step": 3879 + }, + { + "epoch": 0.26289043973168913, + "grad_norm": 9.413054466247559, + "learning_rate": 9.571633924293244e-05, + "loss": 1.0671, + "step": 3880 + }, + { + "epoch": 0.2629581949996612, + "grad_norm": 8.446796417236328, + "learning_rate": 9.571497022383464e-05, + "loss": 0.8702, + "step": 3881 + }, + { + "epoch": 0.2630259502676333, + "grad_norm": 12.962898254394531, + "learning_rate": 9.571360120473682e-05, + "loss": 0.9281, + "step": 3882 + }, + { + "epoch": 0.2630937055356054, + "grad_norm": 7.7250213623046875, + "learning_rate": 9.5712232185639e-05, + "loss": 0.8908, + "step": 3883 + }, + { + "epoch": 0.2631614608035775, + "grad_norm": 10.405988693237305, + "learning_rate": 9.571086316654118e-05, + "loss": 1.2047, + "step": 3884 + }, + { + "epoch": 0.2632292160715496, + "grad_norm": 9.009016036987305, + "learning_rate": 9.570949414744336e-05, + "loss": 1.0619, + "step": 3885 + }, + { + "epoch": 0.2632969713395216, + "grad_norm": 8.600632667541504, + "learning_rate": 9.570812512834555e-05, + "loss": 1.4186, + "step": 3886 + }, + { + "epoch": 0.2633647266074937, + "grad_norm": 7.9944071769714355, + "learning_rate": 9.570675610924773e-05, + "loss": 1.1493, + "step": 3887 + }, + { + "epoch": 0.2634324818754658, + "grad_norm": 7.7023539543151855, + "learning_rate": 9.570538709014991e-05, + "loss": 0.8716, + "step": 3888 + }, + { + "epoch": 0.2635002371434379, + "grad_norm": 7.743750095367432, + "learning_rate": 9.570401807105209e-05, + "loss": 0.9091, + "step": 3889 + }, + { + "epoch": 0.26356799241141, + "grad_norm": 7.080264091491699, + "learning_rate": 9.570264905195427e-05, + "loss": 0.8064, + "step": 3890 + }, + { + "epoch": 0.2636357476793821, + "grad_norm": 10.498579025268555, + "learning_rate": 9.570128003285647e-05, + "loss": 1.2763, + "step": 3891 + }, + { + "epoch": 0.2637035029473542, + "grad_norm": 8.932741165161133, + "learning_rate": 9.569991101375865e-05, + "loss": 1.0108, + "step": 3892 + }, + { + "epoch": 0.26377125821532627, + "grad_norm": 7.670261859893799, + "learning_rate": 9.569854199466083e-05, + "loss": 0.9312, + "step": 3893 + }, + { + "epoch": 0.2638390134832983, + "grad_norm": 6.713436126708984, + "learning_rate": 9.5697172975563e-05, + "loss": 0.9319, + "step": 3894 + }, + { + "epoch": 0.2639067687512704, + "grad_norm": 6.828521728515625, + "learning_rate": 9.56958039564652e-05, + "loss": 0.908, + "step": 3895 + }, + { + "epoch": 0.2639745240192425, + "grad_norm": 7.664526462554932, + "learning_rate": 9.569443493736738e-05, + "loss": 0.9204, + "step": 3896 + }, + { + "epoch": 0.2640422792872146, + "grad_norm": 7.330194473266602, + "learning_rate": 9.569306591826956e-05, + "loss": 0.9971, + "step": 3897 + }, + { + "epoch": 0.26411003455518667, + "grad_norm": 7.202576160430908, + "learning_rate": 9.569169689917174e-05, + "loss": 1.1183, + "step": 3898 + }, + { + "epoch": 0.26417778982315876, + "grad_norm": 7.7107720375061035, + "learning_rate": 9.569032788007392e-05, + "loss": 1.0794, + "step": 3899 + }, + { + "epoch": 0.26424554509113085, + "grad_norm": 8.634172439575195, + "learning_rate": 9.568895886097612e-05, + "loss": 0.9124, + "step": 3900 + }, + { + "epoch": 0.26431330035910294, + "grad_norm": 7.264395236968994, + "learning_rate": 9.56875898418783e-05, + "loss": 1.0044, + "step": 3901 + }, + { + "epoch": 0.264381055627075, + "grad_norm": 7.352424144744873, + "learning_rate": 9.568622082278048e-05, + "loss": 0.9213, + "step": 3902 + }, + { + "epoch": 0.26444881089504707, + "grad_norm": 8.39152717590332, + "learning_rate": 9.568485180368266e-05, + "loss": 0.914, + "step": 3903 + }, + { + "epoch": 0.26451656616301916, + "grad_norm": 6.5833611488342285, + "learning_rate": 9.568348278458485e-05, + "loss": 0.9906, + "step": 3904 + }, + { + "epoch": 0.26458432143099125, + "grad_norm": 7.952385902404785, + "learning_rate": 9.568211376548703e-05, + "loss": 0.9624, + "step": 3905 + }, + { + "epoch": 0.26465207669896335, + "grad_norm": 7.232090950012207, + "learning_rate": 9.568074474638921e-05, + "loss": 0.8852, + "step": 3906 + }, + { + "epoch": 0.26471983196693544, + "grad_norm": 7.817921161651611, + "learning_rate": 9.567937572729139e-05, + "loss": 0.8474, + "step": 3907 + }, + { + "epoch": 0.26478758723490753, + "grad_norm": 8.926132202148438, + "learning_rate": 9.567800670819359e-05, + "loss": 1.0302, + "step": 3908 + }, + { + "epoch": 0.2648553425028796, + "grad_norm": 8.923449516296387, + "learning_rate": 9.567663768909577e-05, + "loss": 1.0837, + "step": 3909 + }, + { + "epoch": 0.26492309777085166, + "grad_norm": 8.431096076965332, + "learning_rate": 9.567526866999795e-05, + "loss": 0.7992, + "step": 3910 + }, + { + "epoch": 0.26499085303882375, + "grad_norm": 10.121541976928711, + "learning_rate": 9.567389965090014e-05, + "loss": 1.0082, + "step": 3911 + }, + { + "epoch": 0.26505860830679584, + "grad_norm": 9.349747657775879, + "learning_rate": 9.567253063180232e-05, + "loss": 0.8257, + "step": 3912 + }, + { + "epoch": 0.26512636357476793, + "grad_norm": 9.438392639160156, + "learning_rate": 9.56711616127045e-05, + "loss": 0.979, + "step": 3913 + }, + { + "epoch": 0.26519411884274, + "grad_norm": 7.422990798950195, + "learning_rate": 9.56697925936067e-05, + "loss": 0.8456, + "step": 3914 + }, + { + "epoch": 0.2652618741107121, + "grad_norm": 6.354233741760254, + "learning_rate": 9.566842357450888e-05, + "loss": 0.6639, + "step": 3915 + }, + { + "epoch": 0.2653296293786842, + "grad_norm": 9.160786628723145, + "learning_rate": 9.566705455541106e-05, + "loss": 1.1126, + "step": 3916 + }, + { + "epoch": 0.2653973846466563, + "grad_norm": 7.786096096038818, + "learning_rate": 9.566568553631324e-05, + "loss": 1.2703, + "step": 3917 + }, + { + "epoch": 0.2654651399146284, + "grad_norm": 7.358225345611572, + "learning_rate": 9.566431651721543e-05, + "loss": 1.0625, + "step": 3918 + }, + { + "epoch": 0.2655328951826004, + "grad_norm": 8.920319557189941, + "learning_rate": 9.566294749811761e-05, + "loss": 1.0037, + "step": 3919 + }, + { + "epoch": 0.2656006504505725, + "grad_norm": 7.16439151763916, + "learning_rate": 9.566157847901979e-05, + "loss": 1.0486, + "step": 3920 + }, + { + "epoch": 0.2656684057185446, + "grad_norm": 7.374850749969482, + "learning_rate": 9.566020945992197e-05, + "loss": 1.011, + "step": 3921 + }, + { + "epoch": 0.2657361609865167, + "grad_norm": 5.965388298034668, + "learning_rate": 9.565884044082415e-05, + "loss": 0.9043, + "step": 3922 + }, + { + "epoch": 0.2658039162544888, + "grad_norm": 7.1143879890441895, + "learning_rate": 9.565747142172634e-05, + "loss": 0.923, + "step": 3923 + }, + { + "epoch": 0.2658716715224609, + "grad_norm": 9.05667495727539, + "learning_rate": 9.565610240262853e-05, + "loss": 1.1247, + "step": 3924 + }, + { + "epoch": 0.265939426790433, + "grad_norm": 6.407328128814697, + "learning_rate": 9.56547333835307e-05, + "loss": 0.9028, + "step": 3925 + }, + { + "epoch": 0.26600718205840507, + "grad_norm": 9.335012435913086, + "learning_rate": 9.565336436443289e-05, + "loss": 0.9761, + "step": 3926 + }, + { + "epoch": 0.2660749373263771, + "grad_norm": 7.462203025817871, + "learning_rate": 9.565199534533508e-05, + "loss": 0.8812, + "step": 3927 + }, + { + "epoch": 0.2661426925943492, + "grad_norm": 8.162378311157227, + "learning_rate": 9.565062632623726e-05, + "loss": 0.991, + "step": 3928 + }, + { + "epoch": 0.2662104478623213, + "grad_norm": 8.835287094116211, + "learning_rate": 9.564925730713944e-05, + "loss": 1.3085, + "step": 3929 + }, + { + "epoch": 0.2662782031302934, + "grad_norm": 9.219624519348145, + "learning_rate": 9.564788828804162e-05, + "loss": 0.989, + "step": 3930 + }, + { + "epoch": 0.26634595839826547, + "grad_norm": 6.832587718963623, + "learning_rate": 9.56465192689438e-05, + "loss": 1.0186, + "step": 3931 + }, + { + "epoch": 0.26641371366623756, + "grad_norm": 8.075157165527344, + "learning_rate": 9.5645150249846e-05, + "loss": 0.9734, + "step": 3932 + }, + { + "epoch": 0.26648146893420965, + "grad_norm": 9.403346061706543, + "learning_rate": 9.564378123074818e-05, + "loss": 1.1875, + "step": 3933 + }, + { + "epoch": 0.26654922420218174, + "grad_norm": 8.556446075439453, + "learning_rate": 9.564241221165036e-05, + "loss": 1.0932, + "step": 3934 + }, + { + "epoch": 0.2666169794701538, + "grad_norm": 8.178442001342773, + "learning_rate": 9.564104319255254e-05, + "loss": 0.7654, + "step": 3935 + }, + { + "epoch": 0.2666847347381259, + "grad_norm": 8.82776165008545, + "learning_rate": 9.563967417345473e-05, + "loss": 1.0518, + "step": 3936 + }, + { + "epoch": 0.26675249000609796, + "grad_norm": 8.001256942749023, + "learning_rate": 9.563830515435691e-05, + "loss": 1.0109, + "step": 3937 + }, + { + "epoch": 0.26682024527407006, + "grad_norm": 7.439608573913574, + "learning_rate": 9.563693613525909e-05, + "loss": 0.9365, + "step": 3938 + }, + { + "epoch": 0.26688800054204215, + "grad_norm": 5.78077507019043, + "learning_rate": 9.563556711616127e-05, + "loss": 0.9356, + "step": 3939 + }, + { + "epoch": 0.26695575581001424, + "grad_norm": 7.6134819984436035, + "learning_rate": 9.563419809706345e-05, + "loss": 1.0003, + "step": 3940 + }, + { + "epoch": 0.26702351107798633, + "grad_norm": 8.467934608459473, + "learning_rate": 9.563282907796565e-05, + "loss": 0.8052, + "step": 3941 + }, + { + "epoch": 0.2670912663459584, + "grad_norm": 8.88598346710205, + "learning_rate": 9.563146005886783e-05, + "loss": 1.0069, + "step": 3942 + }, + { + "epoch": 0.26715902161393046, + "grad_norm": 7.627633094787598, + "learning_rate": 9.563009103977e-05, + "loss": 1.1463, + "step": 3943 + }, + { + "epoch": 0.26722677688190255, + "grad_norm": 7.948824882507324, + "learning_rate": 9.562872202067219e-05, + "loss": 0.9451, + "step": 3944 + }, + { + "epoch": 0.26729453214987464, + "grad_norm": 8.10439395904541, + "learning_rate": 9.562735300157437e-05, + "loss": 1.1375, + "step": 3945 + }, + { + "epoch": 0.26736228741784673, + "grad_norm": 6.488743782043457, + "learning_rate": 9.562598398247656e-05, + "loss": 0.8608, + "step": 3946 + }, + { + "epoch": 0.2674300426858188, + "grad_norm": 9.731819152832031, + "learning_rate": 9.562461496337874e-05, + "loss": 1.2055, + "step": 3947 + }, + { + "epoch": 0.2674977979537909, + "grad_norm": 10.455330848693848, + "learning_rate": 9.562324594428092e-05, + "loss": 1.1095, + "step": 3948 + }, + { + "epoch": 0.267565553221763, + "grad_norm": 6.7713212966918945, + "learning_rate": 9.56218769251831e-05, + "loss": 0.9047, + "step": 3949 + }, + { + "epoch": 0.2676333084897351, + "grad_norm": 9.292582511901855, + "learning_rate": 9.56205079060853e-05, + "loss": 0.9404, + "step": 3950 + }, + { + "epoch": 0.26770106375770714, + "grad_norm": 8.252067565917969, + "learning_rate": 9.561913888698748e-05, + "loss": 0.8826, + "step": 3951 + }, + { + "epoch": 0.26776881902567923, + "grad_norm": 7.126963138580322, + "learning_rate": 9.561776986788966e-05, + "loss": 0.8445, + "step": 3952 + }, + { + "epoch": 0.2678365742936513, + "grad_norm": 8.352923393249512, + "learning_rate": 9.561640084879184e-05, + "loss": 1.1081, + "step": 3953 + }, + { + "epoch": 0.2679043295616234, + "grad_norm": 6.933292388916016, + "learning_rate": 9.561503182969403e-05, + "loss": 0.8228, + "step": 3954 + }, + { + "epoch": 0.2679720848295955, + "grad_norm": 7.9115986824035645, + "learning_rate": 9.561366281059621e-05, + "loss": 1.0984, + "step": 3955 + }, + { + "epoch": 0.2680398400975676, + "grad_norm": 6.988186359405518, + "learning_rate": 9.561229379149839e-05, + "loss": 0.9334, + "step": 3956 + }, + { + "epoch": 0.2681075953655397, + "grad_norm": 6.6764631271362305, + "learning_rate": 9.561092477240058e-05, + "loss": 0.907, + "step": 3957 + }, + { + "epoch": 0.2681753506335118, + "grad_norm": 9.355245590209961, + "learning_rate": 9.560955575330277e-05, + "loss": 0.9087, + "step": 3958 + }, + { + "epoch": 0.2682431059014838, + "grad_norm": 8.177611351013184, + "learning_rate": 9.560818673420495e-05, + "loss": 1.0815, + "step": 3959 + }, + { + "epoch": 0.2683108611694559, + "grad_norm": 9.085289001464844, + "learning_rate": 9.560681771510714e-05, + "loss": 1.1291, + "step": 3960 + }, + { + "epoch": 0.268378616437428, + "grad_norm": 7.056759357452393, + "learning_rate": 9.560544869600932e-05, + "loss": 0.8623, + "step": 3961 + }, + { + "epoch": 0.2684463717054001, + "grad_norm": 8.175825119018555, + "learning_rate": 9.56040796769115e-05, + "loss": 0.8755, + "step": 3962 + }, + { + "epoch": 0.2685141269733722, + "grad_norm": 7.127376079559326, + "learning_rate": 9.560271065781368e-05, + "loss": 0.8877, + "step": 3963 + }, + { + "epoch": 0.26858188224134427, + "grad_norm": 9.635464668273926, + "learning_rate": 9.560134163871587e-05, + "loss": 0.8675, + "step": 3964 + }, + { + "epoch": 0.26864963750931636, + "grad_norm": 7.8138275146484375, + "learning_rate": 9.559997261961805e-05, + "loss": 0.9997, + "step": 3965 + }, + { + "epoch": 0.26871739277728846, + "grad_norm": 7.242639064788818, + "learning_rate": 9.559860360052024e-05, + "loss": 0.7692, + "step": 3966 + }, + { + "epoch": 0.2687851480452605, + "grad_norm": 8.106497764587402, + "learning_rate": 9.559723458142242e-05, + "loss": 0.9801, + "step": 3967 + }, + { + "epoch": 0.2688529033132326, + "grad_norm": 7.468952178955078, + "learning_rate": 9.55958655623246e-05, + "loss": 0.939, + "step": 3968 + }, + { + "epoch": 0.2689206585812047, + "grad_norm": 9.002805709838867, + "learning_rate": 9.559449654322679e-05, + "loss": 1.1655, + "step": 3969 + }, + { + "epoch": 0.26898841384917677, + "grad_norm": 8.787810325622559, + "learning_rate": 9.559312752412897e-05, + "loss": 0.9663, + "step": 3970 + }, + { + "epoch": 0.26905616911714886, + "grad_norm": 9.113668441772461, + "learning_rate": 9.559175850503115e-05, + "loss": 1.0041, + "step": 3971 + }, + { + "epoch": 0.26912392438512095, + "grad_norm": 6.289670944213867, + "learning_rate": 9.559038948593333e-05, + "loss": 0.8714, + "step": 3972 + }, + { + "epoch": 0.26919167965309304, + "grad_norm": 7.755144119262695, + "learning_rate": 9.558902046683552e-05, + "loss": 0.8606, + "step": 3973 + }, + { + "epoch": 0.26925943492106513, + "grad_norm": 8.989197731018066, + "learning_rate": 9.55876514477377e-05, + "loss": 0.9956, + "step": 3974 + }, + { + "epoch": 0.2693271901890372, + "grad_norm": 6.45689058303833, + "learning_rate": 9.558628242863989e-05, + "loss": 0.7416, + "step": 3975 + }, + { + "epoch": 0.26939494545700926, + "grad_norm": 7.772951126098633, + "learning_rate": 9.558491340954207e-05, + "loss": 0.927, + "step": 3976 + }, + { + "epoch": 0.26946270072498135, + "grad_norm": 7.347445487976074, + "learning_rate": 9.558354439044425e-05, + "loss": 0.9337, + "step": 3977 + }, + { + "epoch": 0.26953045599295344, + "grad_norm": 8.20067310333252, + "learning_rate": 9.558217537134644e-05, + "loss": 0.8324, + "step": 3978 + }, + { + "epoch": 0.26959821126092554, + "grad_norm": 10.375189781188965, + "learning_rate": 9.558080635224862e-05, + "loss": 1.1103, + "step": 3979 + }, + { + "epoch": 0.2696659665288976, + "grad_norm": 8.187355041503906, + "learning_rate": 9.55794373331508e-05, + "loss": 0.8469, + "step": 3980 + }, + { + "epoch": 0.2697337217968697, + "grad_norm": 6.336839199066162, + "learning_rate": 9.557806831405298e-05, + "loss": 1.037, + "step": 3981 + }, + { + "epoch": 0.2698014770648418, + "grad_norm": 6.368093967437744, + "learning_rate": 9.557669929495517e-05, + "loss": 0.6851, + "step": 3982 + }, + { + "epoch": 0.2698692323328139, + "grad_norm": 7.394474506378174, + "learning_rate": 9.557533027585736e-05, + "loss": 1.0921, + "step": 3983 + }, + { + "epoch": 0.26993698760078594, + "grad_norm": 9.0152006149292, + "learning_rate": 9.557396125675954e-05, + "loss": 0.9765, + "step": 3984 + }, + { + "epoch": 0.27000474286875803, + "grad_norm": 8.247949600219727, + "learning_rate": 9.557259223766172e-05, + "loss": 1.0941, + "step": 3985 + }, + { + "epoch": 0.2700724981367301, + "grad_norm": 7.9166579246521, + "learning_rate": 9.55712232185639e-05, + "loss": 0.798, + "step": 3986 + }, + { + "epoch": 0.2701402534047022, + "grad_norm": 8.501713752746582, + "learning_rate": 9.556985419946609e-05, + "loss": 0.923, + "step": 3987 + }, + { + "epoch": 0.2702080086726743, + "grad_norm": 7.450741767883301, + "learning_rate": 9.556848518036827e-05, + "loss": 0.9485, + "step": 3988 + }, + { + "epoch": 0.2702757639406464, + "grad_norm": 6.7950239181518555, + "learning_rate": 9.556711616127045e-05, + "loss": 0.9041, + "step": 3989 + }, + { + "epoch": 0.2703435192086185, + "grad_norm": 8.021660804748535, + "learning_rate": 9.556574714217263e-05, + "loss": 0.9494, + "step": 3990 + }, + { + "epoch": 0.2704112744765906, + "grad_norm": 8.049949645996094, + "learning_rate": 9.556437812307482e-05, + "loss": 0.9629, + "step": 3991 + }, + { + "epoch": 0.2704790297445626, + "grad_norm": 9.56615161895752, + "learning_rate": 9.5563009103977e-05, + "loss": 0.8822, + "step": 3992 + }, + { + "epoch": 0.2705467850125347, + "grad_norm": 7.12232780456543, + "learning_rate": 9.556164008487919e-05, + "loss": 1.1377, + "step": 3993 + }, + { + "epoch": 0.2706145402805068, + "grad_norm": 7.224277496337891, + "learning_rate": 9.556027106578137e-05, + "loss": 1.1023, + "step": 3994 + }, + { + "epoch": 0.2706822955484789, + "grad_norm": 8.0076322555542, + "learning_rate": 9.555890204668355e-05, + "loss": 1.0599, + "step": 3995 + }, + { + "epoch": 0.270750050816451, + "grad_norm": 7.8958845138549805, + "learning_rate": 9.555753302758574e-05, + "loss": 1.0455, + "step": 3996 + }, + { + "epoch": 0.2708178060844231, + "grad_norm": 7.802896976470947, + "learning_rate": 9.555616400848792e-05, + "loss": 1.1089, + "step": 3997 + }, + { + "epoch": 0.27088556135239517, + "grad_norm": 8.122269630432129, + "learning_rate": 9.55547949893901e-05, + "loss": 1.0708, + "step": 3998 + }, + { + "epoch": 0.27095331662036726, + "grad_norm": 6.3488383293151855, + "learning_rate": 9.555342597029228e-05, + "loss": 0.85, + "step": 3999 + }, + { + "epoch": 0.2710210718883393, + "grad_norm": 10.236666679382324, + "learning_rate": 9.555205695119448e-05, + "loss": 0.962, + "step": 4000 + }, + { + "epoch": 0.2710888271563114, + "grad_norm": 7.594062328338623, + "learning_rate": 9.555068793209666e-05, + "loss": 1.1641, + "step": 4001 + }, + { + "epoch": 0.2711565824242835, + "grad_norm": 6.824306964874268, + "learning_rate": 9.554931891299884e-05, + "loss": 0.7914, + "step": 4002 + }, + { + "epoch": 0.27122433769225557, + "grad_norm": 7.826432228088379, + "learning_rate": 9.554794989390103e-05, + "loss": 1.0062, + "step": 4003 + }, + { + "epoch": 0.27129209296022766, + "grad_norm": 7.189459800720215, + "learning_rate": 9.554658087480321e-05, + "loss": 0.7324, + "step": 4004 + }, + { + "epoch": 0.27135984822819975, + "grad_norm": 7.949024200439453, + "learning_rate": 9.554521185570539e-05, + "loss": 1.1033, + "step": 4005 + }, + { + "epoch": 0.27142760349617184, + "grad_norm": 6.575378894805908, + "learning_rate": 9.554384283660758e-05, + "loss": 1.0048, + "step": 4006 + }, + { + "epoch": 0.27149535876414393, + "grad_norm": 8.585273742675781, + "learning_rate": 9.554247381750976e-05, + "loss": 1.0338, + "step": 4007 + }, + { + "epoch": 0.27156311403211597, + "grad_norm": 7.999851703643799, + "learning_rate": 9.554110479841194e-05, + "loss": 1.1788, + "step": 4008 + }, + { + "epoch": 0.27163086930008806, + "grad_norm": 8.40134334564209, + "learning_rate": 9.553973577931413e-05, + "loss": 1.0986, + "step": 4009 + }, + { + "epoch": 0.27169862456806015, + "grad_norm": 6.380734920501709, + "learning_rate": 9.553836676021632e-05, + "loss": 0.717, + "step": 4010 + }, + { + "epoch": 0.27176637983603225, + "grad_norm": 7.673857688903809, + "learning_rate": 9.55369977411185e-05, + "loss": 1.1646, + "step": 4011 + }, + { + "epoch": 0.27183413510400434, + "grad_norm": 10.897799491882324, + "learning_rate": 9.553562872202068e-05, + "loss": 1.2796, + "step": 4012 + }, + { + "epoch": 0.27190189037197643, + "grad_norm": 9.101582527160645, + "learning_rate": 9.553425970292286e-05, + "loss": 0.8778, + "step": 4013 + }, + { + "epoch": 0.2719696456399485, + "grad_norm": 6.685849666595459, + "learning_rate": 9.553289068382505e-05, + "loss": 1.2329, + "step": 4014 + }, + { + "epoch": 0.2720374009079206, + "grad_norm": 6.580325603485107, + "learning_rate": 9.553152166472723e-05, + "loss": 0.81, + "step": 4015 + }, + { + "epoch": 0.27210515617589265, + "grad_norm": 7.854914665222168, + "learning_rate": 9.553015264562941e-05, + "loss": 1.1152, + "step": 4016 + }, + { + "epoch": 0.27217291144386474, + "grad_norm": 7.293428421020508, + "learning_rate": 9.55287836265316e-05, + "loss": 0.9238, + "step": 4017 + }, + { + "epoch": 0.27224066671183683, + "grad_norm": 6.944539546966553, + "learning_rate": 9.552741460743378e-05, + "loss": 1.018, + "step": 4018 + }, + { + "epoch": 0.2723084219798089, + "grad_norm": 7.550015926361084, + "learning_rate": 9.552604558833597e-05, + "loss": 1.0332, + "step": 4019 + }, + { + "epoch": 0.272376177247781, + "grad_norm": 8.035116195678711, + "learning_rate": 9.552467656923815e-05, + "loss": 1.0752, + "step": 4020 + }, + { + "epoch": 0.2724439325157531, + "grad_norm": 7.088611125946045, + "learning_rate": 9.552330755014033e-05, + "loss": 0.833, + "step": 4021 + }, + { + "epoch": 0.2725116877837252, + "grad_norm": 7.615128040313721, + "learning_rate": 9.552193853104251e-05, + "loss": 1.0236, + "step": 4022 + }, + { + "epoch": 0.2725794430516973, + "grad_norm": 7.366427421569824, + "learning_rate": 9.552056951194469e-05, + "loss": 0.892, + "step": 4023 + }, + { + "epoch": 0.2726471983196694, + "grad_norm": 7.073375225067139, + "learning_rate": 9.551920049284688e-05, + "loss": 0.8082, + "step": 4024 + }, + { + "epoch": 0.2727149535876414, + "grad_norm": 6.350280284881592, + "learning_rate": 9.551783147374906e-05, + "loss": 0.8154, + "step": 4025 + }, + { + "epoch": 0.2727827088556135, + "grad_norm": 5.656667709350586, + "learning_rate": 9.551646245465125e-05, + "loss": 0.8558, + "step": 4026 + }, + { + "epoch": 0.2728504641235856, + "grad_norm": 6.565401077270508, + "learning_rate": 9.551509343555343e-05, + "loss": 1.0397, + "step": 4027 + }, + { + "epoch": 0.2729182193915577, + "grad_norm": 8.4253511428833, + "learning_rate": 9.551372441645562e-05, + "loss": 0.9872, + "step": 4028 + }, + { + "epoch": 0.2729859746595298, + "grad_norm": 7.32992696762085, + "learning_rate": 9.55123553973578e-05, + "loss": 0.8262, + "step": 4029 + }, + { + "epoch": 0.2730537299275019, + "grad_norm": 7.277110576629639, + "learning_rate": 9.551098637825998e-05, + "loss": 1.218, + "step": 4030 + }, + { + "epoch": 0.27312148519547397, + "grad_norm": 9.594376564025879, + "learning_rate": 9.550961735916216e-05, + "loss": 0.9785, + "step": 4031 + }, + { + "epoch": 0.27318924046344606, + "grad_norm": 9.339418411254883, + "learning_rate": 9.550824834006434e-05, + "loss": 0.911, + "step": 4032 + }, + { + "epoch": 0.2732569957314181, + "grad_norm": 6.717375755310059, + "learning_rate": 9.550687932096653e-05, + "loss": 0.8084, + "step": 4033 + }, + { + "epoch": 0.2733247509993902, + "grad_norm": 6.447595596313477, + "learning_rate": 9.550551030186872e-05, + "loss": 0.9882, + "step": 4034 + }, + { + "epoch": 0.2733925062673623, + "grad_norm": 7.6800312995910645, + "learning_rate": 9.55041412827709e-05, + "loss": 0.7895, + "step": 4035 + }, + { + "epoch": 0.27346026153533437, + "grad_norm": 7.270735263824463, + "learning_rate": 9.550277226367308e-05, + "loss": 0.83, + "step": 4036 + }, + { + "epoch": 0.27352801680330646, + "grad_norm": 8.246411323547363, + "learning_rate": 9.550140324457527e-05, + "loss": 0.8406, + "step": 4037 + }, + { + "epoch": 0.27359577207127855, + "grad_norm": 9.59301471710205, + "learning_rate": 9.550003422547745e-05, + "loss": 0.9433, + "step": 4038 + }, + { + "epoch": 0.27366352733925065, + "grad_norm": 7.304765701293945, + "learning_rate": 9.549866520637963e-05, + "loss": 0.8551, + "step": 4039 + }, + { + "epoch": 0.27373128260722274, + "grad_norm": 10.581608772277832, + "learning_rate": 9.549729618728181e-05, + "loss": 1.0734, + "step": 4040 + }, + { + "epoch": 0.2737990378751948, + "grad_norm": 8.858924865722656, + "learning_rate": 9.549592716818399e-05, + "loss": 1.1424, + "step": 4041 + }, + { + "epoch": 0.27386679314316686, + "grad_norm": 7.042451858520508, + "learning_rate": 9.549455814908618e-05, + "loss": 0.9556, + "step": 4042 + }, + { + "epoch": 0.27393454841113896, + "grad_norm": 7.213229656219482, + "learning_rate": 9.549318912998837e-05, + "loss": 1.0575, + "step": 4043 + }, + { + "epoch": 0.27400230367911105, + "grad_norm": 10.457990646362305, + "learning_rate": 9.549182011089055e-05, + "loss": 1.1868, + "step": 4044 + }, + { + "epoch": 0.27407005894708314, + "grad_norm": 7.598734378814697, + "learning_rate": 9.549045109179273e-05, + "loss": 0.7216, + "step": 4045 + }, + { + "epoch": 0.27413781421505523, + "grad_norm": 7.892279148101807, + "learning_rate": 9.548908207269492e-05, + "loss": 1.1437, + "step": 4046 + }, + { + "epoch": 0.2742055694830273, + "grad_norm": 7.455031394958496, + "learning_rate": 9.54877130535971e-05, + "loss": 0.8402, + "step": 4047 + }, + { + "epoch": 0.2742733247509994, + "grad_norm": 7.1315107345581055, + "learning_rate": 9.548634403449928e-05, + "loss": 0.653, + "step": 4048 + }, + { + "epoch": 0.27434108001897145, + "grad_norm": 7.116184234619141, + "learning_rate": 9.548497501540147e-05, + "loss": 0.9, + "step": 4049 + }, + { + "epoch": 0.27440883528694354, + "grad_norm": 7.833000659942627, + "learning_rate": 9.548360599630365e-05, + "loss": 0.9328, + "step": 4050 + }, + { + "epoch": 0.27447659055491563, + "grad_norm": 7.393906116485596, + "learning_rate": 9.548223697720584e-05, + "loss": 1.0643, + "step": 4051 + }, + { + "epoch": 0.2745443458228877, + "grad_norm": 8.286185264587402, + "learning_rate": 9.548086795810803e-05, + "loss": 1.0402, + "step": 4052 + }, + { + "epoch": 0.2746121010908598, + "grad_norm": 7.88281774520874, + "learning_rate": 9.547949893901021e-05, + "loss": 1.0117, + "step": 4053 + }, + { + "epoch": 0.2746798563588319, + "grad_norm": 9.544231414794922, + "learning_rate": 9.547812991991239e-05, + "loss": 1.1522, + "step": 4054 + }, + { + "epoch": 0.274747611626804, + "grad_norm": 7.637237071990967, + "learning_rate": 9.547676090081457e-05, + "loss": 1.0805, + "step": 4055 + }, + { + "epoch": 0.2748153668947761, + "grad_norm": 7.00446891784668, + "learning_rate": 9.547539188171676e-05, + "loss": 0.9941, + "step": 4056 + }, + { + "epoch": 0.27488312216274813, + "grad_norm": 6.8821306228637695, + "learning_rate": 9.547402286261894e-05, + "loss": 0.997, + "step": 4057 + }, + { + "epoch": 0.2749508774307202, + "grad_norm": 6.466810703277588, + "learning_rate": 9.547265384352112e-05, + "loss": 0.8872, + "step": 4058 + }, + { + "epoch": 0.2750186326986923, + "grad_norm": 7.136430263519287, + "learning_rate": 9.54712848244233e-05, + "loss": 0.768, + "step": 4059 + }, + { + "epoch": 0.2750863879666644, + "grad_norm": 7.117071151733398, + "learning_rate": 9.54699158053255e-05, + "loss": 0.965, + "step": 4060 + }, + { + "epoch": 0.2751541432346365, + "grad_norm": 6.811083793640137, + "learning_rate": 9.546854678622768e-05, + "loss": 0.8448, + "step": 4061 + }, + { + "epoch": 0.2752218985026086, + "grad_norm": 6.055437088012695, + "learning_rate": 9.546717776712986e-05, + "loss": 1.0217, + "step": 4062 + }, + { + "epoch": 0.2752896537705807, + "grad_norm": 8.154548645019531, + "learning_rate": 9.546580874803204e-05, + "loss": 1.0642, + "step": 4063 + }, + { + "epoch": 0.27535740903855277, + "grad_norm": 7.394543170928955, + "learning_rate": 9.546443972893422e-05, + "loss": 1.0716, + "step": 4064 + }, + { + "epoch": 0.2754251643065248, + "grad_norm": 8.716939926147461, + "learning_rate": 9.546307070983641e-05, + "loss": 0.8457, + "step": 4065 + }, + { + "epoch": 0.2754929195744969, + "grad_norm": 8.732163429260254, + "learning_rate": 9.54617016907386e-05, + "loss": 1.0349, + "step": 4066 + }, + { + "epoch": 0.275560674842469, + "grad_norm": 8.67320442199707, + "learning_rate": 9.546033267164077e-05, + "loss": 1.1179, + "step": 4067 + }, + { + "epoch": 0.2756284301104411, + "grad_norm": 8.010993003845215, + "learning_rate": 9.545896365254296e-05, + "loss": 1.0168, + "step": 4068 + }, + { + "epoch": 0.27569618537841317, + "grad_norm": 6.747826099395752, + "learning_rate": 9.545759463344515e-05, + "loss": 0.7905, + "step": 4069 + }, + { + "epoch": 0.27576394064638526, + "grad_norm": 8.352065086364746, + "learning_rate": 9.545622561434733e-05, + "loss": 1.1287, + "step": 4070 + }, + { + "epoch": 0.27583169591435736, + "grad_norm": 8.072574615478516, + "learning_rate": 9.545485659524951e-05, + "loss": 1.1582, + "step": 4071 + }, + { + "epoch": 0.27589945118232945, + "grad_norm": 8.851838111877441, + "learning_rate": 9.545348757615169e-05, + "loss": 1.0434, + "step": 4072 + }, + { + "epoch": 0.2759672064503015, + "grad_norm": 9.833956718444824, + "learning_rate": 9.545211855705387e-05, + "loss": 1.1956, + "step": 4073 + }, + { + "epoch": 0.2760349617182736, + "grad_norm": 8.043981552124023, + "learning_rate": 9.545074953795606e-05, + "loss": 1.0625, + "step": 4074 + }, + { + "epoch": 0.27610271698624567, + "grad_norm": 7.420129776000977, + "learning_rate": 9.544938051885824e-05, + "loss": 0.8933, + "step": 4075 + }, + { + "epoch": 0.27617047225421776, + "grad_norm": 7.062417030334473, + "learning_rate": 9.544801149976042e-05, + "loss": 0.9655, + "step": 4076 + }, + { + "epoch": 0.27623822752218985, + "grad_norm": 8.733392715454102, + "learning_rate": 9.54466424806626e-05, + "loss": 0.9182, + "step": 4077 + }, + { + "epoch": 0.27630598279016194, + "grad_norm": 8.931736946105957, + "learning_rate": 9.544527346156479e-05, + "loss": 1.0243, + "step": 4078 + }, + { + "epoch": 0.27637373805813403, + "grad_norm": 7.487978935241699, + "learning_rate": 9.544390444246698e-05, + "loss": 0.9967, + "step": 4079 + }, + { + "epoch": 0.2764414933261061, + "grad_norm": 8.259819030761719, + "learning_rate": 9.544253542336916e-05, + "loss": 0.942, + "step": 4080 + }, + { + "epoch": 0.2765092485940782, + "grad_norm": 9.625347137451172, + "learning_rate": 9.544116640427134e-05, + "loss": 1.2148, + "step": 4081 + }, + { + "epoch": 0.27657700386205025, + "grad_norm": 7.737034797668457, + "learning_rate": 9.543979738517352e-05, + "loss": 0.7905, + "step": 4082 + }, + { + "epoch": 0.27664475913002234, + "grad_norm": 7.118561744689941, + "learning_rate": 9.543842836607571e-05, + "loss": 0.9883, + "step": 4083 + }, + { + "epoch": 0.27671251439799444, + "grad_norm": 9.749618530273438, + "learning_rate": 9.54370593469779e-05, + "loss": 1.0249, + "step": 4084 + }, + { + "epoch": 0.2767802696659665, + "grad_norm": 8.805608749389648, + "learning_rate": 9.543569032788008e-05, + "loss": 1.041, + "step": 4085 + }, + { + "epoch": 0.2768480249339386, + "grad_norm": 9.240931510925293, + "learning_rate": 9.543432130878226e-05, + "loss": 1.0598, + "step": 4086 + }, + { + "epoch": 0.2769157802019107, + "grad_norm": 6.621399879455566, + "learning_rate": 9.543295228968444e-05, + "loss": 0.8884, + "step": 4087 + }, + { + "epoch": 0.2769835354698828, + "grad_norm": 6.869698524475098, + "learning_rate": 9.543158327058663e-05, + "loss": 0.8309, + "step": 4088 + }, + { + "epoch": 0.2770512907378549, + "grad_norm": 7.135868549346924, + "learning_rate": 9.543021425148881e-05, + "loss": 0.9399, + "step": 4089 + }, + { + "epoch": 0.27711904600582693, + "grad_norm": 7.172493934631348, + "learning_rate": 9.542884523239099e-05, + "loss": 0.9626, + "step": 4090 + }, + { + "epoch": 0.277186801273799, + "grad_norm": 6.903214931488037, + "learning_rate": 9.542747621329317e-05, + "loss": 1.0047, + "step": 4091 + }, + { + "epoch": 0.2772545565417711, + "grad_norm": 7.557178020477295, + "learning_rate": 9.542610719419536e-05, + "loss": 0.6978, + "step": 4092 + }, + { + "epoch": 0.2773223118097432, + "grad_norm": 7.468019485473633, + "learning_rate": 9.542473817509754e-05, + "loss": 0.8109, + "step": 4093 + }, + { + "epoch": 0.2773900670777153, + "grad_norm": 8.699142456054688, + "learning_rate": 9.542336915599973e-05, + "loss": 0.7031, + "step": 4094 + }, + { + "epoch": 0.2774578223456874, + "grad_norm": 8.03862190246582, + "learning_rate": 9.54220001369019e-05, + "loss": 0.9824, + "step": 4095 + }, + { + "epoch": 0.2775255776136595, + "grad_norm": 9.884957313537598, + "learning_rate": 9.54206311178041e-05, + "loss": 0.8122, + "step": 4096 + }, + { + "epoch": 0.27759333288163157, + "grad_norm": 9.435370445251465, + "learning_rate": 9.541926209870628e-05, + "loss": 1.2642, + "step": 4097 + }, + { + "epoch": 0.2776610881496036, + "grad_norm": 8.154888153076172, + "learning_rate": 9.541789307960846e-05, + "loss": 0.9814, + "step": 4098 + }, + { + "epoch": 0.2777288434175757, + "grad_norm": 9.771589279174805, + "learning_rate": 9.541652406051065e-05, + "loss": 1.0959, + "step": 4099 + }, + { + "epoch": 0.2777965986855478, + "grad_norm": 7.635507106781006, + "learning_rate": 9.541515504141283e-05, + "loss": 0.8268, + "step": 4100 + }, + { + "epoch": 0.2778643539535199, + "grad_norm": 9.028327941894531, + "learning_rate": 9.541378602231501e-05, + "loss": 1.227, + "step": 4101 + }, + { + "epoch": 0.277932109221492, + "grad_norm": 8.327515602111816, + "learning_rate": 9.541241700321721e-05, + "loss": 0.8644, + "step": 4102 + }, + { + "epoch": 0.27799986448946407, + "grad_norm": 7.547940254211426, + "learning_rate": 9.541104798411939e-05, + "loss": 1.0169, + "step": 4103 + }, + { + "epoch": 0.27806761975743616, + "grad_norm": 8.0435152053833, + "learning_rate": 9.540967896502157e-05, + "loss": 1.0883, + "step": 4104 + }, + { + "epoch": 0.27813537502540825, + "grad_norm": 7.7741217613220215, + "learning_rate": 9.540830994592375e-05, + "loss": 0.9389, + "step": 4105 + }, + { + "epoch": 0.2782031302933803, + "grad_norm": 8.059552192687988, + "learning_rate": 9.540694092682594e-05, + "loss": 1.0024, + "step": 4106 + }, + { + "epoch": 0.2782708855613524, + "grad_norm": 9.13268756866455, + "learning_rate": 9.540557190772812e-05, + "loss": 1.0448, + "step": 4107 + }, + { + "epoch": 0.27833864082932447, + "grad_norm": 7.901900768280029, + "learning_rate": 9.54042028886303e-05, + "loss": 0.8235, + "step": 4108 + }, + { + "epoch": 0.27840639609729656, + "grad_norm": 8.727076530456543, + "learning_rate": 9.540283386953248e-05, + "loss": 1.1047, + "step": 4109 + }, + { + "epoch": 0.27847415136526865, + "grad_norm": 7.1972880363464355, + "learning_rate": 9.540146485043466e-05, + "loss": 0.9456, + "step": 4110 + }, + { + "epoch": 0.27854190663324074, + "grad_norm": 6.886523246765137, + "learning_rate": 9.540009583133686e-05, + "loss": 1.0041, + "step": 4111 + }, + { + "epoch": 0.27860966190121284, + "grad_norm": 7.595452308654785, + "learning_rate": 9.539872681223904e-05, + "loss": 1.0038, + "step": 4112 + }, + { + "epoch": 0.2786774171691849, + "grad_norm": 6.007086753845215, + "learning_rate": 9.539735779314122e-05, + "loss": 0.8042, + "step": 4113 + }, + { + "epoch": 0.27874517243715696, + "grad_norm": 7.112758159637451, + "learning_rate": 9.53959887740434e-05, + "loss": 0.8252, + "step": 4114 + }, + { + "epoch": 0.27881292770512905, + "grad_norm": 10.120092391967773, + "learning_rate": 9.53946197549456e-05, + "loss": 1.1589, + "step": 4115 + }, + { + "epoch": 0.27888068297310115, + "grad_norm": 7.587961196899414, + "learning_rate": 9.539325073584777e-05, + "loss": 1.0699, + "step": 4116 + }, + { + "epoch": 0.27894843824107324, + "grad_norm": 7.671876430511475, + "learning_rate": 9.539188171674995e-05, + "loss": 0.9699, + "step": 4117 + }, + { + "epoch": 0.27901619350904533, + "grad_norm": 7.345922470092773, + "learning_rate": 9.539051269765213e-05, + "loss": 0.9957, + "step": 4118 + }, + { + "epoch": 0.2790839487770174, + "grad_norm": 9.215903282165527, + "learning_rate": 9.538914367855432e-05, + "loss": 1.156, + "step": 4119 + }, + { + "epoch": 0.2791517040449895, + "grad_norm": 10.056458473205566, + "learning_rate": 9.538777465945651e-05, + "loss": 1.1229, + "step": 4120 + }, + { + "epoch": 0.2792194593129616, + "grad_norm": 7.9655938148498535, + "learning_rate": 9.538640564035869e-05, + "loss": 1.017, + "step": 4121 + }, + { + "epoch": 0.27928721458093364, + "grad_norm": 8.49431324005127, + "learning_rate": 9.538503662126087e-05, + "loss": 1.0035, + "step": 4122 + }, + { + "epoch": 0.27935496984890573, + "grad_norm": 8.746543884277344, + "learning_rate": 9.538366760216305e-05, + "loss": 1.0711, + "step": 4123 + }, + { + "epoch": 0.2794227251168778, + "grad_norm": 7.75557279586792, + "learning_rate": 9.538229858306524e-05, + "loss": 0.6868, + "step": 4124 + }, + { + "epoch": 0.2794904803848499, + "grad_norm": 7.1494622230529785, + "learning_rate": 9.538092956396742e-05, + "loss": 0.721, + "step": 4125 + }, + { + "epoch": 0.279558235652822, + "grad_norm": 6.971895217895508, + "learning_rate": 9.53795605448696e-05, + "loss": 1.0669, + "step": 4126 + }, + { + "epoch": 0.2796259909207941, + "grad_norm": 6.478157043457031, + "learning_rate": 9.537819152577178e-05, + "loss": 0.9348, + "step": 4127 + }, + { + "epoch": 0.2796937461887662, + "grad_norm": 6.307050704956055, + "learning_rate": 9.537682250667397e-05, + "loss": 0.9622, + "step": 4128 + }, + { + "epoch": 0.2797615014567383, + "grad_norm": 9.505130767822266, + "learning_rate": 9.537545348757616e-05, + "loss": 1.1675, + "step": 4129 + }, + { + "epoch": 0.2798292567247104, + "grad_norm": 8.11099624633789, + "learning_rate": 9.537408446847834e-05, + "loss": 0.853, + "step": 4130 + }, + { + "epoch": 0.2798970119926824, + "grad_norm": 6.637272834777832, + "learning_rate": 9.537271544938052e-05, + "loss": 1.0993, + "step": 4131 + }, + { + "epoch": 0.2799647672606545, + "grad_norm": 7.888055801391602, + "learning_rate": 9.53713464302827e-05, + "loss": 0.9582, + "step": 4132 + }, + { + "epoch": 0.2800325225286266, + "grad_norm": 6.289199352264404, + "learning_rate": 9.536997741118488e-05, + "loss": 0.9027, + "step": 4133 + }, + { + "epoch": 0.2801002777965987, + "grad_norm": 7.488378047943115, + "learning_rate": 9.536860839208707e-05, + "loss": 1.0697, + "step": 4134 + }, + { + "epoch": 0.2801680330645708, + "grad_norm": 8.341411590576172, + "learning_rate": 9.536723937298925e-05, + "loss": 1.0705, + "step": 4135 + }, + { + "epoch": 0.28023578833254287, + "grad_norm": 7.55519437789917, + "learning_rate": 9.536587035389144e-05, + "loss": 0.7589, + "step": 4136 + }, + { + "epoch": 0.28030354360051496, + "grad_norm": 6.104217052459717, + "learning_rate": 9.536450133479362e-05, + "loss": 0.9161, + "step": 4137 + }, + { + "epoch": 0.28037129886848705, + "grad_norm": 6.97914457321167, + "learning_rate": 9.536313231569581e-05, + "loss": 1.0342, + "step": 4138 + }, + { + "epoch": 0.2804390541364591, + "grad_norm": 8.791030883789062, + "learning_rate": 9.536176329659799e-05, + "loss": 0.863, + "step": 4139 + }, + { + "epoch": 0.2805068094044312, + "grad_norm": 6.868939399719238, + "learning_rate": 9.536039427750017e-05, + "loss": 0.9005, + "step": 4140 + }, + { + "epoch": 0.28057456467240327, + "grad_norm": 9.854182243347168, + "learning_rate": 9.535902525840235e-05, + "loss": 0.773, + "step": 4141 + }, + { + "epoch": 0.28064231994037536, + "grad_norm": 7.64580774307251, + "learning_rate": 9.535765623930454e-05, + "loss": 0.9999, + "step": 4142 + }, + { + "epoch": 0.28071007520834745, + "grad_norm": 6.032886028289795, + "learning_rate": 9.535628722020672e-05, + "loss": 0.8916, + "step": 4143 + }, + { + "epoch": 0.28077783047631955, + "grad_norm": 7.0441060066223145, + "learning_rate": 9.53549182011089e-05, + "loss": 0.9787, + "step": 4144 + }, + { + "epoch": 0.28084558574429164, + "grad_norm": 6.4428629875183105, + "learning_rate": 9.53535491820111e-05, + "loss": 0.6749, + "step": 4145 + }, + { + "epoch": 0.28091334101226373, + "grad_norm": 8.476522445678711, + "learning_rate": 9.535218016291328e-05, + "loss": 0.9408, + "step": 4146 + }, + { + "epoch": 0.28098109628023576, + "grad_norm": 8.720208168029785, + "learning_rate": 9.535081114381546e-05, + "loss": 0.8571, + "step": 4147 + }, + { + "epoch": 0.28104885154820786, + "grad_norm": 7.846646785736084, + "learning_rate": 9.534944212471765e-05, + "loss": 0.9193, + "step": 4148 + }, + { + "epoch": 0.28111660681617995, + "grad_norm": 7.848026275634766, + "learning_rate": 9.534807310561983e-05, + "loss": 1.1975, + "step": 4149 + }, + { + "epoch": 0.28118436208415204, + "grad_norm": 9.543595314025879, + "learning_rate": 9.534670408652201e-05, + "loss": 0.8769, + "step": 4150 + }, + { + "epoch": 0.28125211735212413, + "grad_norm": 7.252998352050781, + "learning_rate": 9.53453350674242e-05, + "loss": 1.029, + "step": 4151 + }, + { + "epoch": 0.2813198726200962, + "grad_norm": 6.271702289581299, + "learning_rate": 9.534396604832639e-05, + "loss": 0.8992, + "step": 4152 + }, + { + "epoch": 0.2813876278880683, + "grad_norm": 9.071548461914062, + "learning_rate": 9.534259702922857e-05, + "loss": 1.0518, + "step": 4153 + }, + { + "epoch": 0.2814553831560404, + "grad_norm": 7.452267169952393, + "learning_rate": 9.534122801013075e-05, + "loss": 0.9573, + "step": 4154 + }, + { + "epoch": 0.28152313842401244, + "grad_norm": 8.72459602355957, + "learning_rate": 9.533985899103293e-05, + "loss": 1.09, + "step": 4155 + }, + { + "epoch": 0.28159089369198453, + "grad_norm": 8.032079696655273, + "learning_rate": 9.533848997193511e-05, + "loss": 0.9595, + "step": 4156 + }, + { + "epoch": 0.2816586489599566, + "grad_norm": 7.622939109802246, + "learning_rate": 9.53371209528373e-05, + "loss": 0.8722, + "step": 4157 + }, + { + "epoch": 0.2817264042279287, + "grad_norm": 8.330899238586426, + "learning_rate": 9.533575193373948e-05, + "loss": 0.985, + "step": 4158 + }, + { + "epoch": 0.2817941594959008, + "grad_norm": 9.378608703613281, + "learning_rate": 9.533438291464166e-05, + "loss": 0.8458, + "step": 4159 + }, + { + "epoch": 0.2818619147638729, + "grad_norm": 7.305957317352295, + "learning_rate": 9.533301389554384e-05, + "loss": 0.9653, + "step": 4160 + }, + { + "epoch": 0.281929670031845, + "grad_norm": 8.557588577270508, + "learning_rate": 9.533164487644604e-05, + "loss": 0.8742, + "step": 4161 + }, + { + "epoch": 0.2819974252998171, + "grad_norm": 8.77814769744873, + "learning_rate": 9.533027585734822e-05, + "loss": 1.1782, + "step": 4162 + }, + { + "epoch": 0.2820651805677891, + "grad_norm": 7.212672233581543, + "learning_rate": 9.53289068382504e-05, + "loss": 0.6333, + "step": 4163 + }, + { + "epoch": 0.2821329358357612, + "grad_norm": 10.004176139831543, + "learning_rate": 9.532753781915258e-05, + "loss": 0.9544, + "step": 4164 + }, + { + "epoch": 0.2822006911037333, + "grad_norm": 7.577065467834473, + "learning_rate": 9.532616880005476e-05, + "loss": 1.0708, + "step": 4165 + }, + { + "epoch": 0.2822684463717054, + "grad_norm": 7.181521415710449, + "learning_rate": 9.532479978095695e-05, + "loss": 0.829, + "step": 4166 + }, + { + "epoch": 0.2823362016396775, + "grad_norm": 5.7575249671936035, + "learning_rate": 9.532343076185913e-05, + "loss": 0.9353, + "step": 4167 + }, + { + "epoch": 0.2824039569076496, + "grad_norm": 7.974564075469971, + "learning_rate": 9.532206174276131e-05, + "loss": 0.9976, + "step": 4168 + }, + { + "epoch": 0.28247171217562167, + "grad_norm": 8.870126724243164, + "learning_rate": 9.53206927236635e-05, + "loss": 0.9704, + "step": 4169 + }, + { + "epoch": 0.28253946744359376, + "grad_norm": 6.596248149871826, + "learning_rate": 9.531932370456569e-05, + "loss": 0.8763, + "step": 4170 + }, + { + "epoch": 0.2826072227115658, + "grad_norm": 7.725964546203613, + "learning_rate": 9.531795468546787e-05, + "loss": 1.4009, + "step": 4171 + }, + { + "epoch": 0.2826749779795379, + "grad_norm": 6.741204261779785, + "learning_rate": 9.531658566637005e-05, + "loss": 0.9103, + "step": 4172 + }, + { + "epoch": 0.28274273324751, + "grad_norm": 9.002605438232422, + "learning_rate": 9.531521664727223e-05, + "loss": 1.1446, + "step": 4173 + }, + { + "epoch": 0.2828104885154821, + "grad_norm": 7.860680103302002, + "learning_rate": 9.531384762817441e-05, + "loss": 1.081, + "step": 4174 + }, + { + "epoch": 0.28287824378345416, + "grad_norm": 7.939533233642578, + "learning_rate": 9.53124786090766e-05, + "loss": 0.8876, + "step": 4175 + }, + { + "epoch": 0.28294599905142626, + "grad_norm": 7.104232311248779, + "learning_rate": 9.531110958997878e-05, + "loss": 0.8687, + "step": 4176 + }, + { + "epoch": 0.28301375431939835, + "grad_norm": 9.095148086547852, + "learning_rate": 9.530974057088096e-05, + "loss": 0.873, + "step": 4177 + }, + { + "epoch": 0.28308150958737044, + "grad_norm": 6.869518280029297, + "learning_rate": 9.530837155178314e-05, + "loss": 0.8372, + "step": 4178 + }, + { + "epoch": 0.2831492648553425, + "grad_norm": 7.7245049476623535, + "learning_rate": 9.530700253268534e-05, + "loss": 1.0267, + "step": 4179 + }, + { + "epoch": 0.28321702012331457, + "grad_norm": 8.298705101013184, + "learning_rate": 9.530563351358752e-05, + "loss": 0.8919, + "step": 4180 + }, + { + "epoch": 0.28328477539128666, + "grad_norm": 6.176532745361328, + "learning_rate": 9.53042644944897e-05, + "loss": 0.8603, + "step": 4181 + }, + { + "epoch": 0.28335253065925875, + "grad_norm": 6.592353820800781, + "learning_rate": 9.530289547539188e-05, + "loss": 0.9415, + "step": 4182 + }, + { + "epoch": 0.28342028592723084, + "grad_norm": 7.962296962738037, + "learning_rate": 9.530152645629406e-05, + "loss": 0.8764, + "step": 4183 + }, + { + "epoch": 0.28348804119520293, + "grad_norm": 6.484033584594727, + "learning_rate": 9.530015743719625e-05, + "loss": 0.9577, + "step": 4184 + }, + { + "epoch": 0.283555796463175, + "grad_norm": 8.156810760498047, + "learning_rate": 9.529878841809843e-05, + "loss": 1.1628, + "step": 4185 + }, + { + "epoch": 0.2836235517311471, + "grad_norm": 6.916367053985596, + "learning_rate": 9.529741939900061e-05, + "loss": 1.1518, + "step": 4186 + }, + { + "epoch": 0.2836913069991192, + "grad_norm": 7.70388650894165, + "learning_rate": 9.52960503799028e-05, + "loss": 0.9309, + "step": 4187 + }, + { + "epoch": 0.28375906226709124, + "grad_norm": 8.770346641540527, + "learning_rate": 9.529468136080499e-05, + "loss": 0.891, + "step": 4188 + }, + { + "epoch": 0.28382681753506334, + "grad_norm": 7.100319862365723, + "learning_rate": 9.529331234170717e-05, + "loss": 1.0302, + "step": 4189 + }, + { + "epoch": 0.2838945728030354, + "grad_norm": 7.376253128051758, + "learning_rate": 9.529194332260935e-05, + "loss": 0.7906, + "step": 4190 + }, + { + "epoch": 0.2839623280710075, + "grad_norm": 10.125496864318848, + "learning_rate": 9.529057430351154e-05, + "loss": 0.9829, + "step": 4191 + }, + { + "epoch": 0.2840300833389796, + "grad_norm": 7.877635955810547, + "learning_rate": 9.528920528441372e-05, + "loss": 0.8803, + "step": 4192 + }, + { + "epoch": 0.2840978386069517, + "grad_norm": 8.096887588500977, + "learning_rate": 9.52878362653159e-05, + "loss": 0.9342, + "step": 4193 + }, + { + "epoch": 0.2841655938749238, + "grad_norm": 7.934850215911865, + "learning_rate": 9.52864672462181e-05, + "loss": 1.1699, + "step": 4194 + }, + { + "epoch": 0.2842333491428959, + "grad_norm": 8.237794876098633, + "learning_rate": 9.528509822712028e-05, + "loss": 1.1718, + "step": 4195 + }, + { + "epoch": 0.2843011044108679, + "grad_norm": 7.528624057769775, + "learning_rate": 9.528372920802246e-05, + "loss": 0.8229, + "step": 4196 + }, + { + "epoch": 0.28436885967884, + "grad_norm": 7.210242748260498, + "learning_rate": 9.528236018892464e-05, + "loss": 0.8875, + "step": 4197 + }, + { + "epoch": 0.2844366149468121, + "grad_norm": 7.630309581756592, + "learning_rate": 9.528099116982683e-05, + "loss": 0.8811, + "step": 4198 + }, + { + "epoch": 0.2845043702147842, + "grad_norm": 11.624275207519531, + "learning_rate": 9.527962215072901e-05, + "loss": 1.0702, + "step": 4199 + }, + { + "epoch": 0.2845721254827563, + "grad_norm": 7.52834415435791, + "learning_rate": 9.52782531316312e-05, + "loss": 0.9066, + "step": 4200 + }, + { + "epoch": 0.2846398807507284, + "grad_norm": 8.418506622314453, + "learning_rate": 9.527688411253337e-05, + "loss": 0.8147, + "step": 4201 + }, + { + "epoch": 0.28470763601870047, + "grad_norm": 9.67719554901123, + "learning_rate": 9.527551509343557e-05, + "loss": 0.976, + "step": 4202 + }, + { + "epoch": 0.28477539128667256, + "grad_norm": 7.107409954071045, + "learning_rate": 9.527414607433775e-05, + "loss": 0.695, + "step": 4203 + }, + { + "epoch": 0.2848431465546446, + "grad_norm": 7.840113162994385, + "learning_rate": 9.527277705523993e-05, + "loss": 0.9629, + "step": 4204 + }, + { + "epoch": 0.2849109018226167, + "grad_norm": 8.170151710510254, + "learning_rate": 9.527140803614211e-05, + "loss": 0.8715, + "step": 4205 + }, + { + "epoch": 0.2849786570905888, + "grad_norm": 7.882331848144531, + "learning_rate": 9.527003901704429e-05, + "loss": 1.0364, + "step": 4206 + }, + { + "epoch": 0.2850464123585609, + "grad_norm": 10.077646255493164, + "learning_rate": 9.526866999794648e-05, + "loss": 1.2749, + "step": 4207 + }, + { + "epoch": 0.28511416762653297, + "grad_norm": 8.749690055847168, + "learning_rate": 9.526730097884866e-05, + "loss": 0.9815, + "step": 4208 + }, + { + "epoch": 0.28518192289450506, + "grad_norm": 6.793465614318848, + "learning_rate": 9.526593195975084e-05, + "loss": 1.0165, + "step": 4209 + }, + { + "epoch": 0.28524967816247715, + "grad_norm": 7.186471939086914, + "learning_rate": 9.526456294065302e-05, + "loss": 0.911, + "step": 4210 + }, + { + "epoch": 0.28531743343044924, + "grad_norm": 8.116944313049316, + "learning_rate": 9.52631939215552e-05, + "loss": 1.051, + "step": 4211 + }, + { + "epoch": 0.2853851886984213, + "grad_norm": 5.9162750244140625, + "learning_rate": 9.52618249024574e-05, + "loss": 0.8165, + "step": 4212 + }, + { + "epoch": 0.28545294396639337, + "grad_norm": 7.20265531539917, + "learning_rate": 9.526045588335958e-05, + "loss": 1.0693, + "step": 4213 + }, + { + "epoch": 0.28552069923436546, + "grad_norm": 7.380153179168701, + "learning_rate": 9.525908686426176e-05, + "loss": 0.9611, + "step": 4214 + }, + { + "epoch": 0.28558845450233755, + "grad_norm": 7.211367607116699, + "learning_rate": 9.525771784516394e-05, + "loss": 0.7595, + "step": 4215 + }, + { + "epoch": 0.28565620977030964, + "grad_norm": 6.8030104637146, + "learning_rate": 9.525634882606613e-05, + "loss": 0.8281, + "step": 4216 + }, + { + "epoch": 0.28572396503828174, + "grad_norm": 7.774519443511963, + "learning_rate": 9.525497980696831e-05, + "loss": 0.963, + "step": 4217 + }, + { + "epoch": 0.2857917203062538, + "grad_norm": 8.210673332214355, + "learning_rate": 9.52536107878705e-05, + "loss": 1.3076, + "step": 4218 + }, + { + "epoch": 0.2858594755742259, + "grad_norm": 7.3878583908081055, + "learning_rate": 9.525224176877267e-05, + "loss": 0.9546, + "step": 4219 + }, + { + "epoch": 0.28592723084219795, + "grad_norm": 10.107827186584473, + "learning_rate": 9.525087274967485e-05, + "loss": 1.3039, + "step": 4220 + }, + { + "epoch": 0.28599498611017005, + "grad_norm": 7.974700450897217, + "learning_rate": 9.524950373057705e-05, + "loss": 0.9651, + "step": 4221 + }, + { + "epoch": 0.28606274137814214, + "grad_norm": 8.606413841247559, + "learning_rate": 9.524813471147923e-05, + "loss": 0.9459, + "step": 4222 + }, + { + "epoch": 0.28613049664611423, + "grad_norm": 6.7952799797058105, + "learning_rate": 9.524676569238141e-05, + "loss": 0.8818, + "step": 4223 + }, + { + "epoch": 0.2861982519140863, + "grad_norm": 7.2026214599609375, + "learning_rate": 9.524539667328359e-05, + "loss": 0.8578, + "step": 4224 + }, + { + "epoch": 0.2862660071820584, + "grad_norm": 9.135619163513184, + "learning_rate": 9.524402765418578e-05, + "loss": 1.1594, + "step": 4225 + }, + { + "epoch": 0.2863337624500305, + "grad_norm": 7.704957962036133, + "learning_rate": 9.524265863508796e-05, + "loss": 0.9715, + "step": 4226 + }, + { + "epoch": 0.2864015177180026, + "grad_norm": 6.570467948913574, + "learning_rate": 9.524128961599014e-05, + "loss": 0.8916, + "step": 4227 + }, + { + "epoch": 0.28646927298597463, + "grad_norm": 8.359966278076172, + "learning_rate": 9.523992059689232e-05, + "loss": 1.0021, + "step": 4228 + }, + { + "epoch": 0.2865370282539467, + "grad_norm": 7.011820316314697, + "learning_rate": 9.52385515777945e-05, + "loss": 0.8718, + "step": 4229 + }, + { + "epoch": 0.2866047835219188, + "grad_norm": 6.853650093078613, + "learning_rate": 9.52371825586967e-05, + "loss": 0.8041, + "step": 4230 + }, + { + "epoch": 0.2866725387898909, + "grad_norm": 6.95853328704834, + "learning_rate": 9.523581353959888e-05, + "loss": 0.8763, + "step": 4231 + }, + { + "epoch": 0.286740294057863, + "grad_norm": 9.217144012451172, + "learning_rate": 9.523444452050106e-05, + "loss": 1.046, + "step": 4232 + }, + { + "epoch": 0.2868080493258351, + "grad_norm": 8.338934898376465, + "learning_rate": 9.523307550140324e-05, + "loss": 0.9205, + "step": 4233 + }, + { + "epoch": 0.2868758045938072, + "grad_norm": 9.220541954040527, + "learning_rate": 9.523170648230543e-05, + "loss": 1.1468, + "step": 4234 + }, + { + "epoch": 0.2869435598617793, + "grad_norm": 7.641387462615967, + "learning_rate": 9.523033746320761e-05, + "loss": 1.118, + "step": 4235 + }, + { + "epoch": 0.28701131512975137, + "grad_norm": 7.52994441986084, + "learning_rate": 9.52289684441098e-05, + "loss": 0.851, + "step": 4236 + }, + { + "epoch": 0.2870790703977234, + "grad_norm": 8.712708473205566, + "learning_rate": 9.522759942501199e-05, + "loss": 1.1741, + "step": 4237 + }, + { + "epoch": 0.2871468256656955, + "grad_norm": 7.8549723625183105, + "learning_rate": 9.522623040591417e-05, + "loss": 0.9744, + "step": 4238 + }, + { + "epoch": 0.2872145809336676, + "grad_norm": 6.8177642822265625, + "learning_rate": 9.522486138681635e-05, + "loss": 1.0432, + "step": 4239 + }, + { + "epoch": 0.2872823362016397, + "grad_norm": 10.273691177368164, + "learning_rate": 9.522349236771854e-05, + "loss": 1.0777, + "step": 4240 + }, + { + "epoch": 0.28735009146961177, + "grad_norm": 8.261405944824219, + "learning_rate": 9.522212334862072e-05, + "loss": 0.9415, + "step": 4241 + }, + { + "epoch": 0.28741784673758386, + "grad_norm": 10.067824363708496, + "learning_rate": 9.52207543295229e-05, + "loss": 1.1091, + "step": 4242 + }, + { + "epoch": 0.28748560200555595, + "grad_norm": 9.639914512634277, + "learning_rate": 9.521938531042508e-05, + "loss": 0.9534, + "step": 4243 + }, + { + "epoch": 0.28755335727352804, + "grad_norm": 6.993269920349121, + "learning_rate": 9.521801629132728e-05, + "loss": 1.1193, + "step": 4244 + }, + { + "epoch": 0.2876211125415001, + "grad_norm": 7.779829502105713, + "learning_rate": 9.521664727222946e-05, + "loss": 0.8744, + "step": 4245 + }, + { + "epoch": 0.28768886780947217, + "grad_norm": 7.070007801055908, + "learning_rate": 9.521527825313164e-05, + "loss": 0.8609, + "step": 4246 + }, + { + "epoch": 0.28775662307744426, + "grad_norm": 6.091519355773926, + "learning_rate": 9.521390923403382e-05, + "loss": 0.8018, + "step": 4247 + }, + { + "epoch": 0.28782437834541635, + "grad_norm": 6.76396369934082, + "learning_rate": 9.521254021493601e-05, + "loss": 0.8509, + "step": 4248 + }, + { + "epoch": 0.28789213361338845, + "grad_norm": 8.478080749511719, + "learning_rate": 9.521117119583819e-05, + "loss": 1.0355, + "step": 4249 + }, + { + "epoch": 0.28795988888136054, + "grad_norm": 7.830933094024658, + "learning_rate": 9.520980217674037e-05, + "loss": 0.8699, + "step": 4250 + }, + { + "epoch": 0.28802764414933263, + "grad_norm": 8.354218482971191, + "learning_rate": 9.520843315764255e-05, + "loss": 0.9228, + "step": 4251 + }, + { + "epoch": 0.2880953994173047, + "grad_norm": 8.529300689697266, + "learning_rate": 9.520706413854473e-05, + "loss": 1.1215, + "step": 4252 + }, + { + "epoch": 0.28816315468527676, + "grad_norm": 6.831529140472412, + "learning_rate": 9.520569511944693e-05, + "loss": 1.0665, + "step": 4253 + }, + { + "epoch": 0.28823090995324885, + "grad_norm": 7.824577808380127, + "learning_rate": 9.520432610034911e-05, + "loss": 0.8864, + "step": 4254 + }, + { + "epoch": 0.28829866522122094, + "grad_norm": 7.795472621917725, + "learning_rate": 9.520295708125129e-05, + "loss": 1.0775, + "step": 4255 + }, + { + "epoch": 0.28836642048919303, + "grad_norm": 7.210735321044922, + "learning_rate": 9.520158806215347e-05, + "loss": 1.0244, + "step": 4256 + }, + { + "epoch": 0.2884341757571651, + "grad_norm": 7.224759101867676, + "learning_rate": 9.520021904305566e-05, + "loss": 1.0527, + "step": 4257 + }, + { + "epoch": 0.2885019310251372, + "grad_norm": 7.608676910400391, + "learning_rate": 9.519885002395784e-05, + "loss": 0.7978, + "step": 4258 + }, + { + "epoch": 0.2885696862931093, + "grad_norm": 7.426436424255371, + "learning_rate": 9.519748100486002e-05, + "loss": 0.859, + "step": 4259 + }, + { + "epoch": 0.2886374415610814, + "grad_norm": 7.585330963134766, + "learning_rate": 9.51961119857622e-05, + "loss": 1.1091, + "step": 4260 + }, + { + "epoch": 0.28870519682905343, + "grad_norm": 6.930294990539551, + "learning_rate": 9.519474296666438e-05, + "loss": 0.8862, + "step": 4261 + }, + { + "epoch": 0.2887729520970255, + "grad_norm": 8.092456817626953, + "learning_rate": 9.519337394756658e-05, + "loss": 1.051, + "step": 4262 + }, + { + "epoch": 0.2888407073649976, + "grad_norm": 7.897385597229004, + "learning_rate": 9.519200492846876e-05, + "loss": 1.0589, + "step": 4263 + }, + { + "epoch": 0.2889084626329697, + "grad_norm": 7.344932556152344, + "learning_rate": 9.519063590937094e-05, + "loss": 1.0319, + "step": 4264 + }, + { + "epoch": 0.2889762179009418, + "grad_norm": 8.676694869995117, + "learning_rate": 9.518926689027312e-05, + "loss": 0.9119, + "step": 4265 + }, + { + "epoch": 0.2890439731689139, + "grad_norm": 7.291116714477539, + "learning_rate": 9.51878978711753e-05, + "loss": 0.8249, + "step": 4266 + }, + { + "epoch": 0.289111728436886, + "grad_norm": 7.467698574066162, + "learning_rate": 9.51865288520775e-05, + "loss": 0.9807, + "step": 4267 + }, + { + "epoch": 0.2891794837048581, + "grad_norm": 7.744437217712402, + "learning_rate": 9.518515983297967e-05, + "loss": 0.8792, + "step": 4268 + }, + { + "epoch": 0.2892472389728301, + "grad_norm": 6.8492560386657715, + "learning_rate": 9.518379081388185e-05, + "loss": 0.9216, + "step": 4269 + }, + { + "epoch": 0.2893149942408022, + "grad_norm": 7.247424602508545, + "learning_rate": 9.518242179478403e-05, + "loss": 1.0865, + "step": 4270 + }, + { + "epoch": 0.2893827495087743, + "grad_norm": 8.125252723693848, + "learning_rate": 9.518105277568623e-05, + "loss": 1.0298, + "step": 4271 + }, + { + "epoch": 0.2894505047767464, + "grad_norm": 6.578275680541992, + "learning_rate": 9.517968375658841e-05, + "loss": 0.9353, + "step": 4272 + }, + { + "epoch": 0.2895182600447185, + "grad_norm": 8.099616050720215, + "learning_rate": 9.517831473749059e-05, + "loss": 1.038, + "step": 4273 + }, + { + "epoch": 0.28958601531269057, + "grad_norm": 7.529900074005127, + "learning_rate": 9.517694571839277e-05, + "loss": 0.9753, + "step": 4274 + }, + { + "epoch": 0.28965377058066266, + "grad_norm": 7.521812915802002, + "learning_rate": 9.517557669929495e-05, + "loss": 0.972, + "step": 4275 + }, + { + "epoch": 0.28972152584863475, + "grad_norm": 7.780272483825684, + "learning_rate": 9.517420768019714e-05, + "loss": 0.9927, + "step": 4276 + }, + { + "epoch": 0.2897892811166068, + "grad_norm": 7.982085704803467, + "learning_rate": 9.517283866109932e-05, + "loss": 0.9462, + "step": 4277 + }, + { + "epoch": 0.2898570363845789, + "grad_norm": 7.325984477996826, + "learning_rate": 9.51714696420015e-05, + "loss": 1.0814, + "step": 4278 + }, + { + "epoch": 0.289924791652551, + "grad_norm": 7.833248138427734, + "learning_rate": 9.517010062290368e-05, + "loss": 1.1736, + "step": 4279 + }, + { + "epoch": 0.28999254692052306, + "grad_norm": 6.983424186706543, + "learning_rate": 9.516873160380588e-05, + "loss": 0.9662, + "step": 4280 + }, + { + "epoch": 0.29006030218849516, + "grad_norm": 7.954293251037598, + "learning_rate": 9.516736258470806e-05, + "loss": 0.8449, + "step": 4281 + }, + { + "epoch": 0.29012805745646725, + "grad_norm": 7.47749137878418, + "learning_rate": 9.516599356561024e-05, + "loss": 0.7869, + "step": 4282 + }, + { + "epoch": 0.29019581272443934, + "grad_norm": 8.70733642578125, + "learning_rate": 9.516462454651243e-05, + "loss": 0.915, + "step": 4283 + }, + { + "epoch": 0.29026356799241143, + "grad_norm": 7.003527641296387, + "learning_rate": 9.516325552741461e-05, + "loss": 0.8543, + "step": 4284 + }, + { + "epoch": 0.2903313232603835, + "grad_norm": 8.230785369873047, + "learning_rate": 9.51618865083168e-05, + "loss": 1.0022, + "step": 4285 + }, + { + "epoch": 0.29039907852835556, + "grad_norm": 6.20754337310791, + "learning_rate": 9.516051748921899e-05, + "loss": 0.8967, + "step": 4286 + }, + { + "epoch": 0.29046683379632765, + "grad_norm": 7.0158538818359375, + "learning_rate": 9.515914847012117e-05, + "loss": 0.9162, + "step": 4287 + }, + { + "epoch": 0.29053458906429974, + "grad_norm": 6.215134143829346, + "learning_rate": 9.515777945102335e-05, + "loss": 0.7362, + "step": 4288 + }, + { + "epoch": 0.29060234433227183, + "grad_norm": 7.366815090179443, + "learning_rate": 9.515641043192553e-05, + "loss": 1.0174, + "step": 4289 + }, + { + "epoch": 0.2906700996002439, + "grad_norm": 7.514114856719971, + "learning_rate": 9.515504141282772e-05, + "loss": 0.9979, + "step": 4290 + }, + { + "epoch": 0.290737854868216, + "grad_norm": 6.899611949920654, + "learning_rate": 9.51536723937299e-05, + "loss": 0.6909, + "step": 4291 + }, + { + "epoch": 0.2908056101361881, + "grad_norm": 7.254974842071533, + "learning_rate": 9.515230337463208e-05, + "loss": 1.1744, + "step": 4292 + }, + { + "epoch": 0.2908733654041602, + "grad_norm": 7.3820719718933105, + "learning_rate": 9.515093435553426e-05, + "loss": 1.0514, + "step": 4293 + }, + { + "epoch": 0.29094112067213224, + "grad_norm": 7.556882858276367, + "learning_rate": 9.514956533643646e-05, + "loss": 1.0515, + "step": 4294 + }, + { + "epoch": 0.29100887594010433, + "grad_norm": 7.479793548583984, + "learning_rate": 9.514819631733864e-05, + "loss": 0.9634, + "step": 4295 + }, + { + "epoch": 0.2910766312080764, + "grad_norm": 6.750072956085205, + "learning_rate": 9.514682729824082e-05, + "loss": 0.8536, + "step": 4296 + }, + { + "epoch": 0.2911443864760485, + "grad_norm": 6.5008931159973145, + "learning_rate": 9.5145458279143e-05, + "loss": 0.8414, + "step": 4297 + }, + { + "epoch": 0.2912121417440206, + "grad_norm": 5.633193016052246, + "learning_rate": 9.514408926004518e-05, + "loss": 0.9628, + "step": 4298 + }, + { + "epoch": 0.2912798970119927, + "grad_norm": 7.052159309387207, + "learning_rate": 9.514272024094737e-05, + "loss": 0.9612, + "step": 4299 + }, + { + "epoch": 0.2913476522799648, + "grad_norm": 6.944911003112793, + "learning_rate": 9.514135122184955e-05, + "loss": 0.8237, + "step": 4300 + }, + { + "epoch": 0.2914154075479369, + "grad_norm": 6.553284645080566, + "learning_rate": 9.513998220275173e-05, + "loss": 1.103, + "step": 4301 + }, + { + "epoch": 0.2914831628159089, + "grad_norm": 6.414531707763672, + "learning_rate": 9.513861318365391e-05, + "loss": 1.163, + "step": 4302 + }, + { + "epoch": 0.291550918083881, + "grad_norm": 9.551664352416992, + "learning_rate": 9.513724416455611e-05, + "loss": 1.0118, + "step": 4303 + }, + { + "epoch": 0.2916186733518531, + "grad_norm": 8.332045555114746, + "learning_rate": 9.513587514545829e-05, + "loss": 1.0938, + "step": 4304 + }, + { + "epoch": 0.2916864286198252, + "grad_norm": 7.512511253356934, + "learning_rate": 9.513450612636047e-05, + "loss": 0.9499, + "step": 4305 + }, + { + "epoch": 0.2917541838877973, + "grad_norm": 7.233335494995117, + "learning_rate": 9.513313710726265e-05, + "loss": 0.9298, + "step": 4306 + }, + { + "epoch": 0.29182193915576937, + "grad_norm": 6.711233615875244, + "learning_rate": 9.513176808816483e-05, + "loss": 0.8409, + "step": 4307 + }, + { + "epoch": 0.29188969442374146, + "grad_norm": 6.433267593383789, + "learning_rate": 9.513039906906702e-05, + "loss": 0.7654, + "step": 4308 + }, + { + "epoch": 0.29195744969171356, + "grad_norm": 7.049487113952637, + "learning_rate": 9.51290300499692e-05, + "loss": 0.8363, + "step": 4309 + }, + { + "epoch": 0.2920252049596856, + "grad_norm": 7.030927658081055, + "learning_rate": 9.512766103087138e-05, + "loss": 0.8536, + "step": 4310 + }, + { + "epoch": 0.2920929602276577, + "grad_norm": 7.723023414611816, + "learning_rate": 9.512629201177356e-05, + "loss": 0.8738, + "step": 4311 + }, + { + "epoch": 0.2921607154956298, + "grad_norm": 6.293034553527832, + "learning_rate": 9.512492299267576e-05, + "loss": 0.8247, + "step": 4312 + }, + { + "epoch": 0.29222847076360187, + "grad_norm": 7.551876544952393, + "learning_rate": 9.512355397357794e-05, + "loss": 0.9144, + "step": 4313 + }, + { + "epoch": 0.29229622603157396, + "grad_norm": 7.78217077255249, + "learning_rate": 9.512218495448012e-05, + "loss": 0.911, + "step": 4314 + }, + { + "epoch": 0.29236398129954605, + "grad_norm": 6.316245079040527, + "learning_rate": 9.51208159353823e-05, + "loss": 0.7426, + "step": 4315 + }, + { + "epoch": 0.29243173656751814, + "grad_norm": 6.730735778808594, + "learning_rate": 9.511944691628448e-05, + "loss": 0.8938, + "step": 4316 + }, + { + "epoch": 0.29249949183549023, + "grad_norm": 8.128830909729004, + "learning_rate": 9.511807789718667e-05, + "loss": 1.129, + "step": 4317 + }, + { + "epoch": 0.29256724710346227, + "grad_norm": 8.023150444030762, + "learning_rate": 9.511670887808885e-05, + "loss": 1.1377, + "step": 4318 + }, + { + "epoch": 0.29263500237143436, + "grad_norm": 9.075041770935059, + "learning_rate": 9.511533985899103e-05, + "loss": 1.0358, + "step": 4319 + }, + { + "epoch": 0.29270275763940645, + "grad_norm": 9.679409980773926, + "learning_rate": 9.511397083989321e-05, + "loss": 0.9415, + "step": 4320 + }, + { + "epoch": 0.29277051290737854, + "grad_norm": 7.316888809204102, + "learning_rate": 9.51126018207954e-05, + "loss": 0.9422, + "step": 4321 + }, + { + "epoch": 0.29283826817535064, + "grad_norm": 8.017754554748535, + "learning_rate": 9.511123280169759e-05, + "loss": 0.9685, + "step": 4322 + }, + { + "epoch": 0.2929060234433227, + "grad_norm": 7.990202903747559, + "learning_rate": 9.510986378259977e-05, + "loss": 0.9028, + "step": 4323 + }, + { + "epoch": 0.2929737787112948, + "grad_norm": 7.538297653198242, + "learning_rate": 9.510849476350195e-05, + "loss": 0.8146, + "step": 4324 + }, + { + "epoch": 0.2930415339792669, + "grad_norm": 6.794483184814453, + "learning_rate": 9.510712574440413e-05, + "loss": 0.8859, + "step": 4325 + }, + { + "epoch": 0.29310928924723895, + "grad_norm": 9.820608139038086, + "learning_rate": 9.510575672530632e-05, + "loss": 1.0486, + "step": 4326 + }, + { + "epoch": 0.29317704451521104, + "grad_norm": 8.107736587524414, + "learning_rate": 9.51043877062085e-05, + "loss": 0.9978, + "step": 4327 + }, + { + "epoch": 0.29324479978318313, + "grad_norm": 8.129117965698242, + "learning_rate": 9.510301868711068e-05, + "loss": 0.9966, + "step": 4328 + }, + { + "epoch": 0.2933125550511552, + "grad_norm": 7.15314245223999, + "learning_rate": 9.510164966801288e-05, + "loss": 0.8078, + "step": 4329 + }, + { + "epoch": 0.2933803103191273, + "grad_norm": 7.951611518859863, + "learning_rate": 9.510028064891506e-05, + "loss": 0.8566, + "step": 4330 + }, + { + "epoch": 0.2934480655870994, + "grad_norm": 8.350584030151367, + "learning_rate": 9.509891162981724e-05, + "loss": 1.1757, + "step": 4331 + }, + { + "epoch": 0.2935158208550715, + "grad_norm": 7.556168079376221, + "learning_rate": 9.509754261071943e-05, + "loss": 1.175, + "step": 4332 + }, + { + "epoch": 0.2935835761230436, + "grad_norm": 8.102100372314453, + "learning_rate": 9.509617359162161e-05, + "loss": 0.8948, + "step": 4333 + }, + { + "epoch": 0.2936513313910156, + "grad_norm": 6.762083530426025, + "learning_rate": 9.509480457252379e-05, + "loss": 0.9597, + "step": 4334 + }, + { + "epoch": 0.2937190866589877, + "grad_norm": 7.2654948234558105, + "learning_rate": 9.509343555342599e-05, + "loss": 0.9352, + "step": 4335 + }, + { + "epoch": 0.2937868419269598, + "grad_norm": 8.114389419555664, + "learning_rate": 9.509206653432817e-05, + "loss": 0.9758, + "step": 4336 + }, + { + "epoch": 0.2938545971949319, + "grad_norm": 6.972601890563965, + "learning_rate": 9.509069751523035e-05, + "loss": 0.9322, + "step": 4337 + }, + { + "epoch": 0.293922352462904, + "grad_norm": 7.261653423309326, + "learning_rate": 9.508932849613253e-05, + "loss": 0.8714, + "step": 4338 + }, + { + "epoch": 0.2939901077308761, + "grad_norm": 8.368372917175293, + "learning_rate": 9.508795947703471e-05, + "loss": 1.1235, + "step": 4339 + }, + { + "epoch": 0.2940578629988482, + "grad_norm": 8.295226097106934, + "learning_rate": 9.50865904579369e-05, + "loss": 1.0647, + "step": 4340 + }, + { + "epoch": 0.29412561826682027, + "grad_norm": 8.720281600952148, + "learning_rate": 9.508522143883908e-05, + "loss": 1.0779, + "step": 4341 + }, + { + "epoch": 0.29419337353479236, + "grad_norm": 6.721040725708008, + "learning_rate": 9.508385241974126e-05, + "loss": 1.0219, + "step": 4342 + }, + { + "epoch": 0.2942611288027644, + "grad_norm": 6.298686504364014, + "learning_rate": 9.508248340064344e-05, + "loss": 0.8011, + "step": 4343 + }, + { + "epoch": 0.2943288840707365, + "grad_norm": 6.915674686431885, + "learning_rate": 9.508111438154562e-05, + "loss": 0.9702, + "step": 4344 + }, + { + "epoch": 0.2943966393387086, + "grad_norm": 6.907165050506592, + "learning_rate": 9.507974536244782e-05, + "loss": 0.861, + "step": 4345 + }, + { + "epoch": 0.29446439460668067, + "grad_norm": 7.540262699127197, + "learning_rate": 9.507837634335e-05, + "loss": 0.9426, + "step": 4346 + }, + { + "epoch": 0.29453214987465276, + "grad_norm": 7.145787715911865, + "learning_rate": 9.507700732425218e-05, + "loss": 0.911, + "step": 4347 + }, + { + "epoch": 0.29459990514262485, + "grad_norm": 9.937151908874512, + "learning_rate": 9.507563830515436e-05, + "loss": 0.9825, + "step": 4348 + }, + { + "epoch": 0.29466766041059694, + "grad_norm": 7.125603199005127, + "learning_rate": 9.507426928605655e-05, + "loss": 0.855, + "step": 4349 + }, + { + "epoch": 0.29473541567856903, + "grad_norm": 7.014605522155762, + "learning_rate": 9.507290026695873e-05, + "loss": 0.9282, + "step": 4350 + }, + { + "epoch": 0.29480317094654107, + "grad_norm": 8.753725051879883, + "learning_rate": 9.507153124786091e-05, + "loss": 1.1047, + "step": 4351 + }, + { + "epoch": 0.29487092621451316, + "grad_norm": 7.921840190887451, + "learning_rate": 9.50701622287631e-05, + "loss": 0.7857, + "step": 4352 + }, + { + "epoch": 0.29493868148248525, + "grad_norm": 6.762521266937256, + "learning_rate": 9.506879320966527e-05, + "loss": 0.8537, + "step": 4353 + }, + { + "epoch": 0.29500643675045735, + "grad_norm": 8.441102981567383, + "learning_rate": 9.506742419056747e-05, + "loss": 1.1356, + "step": 4354 + }, + { + "epoch": 0.29507419201842944, + "grad_norm": 6.748636245727539, + "learning_rate": 9.506605517146965e-05, + "loss": 1.0962, + "step": 4355 + }, + { + "epoch": 0.29514194728640153, + "grad_norm": 8.168306350708008, + "learning_rate": 9.506468615237183e-05, + "loss": 0.9628, + "step": 4356 + }, + { + "epoch": 0.2952097025543736, + "grad_norm": 6.545300483703613, + "learning_rate": 9.506331713327401e-05, + "loss": 1.0041, + "step": 4357 + }, + { + "epoch": 0.2952774578223457, + "grad_norm": 6.958950996398926, + "learning_rate": 9.50619481141762e-05, + "loss": 0.9742, + "step": 4358 + }, + { + "epoch": 0.29534521309031775, + "grad_norm": 6.817789554595947, + "learning_rate": 9.506057909507838e-05, + "loss": 0.8281, + "step": 4359 + }, + { + "epoch": 0.29541296835828984, + "grad_norm": 8.255404472351074, + "learning_rate": 9.505921007598056e-05, + "loss": 0.9171, + "step": 4360 + }, + { + "epoch": 0.29548072362626193, + "grad_norm": 7.552668571472168, + "learning_rate": 9.505784105688274e-05, + "loss": 0.8665, + "step": 4361 + }, + { + "epoch": 0.295548478894234, + "grad_norm": 6.980686664581299, + "learning_rate": 9.505647203778492e-05, + "loss": 0.9461, + "step": 4362 + }, + { + "epoch": 0.2956162341622061, + "grad_norm": 6.765429973602295, + "learning_rate": 9.505510301868712e-05, + "loss": 0.7946, + "step": 4363 + }, + { + "epoch": 0.2956839894301782, + "grad_norm": 8.403508186340332, + "learning_rate": 9.50537339995893e-05, + "loss": 0.9015, + "step": 4364 + }, + { + "epoch": 0.2957517446981503, + "grad_norm": 7.822012901306152, + "learning_rate": 9.505236498049148e-05, + "loss": 1.0313, + "step": 4365 + }, + { + "epoch": 0.2958194999661224, + "grad_norm": 9.843314170837402, + "learning_rate": 9.505099596139366e-05, + "loss": 1.11, + "step": 4366 + }, + { + "epoch": 0.2958872552340944, + "grad_norm": 8.155379295349121, + "learning_rate": 9.504962694229584e-05, + "loss": 1.1565, + "step": 4367 + }, + { + "epoch": 0.2959550105020665, + "grad_norm": 8.446005821228027, + "learning_rate": 9.504825792319803e-05, + "loss": 1.0588, + "step": 4368 + }, + { + "epoch": 0.2960227657700386, + "grad_norm": 7.244038105010986, + "learning_rate": 9.504688890410021e-05, + "loss": 0.8648, + "step": 4369 + }, + { + "epoch": 0.2960905210380107, + "grad_norm": 8.324630737304688, + "learning_rate": 9.50455198850024e-05, + "loss": 0.9862, + "step": 4370 + }, + { + "epoch": 0.2961582763059828, + "grad_norm": 7.308585166931152, + "learning_rate": 9.504415086590457e-05, + "loss": 1.082, + "step": 4371 + }, + { + "epoch": 0.2962260315739549, + "grad_norm": 8.17287826538086, + "learning_rate": 9.504278184680677e-05, + "loss": 1.0359, + "step": 4372 + }, + { + "epoch": 0.296293786841927, + "grad_norm": 6.8774943351745605, + "learning_rate": 9.504141282770895e-05, + "loss": 0.904, + "step": 4373 + }, + { + "epoch": 0.29636154210989907, + "grad_norm": 8.641596794128418, + "learning_rate": 9.504004380861113e-05, + "loss": 0.9822, + "step": 4374 + }, + { + "epoch": 0.2964292973778711, + "grad_norm": 7.289584159851074, + "learning_rate": 9.503867478951331e-05, + "loss": 1.0274, + "step": 4375 + }, + { + "epoch": 0.2964970526458432, + "grad_norm": 9.820521354675293, + "learning_rate": 9.50373057704155e-05, + "loss": 1.0895, + "step": 4376 + }, + { + "epoch": 0.2965648079138153, + "grad_norm": 8.6587553024292, + "learning_rate": 9.503593675131768e-05, + "loss": 0.7513, + "step": 4377 + }, + { + "epoch": 0.2966325631817874, + "grad_norm": 7.444339752197266, + "learning_rate": 9.503456773221986e-05, + "loss": 1.0892, + "step": 4378 + }, + { + "epoch": 0.29670031844975947, + "grad_norm": 6.4494948387146, + "learning_rate": 9.503319871312206e-05, + "loss": 0.9465, + "step": 4379 + }, + { + "epoch": 0.29676807371773156, + "grad_norm": 7.744002819061279, + "learning_rate": 9.503182969402424e-05, + "loss": 1.0613, + "step": 4380 + }, + { + "epoch": 0.29683582898570365, + "grad_norm": 8.152511596679688, + "learning_rate": 9.503046067492642e-05, + "loss": 1.0115, + "step": 4381 + }, + { + "epoch": 0.29690358425367575, + "grad_norm": 6.076781749725342, + "learning_rate": 9.502909165582861e-05, + "loss": 0.8255, + "step": 4382 + }, + { + "epoch": 0.2969713395216478, + "grad_norm": 6.787397861480713, + "learning_rate": 9.502772263673079e-05, + "loss": 1.1132, + "step": 4383 + }, + { + "epoch": 0.2970390947896199, + "grad_norm": 8.592107772827148, + "learning_rate": 9.502635361763297e-05, + "loss": 0.8715, + "step": 4384 + }, + { + "epoch": 0.29710685005759196, + "grad_norm": 7.620471954345703, + "learning_rate": 9.502498459853515e-05, + "loss": 0.9521, + "step": 4385 + }, + { + "epoch": 0.29717460532556406, + "grad_norm": 8.026017189025879, + "learning_rate": 9.502361557943735e-05, + "loss": 0.8575, + "step": 4386 + }, + { + "epoch": 0.29724236059353615, + "grad_norm": 8.82768726348877, + "learning_rate": 9.502224656033953e-05, + "loss": 1.0642, + "step": 4387 + }, + { + "epoch": 0.29731011586150824, + "grad_norm": 7.179310321807861, + "learning_rate": 9.502087754124171e-05, + "loss": 0.9161, + "step": 4388 + }, + { + "epoch": 0.29737787112948033, + "grad_norm": 7.4363861083984375, + "learning_rate": 9.501950852214389e-05, + "loss": 1.0825, + "step": 4389 + }, + { + "epoch": 0.2974456263974524, + "grad_norm": 6.218450546264648, + "learning_rate": 9.501813950304608e-05, + "loss": 0.8461, + "step": 4390 + }, + { + "epoch": 0.2975133816654245, + "grad_norm": 6.540037631988525, + "learning_rate": 9.501677048394826e-05, + "loss": 0.9099, + "step": 4391 + }, + { + "epoch": 0.29758113693339655, + "grad_norm": 9.617361068725586, + "learning_rate": 9.501540146485044e-05, + "loss": 0.9205, + "step": 4392 + }, + { + "epoch": 0.29764889220136864, + "grad_norm": 8.519254684448242, + "learning_rate": 9.501403244575262e-05, + "loss": 0.9774, + "step": 4393 + }, + { + "epoch": 0.29771664746934073, + "grad_norm": 8.101237297058105, + "learning_rate": 9.50126634266548e-05, + "loss": 0.9019, + "step": 4394 + }, + { + "epoch": 0.2977844027373128, + "grad_norm": 6.703762531280518, + "learning_rate": 9.5011294407557e-05, + "loss": 0.9486, + "step": 4395 + }, + { + "epoch": 0.2978521580052849, + "grad_norm": 8.651348114013672, + "learning_rate": 9.500992538845918e-05, + "loss": 0.9189, + "step": 4396 + }, + { + "epoch": 0.297919913273257, + "grad_norm": 7.783169269561768, + "learning_rate": 9.500855636936136e-05, + "loss": 0.933, + "step": 4397 + }, + { + "epoch": 0.2979876685412291, + "grad_norm": 8.011998176574707, + "learning_rate": 9.500718735026354e-05, + "loss": 1.0642, + "step": 4398 + }, + { + "epoch": 0.2980554238092012, + "grad_norm": 10.782837867736816, + "learning_rate": 9.500581833116572e-05, + "loss": 1.1406, + "step": 4399 + }, + { + "epoch": 0.29812317907717323, + "grad_norm": 7.699460029602051, + "learning_rate": 9.500444931206791e-05, + "loss": 0.9627, + "step": 4400 + }, + { + "epoch": 0.2981909343451453, + "grad_norm": 8.144030570983887, + "learning_rate": 9.500308029297009e-05, + "loss": 0.8278, + "step": 4401 + }, + { + "epoch": 0.2982586896131174, + "grad_norm": 7.882628917694092, + "learning_rate": 9.500171127387227e-05, + "loss": 0.7924, + "step": 4402 + }, + { + "epoch": 0.2983264448810895, + "grad_norm": 6.887333393096924, + "learning_rate": 9.500034225477445e-05, + "loss": 0.8385, + "step": 4403 + }, + { + "epoch": 0.2983942001490616, + "grad_norm": 8.446837425231934, + "learning_rate": 9.499897323567665e-05, + "loss": 1.0352, + "step": 4404 + }, + { + "epoch": 0.2984619554170337, + "grad_norm": 8.389452934265137, + "learning_rate": 9.499760421657883e-05, + "loss": 0.8053, + "step": 4405 + }, + { + "epoch": 0.2985297106850058, + "grad_norm": 7.09201717376709, + "learning_rate": 9.499623519748101e-05, + "loss": 0.999, + "step": 4406 + }, + { + "epoch": 0.29859746595297787, + "grad_norm": 9.670032501220703, + "learning_rate": 9.499486617838319e-05, + "loss": 0.8558, + "step": 4407 + }, + { + "epoch": 0.2986652212209499, + "grad_norm": 9.050987243652344, + "learning_rate": 9.499349715928537e-05, + "loss": 1.0356, + "step": 4408 + }, + { + "epoch": 0.298732976488922, + "grad_norm": 6.220524787902832, + "learning_rate": 9.499212814018756e-05, + "loss": 0.8278, + "step": 4409 + }, + { + "epoch": 0.2988007317568941, + "grad_norm": 7.464169979095459, + "learning_rate": 9.499075912108974e-05, + "loss": 0.963, + "step": 4410 + }, + { + "epoch": 0.2988684870248662, + "grad_norm": 8.479538917541504, + "learning_rate": 9.498939010199192e-05, + "loss": 1.1108, + "step": 4411 + }, + { + "epoch": 0.2989362422928383, + "grad_norm": 8.23353385925293, + "learning_rate": 9.49880210828941e-05, + "loss": 1.0156, + "step": 4412 + }, + { + "epoch": 0.29900399756081036, + "grad_norm": 7.2161335945129395, + "learning_rate": 9.49866520637963e-05, + "loss": 0.9964, + "step": 4413 + }, + { + "epoch": 0.29907175282878246, + "grad_norm": 7.893895149230957, + "learning_rate": 9.498528304469848e-05, + "loss": 0.9353, + "step": 4414 + }, + { + "epoch": 0.29913950809675455, + "grad_norm": 8.448126792907715, + "learning_rate": 9.498391402560066e-05, + "loss": 0.9488, + "step": 4415 + }, + { + "epoch": 0.2992072633647266, + "grad_norm": 8.160601615905762, + "learning_rate": 9.498254500650284e-05, + "loss": 0.8934, + "step": 4416 + }, + { + "epoch": 0.2992750186326987, + "grad_norm": 7.6815266609191895, + "learning_rate": 9.498117598740502e-05, + "loss": 0.8938, + "step": 4417 + }, + { + "epoch": 0.29934277390067077, + "grad_norm": 6.795403480529785, + "learning_rate": 9.497980696830721e-05, + "loss": 0.7074, + "step": 4418 + }, + { + "epoch": 0.29941052916864286, + "grad_norm": 7.904134750366211, + "learning_rate": 9.497843794920939e-05, + "loss": 0.8747, + "step": 4419 + }, + { + "epoch": 0.29947828443661495, + "grad_norm": 7.776915073394775, + "learning_rate": 9.497706893011157e-05, + "loss": 1.1141, + "step": 4420 + }, + { + "epoch": 0.29954603970458704, + "grad_norm": 7.425099849700928, + "learning_rate": 9.497569991101375e-05, + "loss": 0.8826, + "step": 4421 + }, + { + "epoch": 0.29961379497255913, + "grad_norm": 10.530119895935059, + "learning_rate": 9.497433089191595e-05, + "loss": 1.0893, + "step": 4422 + }, + { + "epoch": 0.2996815502405312, + "grad_norm": 7.118913650512695, + "learning_rate": 9.497296187281813e-05, + "loss": 0.9179, + "step": 4423 + }, + { + "epoch": 0.29974930550850326, + "grad_norm": 6.6407060623168945, + "learning_rate": 9.497159285372031e-05, + "loss": 0.8763, + "step": 4424 + }, + { + "epoch": 0.29981706077647535, + "grad_norm": 6.975287437438965, + "learning_rate": 9.49702238346225e-05, + "loss": 0.9192, + "step": 4425 + }, + { + "epoch": 0.29988481604444744, + "grad_norm": 11.773819923400879, + "learning_rate": 9.496885481552468e-05, + "loss": 1.3173, + "step": 4426 + }, + { + "epoch": 0.29995257131241954, + "grad_norm": 13.157849311828613, + "learning_rate": 9.496748579642686e-05, + "loss": 0.8773, + "step": 4427 + }, + { + "epoch": 0.3000203265803916, + "grad_norm": 7.372555732727051, + "learning_rate": 9.496611677732906e-05, + "loss": 1.0028, + "step": 4428 + }, + { + "epoch": 0.3000880818483637, + "grad_norm": 8.804738998413086, + "learning_rate": 9.496474775823124e-05, + "loss": 1.0684, + "step": 4429 + }, + { + "epoch": 0.3001558371163358, + "grad_norm": 7.723897933959961, + "learning_rate": 9.496337873913342e-05, + "loss": 0.9113, + "step": 4430 + }, + { + "epoch": 0.3002235923843079, + "grad_norm": 10.261975288391113, + "learning_rate": 9.49620097200356e-05, + "loss": 1.242, + "step": 4431 + }, + { + "epoch": 0.30029134765227994, + "grad_norm": 7.396425247192383, + "learning_rate": 9.496064070093779e-05, + "loss": 1.1045, + "step": 4432 + }, + { + "epoch": 0.30035910292025203, + "grad_norm": 9.235404014587402, + "learning_rate": 9.495927168183997e-05, + "loss": 1.1058, + "step": 4433 + }, + { + "epoch": 0.3004268581882241, + "grad_norm": 6.465355396270752, + "learning_rate": 9.495790266274215e-05, + "loss": 0.9163, + "step": 4434 + }, + { + "epoch": 0.3004946134561962, + "grad_norm": 6.401477813720703, + "learning_rate": 9.495653364364433e-05, + "loss": 0.8224, + "step": 4435 + }, + { + "epoch": 0.3005623687241683, + "grad_norm": 6.599462985992432, + "learning_rate": 9.495516462454653e-05, + "loss": 0.8104, + "step": 4436 + }, + { + "epoch": 0.3006301239921404, + "grad_norm": 8.553694725036621, + "learning_rate": 9.495379560544871e-05, + "loss": 1.1467, + "step": 4437 + }, + { + "epoch": 0.3006978792601125, + "grad_norm": 6.262206554412842, + "learning_rate": 9.495242658635089e-05, + "loss": 0.7903, + "step": 4438 + }, + { + "epoch": 0.3007656345280846, + "grad_norm": 7.284942150115967, + "learning_rate": 9.495105756725307e-05, + "loss": 1.0585, + "step": 4439 + }, + { + "epoch": 0.3008333897960566, + "grad_norm": 8.880256652832031, + "learning_rate": 9.494968854815525e-05, + "loss": 1.0553, + "step": 4440 + }, + { + "epoch": 0.3009011450640287, + "grad_norm": 9.358749389648438, + "learning_rate": 9.494831952905744e-05, + "loss": 1.1874, + "step": 4441 + }, + { + "epoch": 0.3009689003320008, + "grad_norm": 6.7088470458984375, + "learning_rate": 9.494695050995962e-05, + "loss": 0.8641, + "step": 4442 + }, + { + "epoch": 0.3010366555999729, + "grad_norm": 9.420478820800781, + "learning_rate": 9.49455814908618e-05, + "loss": 0.8829, + "step": 4443 + }, + { + "epoch": 0.301104410867945, + "grad_norm": 8.877934455871582, + "learning_rate": 9.494421247176398e-05, + "loss": 1.1206, + "step": 4444 + }, + { + "epoch": 0.3011721661359171, + "grad_norm": 9.361932754516602, + "learning_rate": 9.494284345266618e-05, + "loss": 0.7993, + "step": 4445 + }, + { + "epoch": 0.30123992140388917, + "grad_norm": 7.4104790687561035, + "learning_rate": 9.494147443356836e-05, + "loss": 1.0655, + "step": 4446 + }, + { + "epoch": 0.30130767667186126, + "grad_norm": 7.0485992431640625, + "learning_rate": 9.494010541447054e-05, + "loss": 0.8878, + "step": 4447 + }, + { + "epoch": 0.30137543193983335, + "grad_norm": 7.162468433380127, + "learning_rate": 9.493873639537272e-05, + "loss": 0.9467, + "step": 4448 + }, + { + "epoch": 0.3014431872078054, + "grad_norm": 9.214662551879883, + "learning_rate": 9.49373673762749e-05, + "loss": 0.9288, + "step": 4449 + }, + { + "epoch": 0.3015109424757775, + "grad_norm": 11.189544677734375, + "learning_rate": 9.493599835717709e-05, + "loss": 0.9933, + "step": 4450 + }, + { + "epoch": 0.30157869774374957, + "grad_norm": 9.398331642150879, + "learning_rate": 9.493462933807927e-05, + "loss": 0.9248, + "step": 4451 + }, + { + "epoch": 0.30164645301172166, + "grad_norm": 8.26975154876709, + "learning_rate": 9.493326031898145e-05, + "loss": 1.0308, + "step": 4452 + }, + { + "epoch": 0.30171420827969375, + "grad_norm": 9.262918472290039, + "learning_rate": 9.493189129988363e-05, + "loss": 0.9367, + "step": 4453 + }, + { + "epoch": 0.30178196354766584, + "grad_norm": 7.079284191131592, + "learning_rate": 9.493052228078581e-05, + "loss": 1.0415, + "step": 4454 + }, + { + "epoch": 0.30184971881563794, + "grad_norm": 9.081875801086426, + "learning_rate": 9.492915326168801e-05, + "loss": 1.0783, + "step": 4455 + }, + { + "epoch": 0.30191747408361, + "grad_norm": 8.476323127746582, + "learning_rate": 9.492778424259019e-05, + "loss": 0.9743, + "step": 4456 + }, + { + "epoch": 0.30198522935158206, + "grad_norm": 7.064181327819824, + "learning_rate": 9.492641522349237e-05, + "loss": 0.7966, + "step": 4457 + }, + { + "epoch": 0.30205298461955415, + "grad_norm": 7.837399959564209, + "learning_rate": 9.492504620439455e-05, + "loss": 0.8757, + "step": 4458 + }, + { + "epoch": 0.30212073988752625, + "grad_norm": 7.2140936851501465, + "learning_rate": 9.492367718529674e-05, + "loss": 1.1334, + "step": 4459 + }, + { + "epoch": 0.30218849515549834, + "grad_norm": 7.836158752441406, + "learning_rate": 9.492230816619892e-05, + "loss": 1.1179, + "step": 4460 + }, + { + "epoch": 0.30225625042347043, + "grad_norm": 8.686471939086914, + "learning_rate": 9.49209391471011e-05, + "loss": 0.9826, + "step": 4461 + }, + { + "epoch": 0.3023240056914425, + "grad_norm": 8.442061424255371, + "learning_rate": 9.491957012800328e-05, + "loss": 1.0392, + "step": 4462 + }, + { + "epoch": 0.3023917609594146, + "grad_norm": 7.024100303649902, + "learning_rate": 9.491820110890546e-05, + "loss": 0.7857, + "step": 4463 + }, + { + "epoch": 0.3024595162273867, + "grad_norm": 7.306611061096191, + "learning_rate": 9.491683208980766e-05, + "loss": 1.0154, + "step": 4464 + }, + { + "epoch": 0.30252727149535874, + "grad_norm": 7.3064470291137695, + "learning_rate": 9.491546307070984e-05, + "loss": 1.0311, + "step": 4465 + }, + { + "epoch": 0.30259502676333083, + "grad_norm": 7.264878749847412, + "learning_rate": 9.491409405161202e-05, + "loss": 0.932, + "step": 4466 + }, + { + "epoch": 0.3026627820313029, + "grad_norm": 7.43487024307251, + "learning_rate": 9.49127250325142e-05, + "loss": 0.8024, + "step": 4467 + }, + { + "epoch": 0.302730537299275, + "grad_norm": 7.710512161254883, + "learning_rate": 9.491135601341639e-05, + "loss": 0.7989, + "step": 4468 + }, + { + "epoch": 0.3027982925672471, + "grad_norm": 7.263044834136963, + "learning_rate": 9.490998699431857e-05, + "loss": 0.9481, + "step": 4469 + }, + { + "epoch": 0.3028660478352192, + "grad_norm": 7.284390449523926, + "learning_rate": 9.490861797522075e-05, + "loss": 0.8008, + "step": 4470 + }, + { + "epoch": 0.3029338031031913, + "grad_norm": 8.490337371826172, + "learning_rate": 9.490724895612295e-05, + "loss": 1.1867, + "step": 4471 + }, + { + "epoch": 0.3030015583711634, + "grad_norm": 6.555050849914551, + "learning_rate": 9.490587993702513e-05, + "loss": 0.8615, + "step": 4472 + }, + { + "epoch": 0.3030693136391354, + "grad_norm": 6.497532367706299, + "learning_rate": 9.490451091792731e-05, + "loss": 0.9643, + "step": 4473 + }, + { + "epoch": 0.3031370689071075, + "grad_norm": 6.556066036224365, + "learning_rate": 9.49031418988295e-05, + "loss": 0.9757, + "step": 4474 + }, + { + "epoch": 0.3032048241750796, + "grad_norm": 6.649857044219971, + "learning_rate": 9.490177287973168e-05, + "loss": 1.0196, + "step": 4475 + }, + { + "epoch": 0.3032725794430517, + "grad_norm": 8.725894927978516, + "learning_rate": 9.490040386063386e-05, + "loss": 1.1662, + "step": 4476 + }, + { + "epoch": 0.3033403347110238, + "grad_norm": 6.520106792449951, + "learning_rate": 9.489903484153604e-05, + "loss": 0.8353, + "step": 4477 + }, + { + "epoch": 0.3034080899789959, + "grad_norm": 6.688689708709717, + "learning_rate": 9.489766582243824e-05, + "loss": 0.8101, + "step": 4478 + }, + { + "epoch": 0.30347584524696797, + "grad_norm": 7.817204475402832, + "learning_rate": 9.489629680334042e-05, + "loss": 0.9354, + "step": 4479 + }, + { + "epoch": 0.30354360051494006, + "grad_norm": 7.536436080932617, + "learning_rate": 9.48949277842426e-05, + "loss": 1.2358, + "step": 4480 + }, + { + "epoch": 0.3036113557829121, + "grad_norm": 8.635372161865234, + "learning_rate": 9.489355876514478e-05, + "loss": 0.8344, + "step": 4481 + }, + { + "epoch": 0.3036791110508842, + "grad_norm": 7.561103343963623, + "learning_rate": 9.489218974604697e-05, + "loss": 0.864, + "step": 4482 + }, + { + "epoch": 0.3037468663188563, + "grad_norm": 8.284873962402344, + "learning_rate": 9.489082072694915e-05, + "loss": 0.8756, + "step": 4483 + }, + { + "epoch": 0.30381462158682837, + "grad_norm": 8.976608276367188, + "learning_rate": 9.488945170785133e-05, + "loss": 1.0523, + "step": 4484 + }, + { + "epoch": 0.30388237685480046, + "grad_norm": 11.891230583190918, + "learning_rate": 9.488808268875351e-05, + "loss": 1.0367, + "step": 4485 + }, + { + "epoch": 0.30395013212277255, + "grad_norm": 8.247873306274414, + "learning_rate": 9.488671366965569e-05, + "loss": 0.8173, + "step": 4486 + }, + { + "epoch": 0.30401788739074465, + "grad_norm": 6.74050760269165, + "learning_rate": 9.488534465055789e-05, + "loss": 0.8192, + "step": 4487 + }, + { + "epoch": 0.30408564265871674, + "grad_norm": 7.035127639770508, + "learning_rate": 9.488397563146007e-05, + "loss": 0.8416, + "step": 4488 + }, + { + "epoch": 0.3041533979266888, + "grad_norm": 7.281885623931885, + "learning_rate": 9.488260661236225e-05, + "loss": 1.072, + "step": 4489 + }, + { + "epoch": 0.30422115319466086, + "grad_norm": 6.236363887786865, + "learning_rate": 9.488123759326443e-05, + "loss": 0.8738, + "step": 4490 + }, + { + "epoch": 0.30428890846263296, + "grad_norm": 8.20182991027832, + "learning_rate": 9.487986857416662e-05, + "loss": 1.1965, + "step": 4491 + }, + { + "epoch": 0.30435666373060505, + "grad_norm": 9.837231636047363, + "learning_rate": 9.48784995550688e-05, + "loss": 0.9125, + "step": 4492 + }, + { + "epoch": 0.30442441899857714, + "grad_norm": 8.810354232788086, + "learning_rate": 9.487713053597098e-05, + "loss": 1.0623, + "step": 4493 + }, + { + "epoch": 0.30449217426654923, + "grad_norm": 6.360396385192871, + "learning_rate": 9.487576151687316e-05, + "loss": 0.8683, + "step": 4494 + }, + { + "epoch": 0.3045599295345213, + "grad_norm": 8.158722877502441, + "learning_rate": 9.487439249777534e-05, + "loss": 0.9562, + "step": 4495 + }, + { + "epoch": 0.3046276848024934, + "grad_norm": 12.142204284667969, + "learning_rate": 9.487302347867754e-05, + "loss": 0.8937, + "step": 4496 + }, + { + "epoch": 0.3046954400704655, + "grad_norm": 8.125468254089355, + "learning_rate": 9.487165445957972e-05, + "loss": 1.0587, + "step": 4497 + }, + { + "epoch": 0.30476319533843754, + "grad_norm": 7.607559680938721, + "learning_rate": 9.48702854404819e-05, + "loss": 0.8296, + "step": 4498 + }, + { + "epoch": 0.30483095060640963, + "grad_norm": 6.430202484130859, + "learning_rate": 9.486891642138408e-05, + "loss": 1.067, + "step": 4499 + }, + { + "epoch": 0.3048987058743817, + "grad_norm": 7.2482805252075195, + "learning_rate": 9.486754740228626e-05, + "loss": 1.0611, + "step": 4500 + }, + { + "epoch": 0.3049664611423538, + "grad_norm": 9.431131362915039, + "learning_rate": 9.486617838318845e-05, + "loss": 0.9665, + "step": 4501 + }, + { + "epoch": 0.3050342164103259, + "grad_norm": 8.319073677062988, + "learning_rate": 9.486480936409063e-05, + "loss": 0.983, + "step": 4502 + }, + { + "epoch": 0.305101971678298, + "grad_norm": 6.784060478210449, + "learning_rate": 9.486344034499281e-05, + "loss": 0.8534, + "step": 4503 + }, + { + "epoch": 0.3051697269462701, + "grad_norm": 7.521092414855957, + "learning_rate": 9.486207132589499e-05, + "loss": 0.9167, + "step": 4504 + }, + { + "epoch": 0.3052374822142422, + "grad_norm": 7.7156982421875, + "learning_rate": 9.486070230679719e-05, + "loss": 0.7691, + "step": 4505 + }, + { + "epoch": 0.3053052374822142, + "grad_norm": 6.727222919464111, + "learning_rate": 9.485933328769937e-05, + "loss": 0.7156, + "step": 4506 + }, + { + "epoch": 0.3053729927501863, + "grad_norm": 7.416358470916748, + "learning_rate": 9.485796426860155e-05, + "loss": 1.0458, + "step": 4507 + }, + { + "epoch": 0.3054407480181584, + "grad_norm": 8.601702690124512, + "learning_rate": 9.485659524950373e-05, + "loss": 1.1664, + "step": 4508 + }, + { + "epoch": 0.3055085032861305, + "grad_norm": 6.742228984832764, + "learning_rate": 9.485522623040591e-05, + "loss": 0.9715, + "step": 4509 + }, + { + "epoch": 0.3055762585541026, + "grad_norm": 6.782869815826416, + "learning_rate": 9.48538572113081e-05, + "loss": 0.7644, + "step": 4510 + }, + { + "epoch": 0.3056440138220747, + "grad_norm": 5.753969669342041, + "learning_rate": 9.485248819221028e-05, + "loss": 0.8468, + "step": 4511 + }, + { + "epoch": 0.30571176909004677, + "grad_norm": 10.150800704956055, + "learning_rate": 9.485111917311246e-05, + "loss": 1.1521, + "step": 4512 + }, + { + "epoch": 0.30577952435801886, + "grad_norm": 7.476456642150879, + "learning_rate": 9.484975015401464e-05, + "loss": 1.0091, + "step": 4513 + }, + { + "epoch": 0.3058472796259909, + "grad_norm": 8.955781936645508, + "learning_rate": 9.484838113491684e-05, + "loss": 1.1837, + "step": 4514 + }, + { + "epoch": 0.305915034893963, + "grad_norm": 8.779487609863281, + "learning_rate": 9.484701211581902e-05, + "loss": 0.9747, + "step": 4515 + }, + { + "epoch": 0.3059827901619351, + "grad_norm": 9.432082176208496, + "learning_rate": 9.48456430967212e-05, + "loss": 0.9298, + "step": 4516 + }, + { + "epoch": 0.3060505454299072, + "grad_norm": 8.186026573181152, + "learning_rate": 9.484427407762339e-05, + "loss": 0.9026, + "step": 4517 + }, + { + "epoch": 0.30611830069787926, + "grad_norm": 7.902477264404297, + "learning_rate": 9.484290505852557e-05, + "loss": 1.0116, + "step": 4518 + }, + { + "epoch": 0.30618605596585136, + "grad_norm": 9.117185592651367, + "learning_rate": 9.484153603942775e-05, + "loss": 1.0867, + "step": 4519 + }, + { + "epoch": 0.30625381123382345, + "grad_norm": 8.473755836486816, + "learning_rate": 9.484016702032995e-05, + "loss": 0.8742, + "step": 4520 + }, + { + "epoch": 0.30632156650179554, + "grad_norm": 12.328755378723145, + "learning_rate": 9.483879800123213e-05, + "loss": 1.2981, + "step": 4521 + }, + { + "epoch": 0.3063893217697676, + "grad_norm": 7.099585056304932, + "learning_rate": 9.483742898213431e-05, + "loss": 0.9685, + "step": 4522 + }, + { + "epoch": 0.30645707703773967, + "grad_norm": 8.027973175048828, + "learning_rate": 9.48360599630365e-05, + "loss": 1.0194, + "step": 4523 + }, + { + "epoch": 0.30652483230571176, + "grad_norm": 8.275866508483887, + "learning_rate": 9.483469094393868e-05, + "loss": 1.2424, + "step": 4524 + }, + { + "epoch": 0.30659258757368385, + "grad_norm": 8.546550750732422, + "learning_rate": 9.483332192484086e-05, + "loss": 0.9284, + "step": 4525 + }, + { + "epoch": 0.30666034284165594, + "grad_norm": 5.6628522872924805, + "learning_rate": 9.483195290574304e-05, + "loss": 0.882, + "step": 4526 + }, + { + "epoch": 0.30672809810962803, + "grad_norm": 7.189582824707031, + "learning_rate": 9.483058388664522e-05, + "loss": 0.8087, + "step": 4527 + }, + { + "epoch": 0.3067958533776001, + "grad_norm": 6.341355323791504, + "learning_rate": 9.482921486754742e-05, + "loss": 0.8759, + "step": 4528 + }, + { + "epoch": 0.3068636086455722, + "grad_norm": 6.838935852050781, + "learning_rate": 9.48278458484496e-05, + "loss": 0.6662, + "step": 4529 + }, + { + "epoch": 0.30693136391354425, + "grad_norm": 5.441661834716797, + "learning_rate": 9.482647682935178e-05, + "loss": 0.7008, + "step": 4530 + }, + { + "epoch": 0.30699911918151634, + "grad_norm": 9.242436408996582, + "learning_rate": 9.482510781025396e-05, + "loss": 0.8675, + "step": 4531 + }, + { + "epoch": 0.30706687444948844, + "grad_norm": 7.393209934234619, + "learning_rate": 9.482373879115614e-05, + "loss": 0.7829, + "step": 4532 + }, + { + "epoch": 0.30713462971746053, + "grad_norm": 8.613391876220703, + "learning_rate": 9.482236977205833e-05, + "loss": 1.0276, + "step": 4533 + }, + { + "epoch": 0.3072023849854326, + "grad_norm": 8.517586708068848, + "learning_rate": 9.482100075296051e-05, + "loss": 0.8115, + "step": 4534 + }, + { + "epoch": 0.3072701402534047, + "grad_norm": 7.575396537780762, + "learning_rate": 9.481963173386269e-05, + "loss": 0.8254, + "step": 4535 + }, + { + "epoch": 0.3073378955213768, + "grad_norm": 8.90748405456543, + "learning_rate": 9.481826271476487e-05, + "loss": 0.9864, + "step": 4536 + }, + { + "epoch": 0.3074056507893489, + "grad_norm": 10.321487426757812, + "learning_rate": 9.481689369566707e-05, + "loss": 1.0247, + "step": 4537 + }, + { + "epoch": 0.30747340605732093, + "grad_norm": 8.15263843536377, + "learning_rate": 9.481552467656925e-05, + "loss": 0.8475, + "step": 4538 + }, + { + "epoch": 0.307541161325293, + "grad_norm": 7.198974609375, + "learning_rate": 9.481415565747143e-05, + "loss": 1.1028, + "step": 4539 + }, + { + "epoch": 0.3076089165932651, + "grad_norm": 8.286428451538086, + "learning_rate": 9.481278663837361e-05, + "loss": 0.9451, + "step": 4540 + }, + { + "epoch": 0.3076766718612372, + "grad_norm": 7.620658874511719, + "learning_rate": 9.481141761927579e-05, + "loss": 0.8243, + "step": 4541 + }, + { + "epoch": 0.3077444271292093, + "grad_norm": 7.8145341873168945, + "learning_rate": 9.481004860017798e-05, + "loss": 0.7741, + "step": 4542 + }, + { + "epoch": 0.3078121823971814, + "grad_norm": 7.216073989868164, + "learning_rate": 9.480867958108016e-05, + "loss": 0.7973, + "step": 4543 + }, + { + "epoch": 0.3078799376651535, + "grad_norm": 7.704121112823486, + "learning_rate": 9.480731056198234e-05, + "loss": 0.9388, + "step": 4544 + }, + { + "epoch": 0.30794769293312557, + "grad_norm": 7.872618675231934, + "learning_rate": 9.480594154288452e-05, + "loss": 0.8628, + "step": 4545 + }, + { + "epoch": 0.3080154482010976, + "grad_norm": 5.696948528289795, + "learning_rate": 9.480457252378672e-05, + "loss": 0.8319, + "step": 4546 + }, + { + "epoch": 0.3080832034690697, + "grad_norm": 7.372182846069336, + "learning_rate": 9.48032035046889e-05, + "loss": 0.9198, + "step": 4547 + }, + { + "epoch": 0.3081509587370418, + "grad_norm": 6.9523420333862305, + "learning_rate": 9.480183448559108e-05, + "loss": 1.0383, + "step": 4548 + }, + { + "epoch": 0.3082187140050139, + "grad_norm": 7.862281799316406, + "learning_rate": 9.480046546649326e-05, + "loss": 0.9764, + "step": 4549 + }, + { + "epoch": 0.308286469272986, + "grad_norm": 6.563671588897705, + "learning_rate": 9.479909644739544e-05, + "loss": 0.9587, + "step": 4550 + }, + { + "epoch": 0.30835422454095807, + "grad_norm": 6.150197505950928, + "learning_rate": 9.479772742829763e-05, + "loss": 0.8265, + "step": 4551 + }, + { + "epoch": 0.30842197980893016, + "grad_norm": 5.330137729644775, + "learning_rate": 9.479635840919981e-05, + "loss": 0.697, + "step": 4552 + }, + { + "epoch": 0.30848973507690225, + "grad_norm": 7.886617183685303, + "learning_rate": 9.479498939010199e-05, + "loss": 0.8645, + "step": 4553 + }, + { + "epoch": 0.30855749034487434, + "grad_norm": 8.9354248046875, + "learning_rate": 9.479362037100417e-05, + "loss": 0.9669, + "step": 4554 + }, + { + "epoch": 0.3086252456128464, + "grad_norm": 6.7398457527160645, + "learning_rate": 9.479225135190635e-05, + "loss": 0.6935, + "step": 4555 + }, + { + "epoch": 0.30869300088081847, + "grad_norm": 6.668737888336182, + "learning_rate": 9.479088233280855e-05, + "loss": 1.0038, + "step": 4556 + }, + { + "epoch": 0.30876075614879056, + "grad_norm": 9.82985782623291, + "learning_rate": 9.478951331371073e-05, + "loss": 1.1031, + "step": 4557 + }, + { + "epoch": 0.30882851141676265, + "grad_norm": 7.345351696014404, + "learning_rate": 9.478814429461291e-05, + "loss": 1.032, + "step": 4558 + }, + { + "epoch": 0.30889626668473474, + "grad_norm": 8.863839149475098, + "learning_rate": 9.478677527551509e-05, + "loss": 1.0042, + "step": 4559 + }, + { + "epoch": 0.30896402195270684, + "grad_norm": 7.280172824859619, + "learning_rate": 9.478540625641728e-05, + "loss": 0.977, + "step": 4560 + }, + { + "epoch": 0.3090317772206789, + "grad_norm": 7.121008396148682, + "learning_rate": 9.478403723731946e-05, + "loss": 0.7969, + "step": 4561 + }, + { + "epoch": 0.309099532488651, + "grad_norm": 8.222799301147461, + "learning_rate": 9.478266821822164e-05, + "loss": 1.0655, + "step": 4562 + }, + { + "epoch": 0.30916728775662305, + "grad_norm": 7.74297571182251, + "learning_rate": 9.478129919912384e-05, + "loss": 0.7976, + "step": 4563 + }, + { + "epoch": 0.30923504302459515, + "grad_norm": 8.211404800415039, + "learning_rate": 9.477993018002602e-05, + "loss": 0.9004, + "step": 4564 + }, + { + "epoch": 0.30930279829256724, + "grad_norm": 6.897336006164551, + "learning_rate": 9.47785611609282e-05, + "loss": 0.9347, + "step": 4565 + }, + { + "epoch": 0.30937055356053933, + "grad_norm": 6.508474826812744, + "learning_rate": 9.477719214183039e-05, + "loss": 0.862, + "step": 4566 + }, + { + "epoch": 0.3094383088285114, + "grad_norm": 6.972619533538818, + "learning_rate": 9.477582312273257e-05, + "loss": 0.8142, + "step": 4567 + }, + { + "epoch": 0.3095060640964835, + "grad_norm": 8.018939971923828, + "learning_rate": 9.477445410363475e-05, + "loss": 1.1858, + "step": 4568 + }, + { + "epoch": 0.3095738193644556, + "grad_norm": 7.123754024505615, + "learning_rate": 9.477308508453695e-05, + "loss": 0.9347, + "step": 4569 + }, + { + "epoch": 0.3096415746324277, + "grad_norm": 7.701295375823975, + "learning_rate": 9.477171606543913e-05, + "loss": 1.0511, + "step": 4570 + }, + { + "epoch": 0.30970932990039973, + "grad_norm": 7.858259677886963, + "learning_rate": 9.47703470463413e-05, + "loss": 0.8332, + "step": 4571 + }, + { + "epoch": 0.3097770851683718, + "grad_norm": 9.541449546813965, + "learning_rate": 9.476897802724349e-05, + "loss": 1.3054, + "step": 4572 + }, + { + "epoch": 0.3098448404363439, + "grad_norm": 7.877323150634766, + "learning_rate": 9.476760900814567e-05, + "loss": 0.9087, + "step": 4573 + }, + { + "epoch": 0.309912595704316, + "grad_norm": 6.667144775390625, + "learning_rate": 9.476623998904786e-05, + "loss": 0.9066, + "step": 4574 + }, + { + "epoch": 0.3099803509722881, + "grad_norm": 6.78439998626709, + "learning_rate": 9.476487096995004e-05, + "loss": 0.9524, + "step": 4575 + }, + { + "epoch": 0.3100481062402602, + "grad_norm": 6.174951076507568, + "learning_rate": 9.476350195085222e-05, + "loss": 0.9853, + "step": 4576 + }, + { + "epoch": 0.3101158615082323, + "grad_norm": 7.872679233551025, + "learning_rate": 9.47621329317544e-05, + "loss": 1.2686, + "step": 4577 + }, + { + "epoch": 0.3101836167762044, + "grad_norm": 6.736785411834717, + "learning_rate": 9.47607639126566e-05, + "loss": 0.8149, + "step": 4578 + }, + { + "epoch": 0.3102513720441764, + "grad_norm": 7.353178977966309, + "learning_rate": 9.475939489355878e-05, + "loss": 0.7092, + "step": 4579 + }, + { + "epoch": 0.3103191273121485, + "grad_norm": 8.10339069366455, + "learning_rate": 9.475802587446096e-05, + "loss": 1.1926, + "step": 4580 + }, + { + "epoch": 0.3103868825801206, + "grad_norm": 7.225834369659424, + "learning_rate": 9.475665685536314e-05, + "loss": 1.1534, + "step": 4581 + }, + { + "epoch": 0.3104546378480927, + "grad_norm": 7.356152057647705, + "learning_rate": 9.475528783626532e-05, + "loss": 1.1098, + "step": 4582 + }, + { + "epoch": 0.3105223931160648, + "grad_norm": 10.347126960754395, + "learning_rate": 9.475391881716751e-05, + "loss": 1.136, + "step": 4583 + }, + { + "epoch": 0.31059014838403687, + "grad_norm": 7.562332630157471, + "learning_rate": 9.475254979806969e-05, + "loss": 0.9211, + "step": 4584 + }, + { + "epoch": 0.31065790365200896, + "grad_norm": 6.505655288696289, + "learning_rate": 9.475118077897187e-05, + "loss": 0.9538, + "step": 4585 + }, + { + "epoch": 0.31072565891998105, + "grad_norm": 6.551809787750244, + "learning_rate": 9.474981175987405e-05, + "loss": 1.1335, + "step": 4586 + }, + { + "epoch": 0.3107934141879531, + "grad_norm": 7.4724321365356445, + "learning_rate": 9.474844274077623e-05, + "loss": 1.0441, + "step": 4587 + }, + { + "epoch": 0.3108611694559252, + "grad_norm": 8.463167190551758, + "learning_rate": 9.474707372167843e-05, + "loss": 1.0979, + "step": 4588 + }, + { + "epoch": 0.31092892472389727, + "grad_norm": 6.868551731109619, + "learning_rate": 9.47457047025806e-05, + "loss": 0.6893, + "step": 4589 + }, + { + "epoch": 0.31099667999186936, + "grad_norm": 5.831715106964111, + "learning_rate": 9.474433568348279e-05, + "loss": 0.9197, + "step": 4590 + }, + { + "epoch": 0.31106443525984145, + "grad_norm": 7.562331676483154, + "learning_rate": 9.474296666438497e-05, + "loss": 0.8768, + "step": 4591 + }, + { + "epoch": 0.31113219052781355, + "grad_norm": 7.750473499298096, + "learning_rate": 9.474159764528716e-05, + "loss": 0.8046, + "step": 4592 + }, + { + "epoch": 0.31119994579578564, + "grad_norm": 7.836174964904785, + "learning_rate": 9.474022862618934e-05, + "loss": 0.9346, + "step": 4593 + }, + { + "epoch": 0.31126770106375773, + "grad_norm": 6.159252166748047, + "learning_rate": 9.473885960709152e-05, + "loss": 0.8661, + "step": 4594 + }, + { + "epoch": 0.31133545633172977, + "grad_norm": 7.959331512451172, + "learning_rate": 9.47374905879937e-05, + "loss": 0.919, + "step": 4595 + }, + { + "epoch": 0.31140321159970186, + "grad_norm": 8.349802017211914, + "learning_rate": 9.473612156889588e-05, + "loss": 0.9762, + "step": 4596 + }, + { + "epoch": 0.31147096686767395, + "grad_norm": 6.385254859924316, + "learning_rate": 9.473475254979808e-05, + "loss": 1.0168, + "step": 4597 + }, + { + "epoch": 0.31153872213564604, + "grad_norm": 6.848907947540283, + "learning_rate": 9.473338353070026e-05, + "loss": 0.8235, + "step": 4598 + }, + { + "epoch": 0.31160647740361813, + "grad_norm": 8.07978630065918, + "learning_rate": 9.473201451160244e-05, + "loss": 0.8394, + "step": 4599 + }, + { + "epoch": 0.3116742326715902, + "grad_norm": 8.702666282653809, + "learning_rate": 9.473064549250462e-05, + "loss": 1.1859, + "step": 4600 + }, + { + "epoch": 0.3117419879395623, + "grad_norm": 7.322251796722412, + "learning_rate": 9.472927647340681e-05, + "loss": 0.7089, + "step": 4601 + }, + { + "epoch": 0.3118097432075344, + "grad_norm": 7.717091083526611, + "learning_rate": 9.472790745430899e-05, + "loss": 0.8609, + "step": 4602 + }, + { + "epoch": 0.3118774984755065, + "grad_norm": 7.298448085784912, + "learning_rate": 9.472653843521117e-05, + "loss": 1.0044, + "step": 4603 + }, + { + "epoch": 0.31194525374347853, + "grad_norm": 8.221037864685059, + "learning_rate": 9.472516941611335e-05, + "loss": 1.0403, + "step": 4604 + }, + { + "epoch": 0.3120130090114506, + "grad_norm": 8.83371639251709, + "learning_rate": 9.472380039701553e-05, + "loss": 1.2884, + "step": 4605 + }, + { + "epoch": 0.3120807642794227, + "grad_norm": 6.774711608886719, + "learning_rate": 9.472243137791773e-05, + "loss": 0.8077, + "step": 4606 + }, + { + "epoch": 0.3121485195473948, + "grad_norm": 8.470376968383789, + "learning_rate": 9.472106235881991e-05, + "loss": 0.9549, + "step": 4607 + }, + { + "epoch": 0.3122162748153669, + "grad_norm": 6.509616374969482, + "learning_rate": 9.471969333972209e-05, + "loss": 0.9595, + "step": 4608 + }, + { + "epoch": 0.312284030083339, + "grad_norm": 6.885564804077148, + "learning_rate": 9.471832432062427e-05, + "loss": 0.9934, + "step": 4609 + }, + { + "epoch": 0.3123517853513111, + "grad_norm": 12.500927925109863, + "learning_rate": 9.471695530152646e-05, + "loss": 0.9493, + "step": 4610 + }, + { + "epoch": 0.3124195406192832, + "grad_norm": 8.701812744140625, + "learning_rate": 9.471558628242864e-05, + "loss": 0.9975, + "step": 4611 + }, + { + "epoch": 0.3124872958872552, + "grad_norm": 8.313292503356934, + "learning_rate": 9.471421726333082e-05, + "loss": 1.0069, + "step": 4612 + }, + { + "epoch": 0.3125550511552273, + "grad_norm": 6.841222763061523, + "learning_rate": 9.471284824423302e-05, + "loss": 0.8078, + "step": 4613 + }, + { + "epoch": 0.3126228064231994, + "grad_norm": 9.038453102111816, + "learning_rate": 9.47114792251352e-05, + "loss": 1.0445, + "step": 4614 + }, + { + "epoch": 0.3126905616911715, + "grad_norm": 7.772367000579834, + "learning_rate": 9.471011020603738e-05, + "loss": 1.0336, + "step": 4615 + }, + { + "epoch": 0.3127583169591436, + "grad_norm": 6.843810558319092, + "learning_rate": 9.470874118693957e-05, + "loss": 0.8171, + "step": 4616 + }, + { + "epoch": 0.31282607222711567, + "grad_norm": 7.012472629547119, + "learning_rate": 9.470737216784175e-05, + "loss": 0.8906, + "step": 4617 + }, + { + "epoch": 0.31289382749508776, + "grad_norm": 7.138260364532471, + "learning_rate": 9.470600314874393e-05, + "loss": 0.9742, + "step": 4618 + }, + { + "epoch": 0.31296158276305985, + "grad_norm": 7.92933988571167, + "learning_rate": 9.470463412964611e-05, + "loss": 0.8097, + "step": 4619 + }, + { + "epoch": 0.3130293380310319, + "grad_norm": 8.222073554992676, + "learning_rate": 9.47032651105483e-05, + "loss": 0.9354, + "step": 4620 + }, + { + "epoch": 0.313097093299004, + "grad_norm": 7.705626010894775, + "learning_rate": 9.470189609145049e-05, + "loss": 0.7776, + "step": 4621 + }, + { + "epoch": 0.3131648485669761, + "grad_norm": 7.6136345863342285, + "learning_rate": 9.470052707235267e-05, + "loss": 1.0204, + "step": 4622 + }, + { + "epoch": 0.31323260383494816, + "grad_norm": 6.365856647491455, + "learning_rate": 9.469915805325485e-05, + "loss": 0.8812, + "step": 4623 + }, + { + "epoch": 0.31330035910292026, + "grad_norm": 6.876771926879883, + "learning_rate": 9.469778903415704e-05, + "loss": 0.7518, + "step": 4624 + }, + { + "epoch": 0.31336811437089235, + "grad_norm": 6.329056262969971, + "learning_rate": 9.469642001505922e-05, + "loss": 0.8902, + "step": 4625 + }, + { + "epoch": 0.31343586963886444, + "grad_norm": 6.1461310386657715, + "learning_rate": 9.46950509959614e-05, + "loss": 1.0617, + "step": 4626 + }, + { + "epoch": 0.31350362490683653, + "grad_norm": 7.062450408935547, + "learning_rate": 9.469368197686358e-05, + "loss": 0.9081, + "step": 4627 + }, + { + "epoch": 0.31357138017480857, + "grad_norm": 7.412983417510986, + "learning_rate": 9.469231295776576e-05, + "loss": 1.0845, + "step": 4628 + }, + { + "epoch": 0.31363913544278066, + "grad_norm": 7.904543876647949, + "learning_rate": 9.469094393866796e-05, + "loss": 0.803, + "step": 4629 + }, + { + "epoch": 0.31370689071075275, + "grad_norm": 6.614920616149902, + "learning_rate": 9.468957491957014e-05, + "loss": 0.8369, + "step": 4630 + }, + { + "epoch": 0.31377464597872484, + "grad_norm": 8.245738983154297, + "learning_rate": 9.468820590047232e-05, + "loss": 0.9232, + "step": 4631 + }, + { + "epoch": 0.31384240124669693, + "grad_norm": 7.923001766204834, + "learning_rate": 9.46868368813745e-05, + "loss": 0.822, + "step": 4632 + }, + { + "epoch": 0.313910156514669, + "grad_norm": 8.80931282043457, + "learning_rate": 9.468546786227668e-05, + "loss": 0.9253, + "step": 4633 + }, + { + "epoch": 0.3139779117826411, + "grad_norm": 6.997625350952148, + "learning_rate": 9.468409884317887e-05, + "loss": 0.8564, + "step": 4634 + }, + { + "epoch": 0.3140456670506132, + "grad_norm": 7.699014186859131, + "learning_rate": 9.468272982408105e-05, + "loss": 1.0097, + "step": 4635 + }, + { + "epoch": 0.31411342231858524, + "grad_norm": 7.379316329956055, + "learning_rate": 9.468136080498323e-05, + "loss": 0.8455, + "step": 4636 + }, + { + "epoch": 0.31418117758655734, + "grad_norm": 7.040482521057129, + "learning_rate": 9.467999178588541e-05, + "loss": 0.9065, + "step": 4637 + }, + { + "epoch": 0.31424893285452943, + "grad_norm": 8.588302612304688, + "learning_rate": 9.46786227667876e-05, + "loss": 0.8063, + "step": 4638 + }, + { + "epoch": 0.3143166881225015, + "grad_norm": 8.310260772705078, + "learning_rate": 9.467725374768979e-05, + "loss": 0.9012, + "step": 4639 + }, + { + "epoch": 0.3143844433904736, + "grad_norm": 5.794011116027832, + "learning_rate": 9.467588472859197e-05, + "loss": 0.7026, + "step": 4640 + }, + { + "epoch": 0.3144521986584457, + "grad_norm": 8.277170181274414, + "learning_rate": 9.467451570949415e-05, + "loss": 0.8472, + "step": 4641 + }, + { + "epoch": 0.3145199539264178, + "grad_norm": 8.856977462768555, + "learning_rate": 9.467314669039633e-05, + "loss": 1.1543, + "step": 4642 + }, + { + "epoch": 0.3145877091943899, + "grad_norm": 7.343659400939941, + "learning_rate": 9.467177767129852e-05, + "loss": 0.8366, + "step": 4643 + }, + { + "epoch": 0.3146554644623619, + "grad_norm": 6.619773864746094, + "learning_rate": 9.46704086522007e-05, + "loss": 0.9176, + "step": 4644 + }, + { + "epoch": 0.314723219730334, + "grad_norm": 8.375277519226074, + "learning_rate": 9.466903963310288e-05, + "loss": 1.0437, + "step": 4645 + }, + { + "epoch": 0.3147909749983061, + "grad_norm": 6.792671203613281, + "learning_rate": 9.466767061400506e-05, + "loss": 0.7271, + "step": 4646 + }, + { + "epoch": 0.3148587302662782, + "grad_norm": 7.419206619262695, + "learning_rate": 9.466630159490726e-05, + "loss": 0.9376, + "step": 4647 + }, + { + "epoch": 0.3149264855342503, + "grad_norm": 6.413398742675781, + "learning_rate": 9.466493257580944e-05, + "loss": 0.8988, + "step": 4648 + }, + { + "epoch": 0.3149942408022224, + "grad_norm": 7.8477373123168945, + "learning_rate": 9.466356355671162e-05, + "loss": 1.0791, + "step": 4649 + }, + { + "epoch": 0.31506199607019447, + "grad_norm": 7.070709705352783, + "learning_rate": 9.46621945376138e-05, + "loss": 0.7796, + "step": 4650 + }, + { + "epoch": 0.31512975133816656, + "grad_norm": 7.023820400238037, + "learning_rate": 9.466082551851598e-05, + "loss": 0.928, + "step": 4651 + }, + { + "epoch": 0.3151975066061386, + "grad_norm": 8.347658157348633, + "learning_rate": 9.465945649941817e-05, + "loss": 0.8768, + "step": 4652 + }, + { + "epoch": 0.3152652618741107, + "grad_norm": 7.446930408477783, + "learning_rate": 9.465808748032035e-05, + "loss": 0.7215, + "step": 4653 + }, + { + "epoch": 0.3153330171420828, + "grad_norm": 9.261749267578125, + "learning_rate": 9.465671846122253e-05, + "loss": 1.0612, + "step": 4654 + }, + { + "epoch": 0.3154007724100549, + "grad_norm": 6.765881061553955, + "learning_rate": 9.465534944212471e-05, + "loss": 0.9636, + "step": 4655 + }, + { + "epoch": 0.31546852767802697, + "grad_norm": 7.752283096313477, + "learning_rate": 9.46539804230269e-05, + "loss": 1.0348, + "step": 4656 + }, + { + "epoch": 0.31553628294599906, + "grad_norm": 8.769716262817383, + "learning_rate": 9.465261140392909e-05, + "loss": 0.9147, + "step": 4657 + }, + { + "epoch": 0.31560403821397115, + "grad_norm": 9.27787971496582, + "learning_rate": 9.465124238483127e-05, + "loss": 1.0287, + "step": 4658 + }, + { + "epoch": 0.31567179348194324, + "grad_norm": 5.802475452423096, + "learning_rate": 9.464987336573346e-05, + "loss": 0.7968, + "step": 4659 + }, + { + "epoch": 0.31573954874991533, + "grad_norm": 7.436465263366699, + "learning_rate": 9.464850434663564e-05, + "loss": 1.1087, + "step": 4660 + }, + { + "epoch": 0.31580730401788737, + "grad_norm": 8.243688583374023, + "learning_rate": 9.464713532753782e-05, + "loss": 1.0257, + "step": 4661 + }, + { + "epoch": 0.31587505928585946, + "grad_norm": 7.885406017303467, + "learning_rate": 9.464576630844002e-05, + "loss": 0.9944, + "step": 4662 + }, + { + "epoch": 0.31594281455383155, + "grad_norm": 8.342294692993164, + "learning_rate": 9.46443972893422e-05, + "loss": 0.8351, + "step": 4663 + }, + { + "epoch": 0.31601056982180364, + "grad_norm": 7.476551055908203, + "learning_rate": 9.464302827024438e-05, + "loss": 0.8484, + "step": 4664 + }, + { + "epoch": 0.31607832508977574, + "grad_norm": 7.239434719085693, + "learning_rate": 9.464165925114656e-05, + "loss": 0.749, + "step": 4665 + }, + { + "epoch": 0.3161460803577478, + "grad_norm": 8.02225399017334, + "learning_rate": 9.464029023204875e-05, + "loss": 1.0679, + "step": 4666 + }, + { + "epoch": 0.3162138356257199, + "grad_norm": 5.708010673522949, + "learning_rate": 9.463892121295093e-05, + "loss": 0.6935, + "step": 4667 + }, + { + "epoch": 0.316281590893692, + "grad_norm": 9.107060432434082, + "learning_rate": 9.463755219385311e-05, + "loss": 0.7951, + "step": 4668 + }, + { + "epoch": 0.31634934616166405, + "grad_norm": 8.045939445495605, + "learning_rate": 9.463618317475529e-05, + "loss": 0.9943, + "step": 4669 + }, + { + "epoch": 0.31641710142963614, + "grad_norm": 8.284626007080078, + "learning_rate": 9.463481415565749e-05, + "loss": 0.9638, + "step": 4670 + }, + { + "epoch": 0.31648485669760823, + "grad_norm": 6.9800920486450195, + "learning_rate": 9.463344513655967e-05, + "loss": 0.9211, + "step": 4671 + }, + { + "epoch": 0.3165526119655803, + "grad_norm": 7.476776599884033, + "learning_rate": 9.463207611746185e-05, + "loss": 0.8704, + "step": 4672 + }, + { + "epoch": 0.3166203672335524, + "grad_norm": 6.303709506988525, + "learning_rate": 9.463070709836403e-05, + "loss": 0.9031, + "step": 4673 + }, + { + "epoch": 0.3166881225015245, + "grad_norm": 8.267644882202148, + "learning_rate": 9.46293380792662e-05, + "loss": 1.0899, + "step": 4674 + }, + { + "epoch": 0.3167558777694966, + "grad_norm": 8.25999641418457, + "learning_rate": 9.46279690601684e-05, + "loss": 0.9286, + "step": 4675 + }, + { + "epoch": 0.3168236330374687, + "grad_norm": 8.577073097229004, + "learning_rate": 9.462660004107058e-05, + "loss": 0.8837, + "step": 4676 + }, + { + "epoch": 0.3168913883054407, + "grad_norm": 7.183437824249268, + "learning_rate": 9.462523102197276e-05, + "loss": 0.9972, + "step": 4677 + }, + { + "epoch": 0.3169591435734128, + "grad_norm": 7.93414831161499, + "learning_rate": 9.462386200287494e-05, + "loss": 0.9806, + "step": 4678 + }, + { + "epoch": 0.3170268988413849, + "grad_norm": 8.037418365478516, + "learning_rate": 9.462249298377714e-05, + "loss": 0.8945, + "step": 4679 + }, + { + "epoch": 0.317094654109357, + "grad_norm": 7.5245585441589355, + "learning_rate": 9.462112396467932e-05, + "loss": 0.9444, + "step": 4680 + }, + { + "epoch": 0.3171624093773291, + "grad_norm": 7.428105354309082, + "learning_rate": 9.46197549455815e-05, + "loss": 0.9916, + "step": 4681 + }, + { + "epoch": 0.3172301646453012, + "grad_norm": 7.977080345153809, + "learning_rate": 9.461838592648368e-05, + "loss": 1.2206, + "step": 4682 + }, + { + "epoch": 0.3172979199132733, + "grad_norm": 6.285130023956299, + "learning_rate": 9.461701690738586e-05, + "loss": 0.8222, + "step": 4683 + }, + { + "epoch": 0.31736567518124537, + "grad_norm": 6.9027018547058105, + "learning_rate": 9.461564788828805e-05, + "loss": 1.0265, + "step": 4684 + }, + { + "epoch": 0.3174334304492174, + "grad_norm": 7.620064735412598, + "learning_rate": 9.461427886919023e-05, + "loss": 0.9672, + "step": 4685 + }, + { + "epoch": 0.3175011857171895, + "grad_norm": 7.635505676269531, + "learning_rate": 9.461290985009241e-05, + "loss": 0.9329, + "step": 4686 + }, + { + "epoch": 0.3175689409851616, + "grad_norm": 7.532449245452881, + "learning_rate": 9.461154083099459e-05, + "loss": 1.0509, + "step": 4687 + }, + { + "epoch": 0.3176366962531337, + "grad_norm": 6.428747177124023, + "learning_rate": 9.461017181189677e-05, + "loss": 0.9151, + "step": 4688 + }, + { + "epoch": 0.31770445152110577, + "grad_norm": 7.214437007904053, + "learning_rate": 9.460880279279897e-05, + "loss": 0.9758, + "step": 4689 + }, + { + "epoch": 0.31777220678907786, + "grad_norm": 7.525144577026367, + "learning_rate": 9.460743377370115e-05, + "loss": 0.7723, + "step": 4690 + }, + { + "epoch": 0.31783996205704995, + "grad_norm": 8.008615493774414, + "learning_rate": 9.460606475460333e-05, + "loss": 0.9945, + "step": 4691 + }, + { + "epoch": 0.31790771732502204, + "grad_norm": 6.715225696563721, + "learning_rate": 9.460469573550551e-05, + "loss": 0.9805, + "step": 4692 + }, + { + "epoch": 0.3179754725929941, + "grad_norm": 7.146914005279541, + "learning_rate": 9.46033267164077e-05, + "loss": 0.9583, + "step": 4693 + }, + { + "epoch": 0.31804322786096617, + "grad_norm": 7.280580997467041, + "learning_rate": 9.460195769730988e-05, + "loss": 0.9772, + "step": 4694 + }, + { + "epoch": 0.31811098312893826, + "grad_norm": 7.84500789642334, + "learning_rate": 9.460058867821206e-05, + "loss": 0.9755, + "step": 4695 + }, + { + "epoch": 0.31817873839691035, + "grad_norm": 7.289769649505615, + "learning_rate": 9.459921965911424e-05, + "loss": 1.1441, + "step": 4696 + }, + { + "epoch": 0.31824649366488245, + "grad_norm": 9.061880111694336, + "learning_rate": 9.459785064001642e-05, + "loss": 0.9699, + "step": 4697 + }, + { + "epoch": 0.31831424893285454, + "grad_norm": 7.284442901611328, + "learning_rate": 9.459648162091862e-05, + "loss": 1.0023, + "step": 4698 + }, + { + "epoch": 0.31838200420082663, + "grad_norm": 7.017299175262451, + "learning_rate": 9.45951126018208e-05, + "loss": 0.8598, + "step": 4699 + }, + { + "epoch": 0.3184497594687987, + "grad_norm": 7.236936092376709, + "learning_rate": 9.459374358272298e-05, + "loss": 0.9888, + "step": 4700 + }, + { + "epoch": 0.31851751473677076, + "grad_norm": 8.148765563964844, + "learning_rate": 9.459237456362516e-05, + "loss": 0.9978, + "step": 4701 + }, + { + "epoch": 0.31858527000474285, + "grad_norm": 8.023640632629395, + "learning_rate": 9.459100554452735e-05, + "loss": 1.0941, + "step": 4702 + }, + { + "epoch": 0.31865302527271494, + "grad_norm": 6.2704291343688965, + "learning_rate": 9.458963652542953e-05, + "loss": 0.8616, + "step": 4703 + }, + { + "epoch": 0.31872078054068703, + "grad_norm": 7.873881816864014, + "learning_rate": 9.458826750633171e-05, + "loss": 0.9212, + "step": 4704 + }, + { + "epoch": 0.3187885358086591, + "grad_norm": 7.088031768798828, + "learning_rate": 9.45868984872339e-05, + "loss": 0.6453, + "step": 4705 + }, + { + "epoch": 0.3188562910766312, + "grad_norm": 7.319702625274658, + "learning_rate": 9.458552946813609e-05, + "loss": 0.8328, + "step": 4706 + }, + { + "epoch": 0.3189240463446033, + "grad_norm": 6.979453086853027, + "learning_rate": 9.458416044903827e-05, + "loss": 0.7806, + "step": 4707 + }, + { + "epoch": 0.3189918016125754, + "grad_norm": 7.868389129638672, + "learning_rate": 9.458279142994046e-05, + "loss": 0.8052, + "step": 4708 + }, + { + "epoch": 0.3190595568805475, + "grad_norm": 7.2435431480407715, + "learning_rate": 9.458142241084264e-05, + "loss": 0.8922, + "step": 4709 + }, + { + "epoch": 0.3191273121485195, + "grad_norm": 8.62594985961914, + "learning_rate": 9.458005339174482e-05, + "loss": 1.0572, + "step": 4710 + }, + { + "epoch": 0.3191950674164916, + "grad_norm": 8.404034614562988, + "learning_rate": 9.457868437264701e-05, + "loss": 1.1592, + "step": 4711 + }, + { + "epoch": 0.3192628226844637, + "grad_norm": 6.669247150421143, + "learning_rate": 9.45773153535492e-05, + "loss": 0.8727, + "step": 4712 + }, + { + "epoch": 0.3193305779524358, + "grad_norm": 7.565460681915283, + "learning_rate": 9.457594633445138e-05, + "loss": 0.7803, + "step": 4713 + }, + { + "epoch": 0.3193983332204079, + "grad_norm": 7.726403713226318, + "learning_rate": 9.457457731535356e-05, + "loss": 0.81, + "step": 4714 + }, + { + "epoch": 0.31946608848838, + "grad_norm": 7.344959259033203, + "learning_rate": 9.457320829625574e-05, + "loss": 0.6959, + "step": 4715 + }, + { + "epoch": 0.3195338437563521, + "grad_norm": 5.9748759269714355, + "learning_rate": 9.457183927715793e-05, + "loss": 0.8332, + "step": 4716 + }, + { + "epoch": 0.31960159902432417, + "grad_norm": 7.241995334625244, + "learning_rate": 9.457047025806011e-05, + "loss": 0.8243, + "step": 4717 + }, + { + "epoch": 0.3196693542922962, + "grad_norm": 7.812702655792236, + "learning_rate": 9.456910123896229e-05, + "loss": 0.8603, + "step": 4718 + }, + { + "epoch": 0.3197371095602683, + "grad_norm": 9.492036819458008, + "learning_rate": 9.456773221986447e-05, + "loss": 1.0208, + "step": 4719 + }, + { + "epoch": 0.3198048648282404, + "grad_norm": 8.922654151916504, + "learning_rate": 9.456636320076665e-05, + "loss": 1.1939, + "step": 4720 + }, + { + "epoch": 0.3198726200962125, + "grad_norm": 6.459314346313477, + "learning_rate": 9.456499418166885e-05, + "loss": 1.0332, + "step": 4721 + }, + { + "epoch": 0.31994037536418457, + "grad_norm": 7.104556560516357, + "learning_rate": 9.456362516257103e-05, + "loss": 1.0608, + "step": 4722 + }, + { + "epoch": 0.32000813063215666, + "grad_norm": 7.620473861694336, + "learning_rate": 9.45622561434732e-05, + "loss": 0.888, + "step": 4723 + }, + { + "epoch": 0.32007588590012875, + "grad_norm": 7.757092475891113, + "learning_rate": 9.456088712437539e-05, + "loss": 1.0355, + "step": 4724 + }, + { + "epoch": 0.32014364116810085, + "grad_norm": 7.084576606750488, + "learning_rate": 9.455951810527758e-05, + "loss": 0.7148, + "step": 4725 + }, + { + "epoch": 0.3202113964360729, + "grad_norm": 6.5413079261779785, + "learning_rate": 9.455814908617976e-05, + "loss": 0.8678, + "step": 4726 + }, + { + "epoch": 0.320279151704045, + "grad_norm": 8.085969924926758, + "learning_rate": 9.455678006708194e-05, + "loss": 1.0137, + "step": 4727 + }, + { + "epoch": 0.32034690697201706, + "grad_norm": 6.338340759277344, + "learning_rate": 9.455541104798412e-05, + "loss": 0.9573, + "step": 4728 + }, + { + "epoch": 0.32041466223998916, + "grad_norm": 8.724467277526855, + "learning_rate": 9.45540420288863e-05, + "loss": 1.0585, + "step": 4729 + }, + { + "epoch": 0.32048241750796125, + "grad_norm": 5.904287815093994, + "learning_rate": 9.45526730097885e-05, + "loss": 0.6882, + "step": 4730 + }, + { + "epoch": 0.32055017277593334, + "grad_norm": 6.975876331329346, + "learning_rate": 9.455130399069068e-05, + "loss": 0.8641, + "step": 4731 + }, + { + "epoch": 0.32061792804390543, + "grad_norm": 7.307252407073975, + "learning_rate": 9.454993497159286e-05, + "loss": 0.8961, + "step": 4732 + }, + { + "epoch": 0.3206856833118775, + "grad_norm": 6.06977653503418, + "learning_rate": 9.454856595249504e-05, + "loss": 0.8388, + "step": 4733 + }, + { + "epoch": 0.32075343857984956, + "grad_norm": 5.903672218322754, + "learning_rate": 9.454719693339723e-05, + "loss": 0.6962, + "step": 4734 + }, + { + "epoch": 0.32082119384782165, + "grad_norm": 9.061569213867188, + "learning_rate": 9.454582791429941e-05, + "loss": 1.0246, + "step": 4735 + }, + { + "epoch": 0.32088894911579374, + "grad_norm": 9.154926300048828, + "learning_rate": 9.454445889520159e-05, + "loss": 0.9336, + "step": 4736 + }, + { + "epoch": 0.32095670438376583, + "grad_norm": 6.51992654800415, + "learning_rate": 9.454308987610377e-05, + "loss": 0.8753, + "step": 4737 + }, + { + "epoch": 0.3210244596517379, + "grad_norm": 7.160184383392334, + "learning_rate": 9.454172085700595e-05, + "loss": 1.0036, + "step": 4738 + }, + { + "epoch": 0.32109221491971, + "grad_norm": 6.895291805267334, + "learning_rate": 9.454035183790815e-05, + "loss": 0.8359, + "step": 4739 + }, + { + "epoch": 0.3211599701876821, + "grad_norm": 7.363986015319824, + "learning_rate": 9.453898281881033e-05, + "loss": 1.0207, + "step": 4740 + }, + { + "epoch": 0.3212277254556542, + "grad_norm": 9.040234565734863, + "learning_rate": 9.45376137997125e-05, + "loss": 1.0123, + "step": 4741 + }, + { + "epoch": 0.32129548072362624, + "grad_norm": 7.679563045501709, + "learning_rate": 9.453624478061469e-05, + "loss": 1.1201, + "step": 4742 + }, + { + "epoch": 0.32136323599159833, + "grad_norm": 7.640948295593262, + "learning_rate": 9.453487576151687e-05, + "loss": 1.1834, + "step": 4743 + }, + { + "epoch": 0.3214309912595704, + "grad_norm": 6.348153591156006, + "learning_rate": 9.453350674241906e-05, + "loss": 0.735, + "step": 4744 + }, + { + "epoch": 0.3214987465275425, + "grad_norm": 6.899050712585449, + "learning_rate": 9.453213772332124e-05, + "loss": 0.9054, + "step": 4745 + }, + { + "epoch": 0.3215665017955146, + "grad_norm": 8.880489349365234, + "learning_rate": 9.453076870422342e-05, + "loss": 1.0253, + "step": 4746 + }, + { + "epoch": 0.3216342570634867, + "grad_norm": 7.2754597663879395, + "learning_rate": 9.45293996851256e-05, + "loss": 0.75, + "step": 4747 + }, + { + "epoch": 0.3217020123314588, + "grad_norm": 7.26841402053833, + "learning_rate": 9.45280306660278e-05, + "loss": 1.2198, + "step": 4748 + }, + { + "epoch": 0.3217697675994309, + "grad_norm": 6.0724310874938965, + "learning_rate": 9.452666164692998e-05, + "loss": 0.7444, + "step": 4749 + }, + { + "epoch": 0.3218375228674029, + "grad_norm": 7.82569694519043, + "learning_rate": 9.452529262783216e-05, + "loss": 0.9266, + "step": 4750 + }, + { + "epoch": 0.321905278135375, + "grad_norm": 7.153421878814697, + "learning_rate": 9.452392360873435e-05, + "loss": 1.2223, + "step": 4751 + }, + { + "epoch": 0.3219730334033471, + "grad_norm": 6.586106777191162, + "learning_rate": 9.452255458963653e-05, + "loss": 0.9157, + "step": 4752 + }, + { + "epoch": 0.3220407886713192, + "grad_norm": 7.776567459106445, + "learning_rate": 9.452118557053871e-05, + "loss": 0.7396, + "step": 4753 + }, + { + "epoch": 0.3221085439392913, + "grad_norm": 7.4870381355285645, + "learning_rate": 9.45198165514409e-05, + "loss": 1.0812, + "step": 4754 + }, + { + "epoch": 0.3221762992072634, + "grad_norm": 8.112491607666016, + "learning_rate": 9.451844753234309e-05, + "loss": 1.2816, + "step": 4755 + }, + { + "epoch": 0.32224405447523546, + "grad_norm": 8.562600135803223, + "learning_rate": 9.451707851324527e-05, + "loss": 0.9872, + "step": 4756 + }, + { + "epoch": 0.32231180974320756, + "grad_norm": 9.139601707458496, + "learning_rate": 9.451570949414746e-05, + "loss": 1.019, + "step": 4757 + }, + { + "epoch": 0.3223795650111796, + "grad_norm": 7.6649370193481445, + "learning_rate": 9.451434047504964e-05, + "loss": 1.1594, + "step": 4758 + }, + { + "epoch": 0.3224473202791517, + "grad_norm": 8.193527221679688, + "learning_rate": 9.451297145595182e-05, + "loss": 0.8804, + "step": 4759 + }, + { + "epoch": 0.3225150755471238, + "grad_norm": 5.558340072631836, + "learning_rate": 9.4511602436854e-05, + "loss": 0.8778, + "step": 4760 + }, + { + "epoch": 0.32258283081509587, + "grad_norm": 7.296480178833008, + "learning_rate": 9.451023341775618e-05, + "loss": 1.0039, + "step": 4761 + }, + { + "epoch": 0.32265058608306796, + "grad_norm": 8.916117668151855, + "learning_rate": 9.450886439865837e-05, + "loss": 0.9516, + "step": 4762 + }, + { + "epoch": 0.32271834135104005, + "grad_norm": 7.691675662994385, + "learning_rate": 9.450749537956056e-05, + "loss": 1.1872, + "step": 4763 + }, + { + "epoch": 0.32278609661901214, + "grad_norm": 6.425968647003174, + "learning_rate": 9.450612636046274e-05, + "loss": 0.7914, + "step": 4764 + }, + { + "epoch": 0.32285385188698423, + "grad_norm": 6.476465225219727, + "learning_rate": 9.450475734136492e-05, + "loss": 0.9449, + "step": 4765 + }, + { + "epoch": 0.3229216071549563, + "grad_norm": 7.470714569091797, + "learning_rate": 9.450338832226711e-05, + "loss": 0.8132, + "step": 4766 + }, + { + "epoch": 0.32298936242292836, + "grad_norm": 6.947244644165039, + "learning_rate": 9.450201930316929e-05, + "loss": 1.0203, + "step": 4767 + }, + { + "epoch": 0.32305711769090045, + "grad_norm": 6.939558982849121, + "learning_rate": 9.450065028407147e-05, + "loss": 0.7484, + "step": 4768 + }, + { + "epoch": 0.32312487295887254, + "grad_norm": 6.562455177307129, + "learning_rate": 9.449928126497365e-05, + "loss": 0.9659, + "step": 4769 + }, + { + "epoch": 0.32319262822684464, + "grad_norm": 7.884627819061279, + "learning_rate": 9.449791224587583e-05, + "loss": 0.9394, + "step": 4770 + }, + { + "epoch": 0.3232603834948167, + "grad_norm": 8.175631523132324, + "learning_rate": 9.449654322677802e-05, + "loss": 1.3089, + "step": 4771 + }, + { + "epoch": 0.3233281387627888, + "grad_norm": 7.907345294952393, + "learning_rate": 9.44951742076802e-05, + "loss": 0.7641, + "step": 4772 + }, + { + "epoch": 0.3233958940307609, + "grad_norm": 8.59745979309082, + "learning_rate": 9.449380518858239e-05, + "loss": 1.1506, + "step": 4773 + }, + { + "epoch": 0.323463649298733, + "grad_norm": 8.216800689697266, + "learning_rate": 9.449243616948457e-05, + "loss": 1.1007, + "step": 4774 + }, + { + "epoch": 0.32353140456670504, + "grad_norm": 7.630847930908203, + "learning_rate": 9.449106715038675e-05, + "loss": 0.6435, + "step": 4775 + }, + { + "epoch": 0.32359915983467713, + "grad_norm": 8.40709400177002, + "learning_rate": 9.448969813128894e-05, + "loss": 1.0299, + "step": 4776 + }, + { + "epoch": 0.3236669151026492, + "grad_norm": 7.3637166023254395, + "learning_rate": 9.448832911219112e-05, + "loss": 1.2733, + "step": 4777 + }, + { + "epoch": 0.3237346703706213, + "grad_norm": 7.707301139831543, + "learning_rate": 9.44869600930933e-05, + "loss": 0.91, + "step": 4778 + }, + { + "epoch": 0.3238024256385934, + "grad_norm": 6.911712169647217, + "learning_rate": 9.448559107399548e-05, + "loss": 1.0425, + "step": 4779 + }, + { + "epoch": 0.3238701809065655, + "grad_norm": 7.387215614318848, + "learning_rate": 9.448422205489768e-05, + "loss": 1.1508, + "step": 4780 + }, + { + "epoch": 0.3239379361745376, + "grad_norm": 6.074267864227295, + "learning_rate": 9.448285303579986e-05, + "loss": 0.7861, + "step": 4781 + }, + { + "epoch": 0.3240056914425097, + "grad_norm": 7.4029436111450195, + "learning_rate": 9.448148401670204e-05, + "loss": 1.1438, + "step": 4782 + }, + { + "epoch": 0.3240734467104817, + "grad_norm": 8.514384269714355, + "learning_rate": 9.448011499760422e-05, + "loss": 1.1075, + "step": 4783 + }, + { + "epoch": 0.3241412019784538, + "grad_norm": 8.503793716430664, + "learning_rate": 9.44787459785064e-05, + "loss": 0.9377, + "step": 4784 + }, + { + "epoch": 0.3242089572464259, + "grad_norm": 6.770750522613525, + "learning_rate": 9.447737695940859e-05, + "loss": 0.8342, + "step": 4785 + }, + { + "epoch": 0.324276712514398, + "grad_norm": 7.019729137420654, + "learning_rate": 9.447600794031077e-05, + "loss": 0.8236, + "step": 4786 + }, + { + "epoch": 0.3243444677823701, + "grad_norm": 6.147645950317383, + "learning_rate": 9.447463892121295e-05, + "loss": 0.9493, + "step": 4787 + }, + { + "epoch": 0.3244122230503422, + "grad_norm": 9.160635948181152, + "learning_rate": 9.447326990211513e-05, + "loss": 1.1632, + "step": 4788 + }, + { + "epoch": 0.32447997831831427, + "grad_norm": 7.918509483337402, + "learning_rate": 9.447190088301733e-05, + "loss": 1.0154, + "step": 4789 + }, + { + "epoch": 0.32454773358628636, + "grad_norm": 8.457235336303711, + "learning_rate": 9.44705318639195e-05, + "loss": 1.0373, + "step": 4790 + }, + { + "epoch": 0.3246154888542584, + "grad_norm": 7.332309722900391, + "learning_rate": 9.446916284482169e-05, + "loss": 1.09, + "step": 4791 + }, + { + "epoch": 0.3246832441222305, + "grad_norm": 7.7085700035095215, + "learning_rate": 9.446779382572387e-05, + "loss": 0.8492, + "step": 4792 + }, + { + "epoch": 0.3247509993902026, + "grad_norm": 6.121610641479492, + "learning_rate": 9.446642480662605e-05, + "loss": 1.0399, + "step": 4793 + }, + { + "epoch": 0.32481875465817467, + "grad_norm": 6.767165184020996, + "learning_rate": 9.446505578752824e-05, + "loss": 0.6908, + "step": 4794 + }, + { + "epoch": 0.32488650992614676, + "grad_norm": 7.629088401794434, + "learning_rate": 9.446368676843042e-05, + "loss": 0.9356, + "step": 4795 + }, + { + "epoch": 0.32495426519411885, + "grad_norm": 7.590803146362305, + "learning_rate": 9.44623177493326e-05, + "loss": 0.7671, + "step": 4796 + }, + { + "epoch": 0.32502202046209094, + "grad_norm": 7.876105785369873, + "learning_rate": 9.44609487302348e-05, + "loss": 0.9654, + "step": 4797 + }, + { + "epoch": 0.32508977573006304, + "grad_norm": 8.503900527954102, + "learning_rate": 9.445957971113698e-05, + "loss": 0.9726, + "step": 4798 + }, + { + "epoch": 0.32515753099803507, + "grad_norm": 9.96375846862793, + "learning_rate": 9.445821069203916e-05, + "loss": 0.7587, + "step": 4799 + }, + { + "epoch": 0.32522528626600716, + "grad_norm": 6.24782133102417, + "learning_rate": 9.445684167294135e-05, + "loss": 0.8028, + "step": 4800 + }, + { + "epoch": 0.32529304153397925, + "grad_norm": 7.5481181144714355, + "learning_rate": 9.445547265384353e-05, + "loss": 0.8194, + "step": 4801 + }, + { + "epoch": 0.32536079680195135, + "grad_norm": 8.148533821105957, + "learning_rate": 9.445410363474571e-05, + "loss": 1.0009, + "step": 4802 + }, + { + "epoch": 0.32542855206992344, + "grad_norm": 7.8531341552734375, + "learning_rate": 9.44527346156479e-05, + "loss": 1.0629, + "step": 4803 + }, + { + "epoch": 0.32549630733789553, + "grad_norm": 7.7668843269348145, + "learning_rate": 9.445136559655008e-05, + "loss": 1.1211, + "step": 4804 + }, + { + "epoch": 0.3255640626058676, + "grad_norm": 9.245609283447266, + "learning_rate": 9.444999657745226e-05, + "loss": 1.0407, + "step": 4805 + }, + { + "epoch": 0.3256318178738397, + "grad_norm": 7.387469291687012, + "learning_rate": 9.444862755835445e-05, + "loss": 1.128, + "step": 4806 + }, + { + "epoch": 0.32569957314181175, + "grad_norm": 6.636757850646973, + "learning_rate": 9.444725853925663e-05, + "loss": 0.8386, + "step": 4807 + }, + { + "epoch": 0.32576732840978384, + "grad_norm": 8.264979362487793, + "learning_rate": 9.444588952015882e-05, + "loss": 0.904, + "step": 4808 + }, + { + "epoch": 0.32583508367775593, + "grad_norm": 7.77110481262207, + "learning_rate": 9.4444520501061e-05, + "loss": 1.0292, + "step": 4809 + }, + { + "epoch": 0.325902838945728, + "grad_norm": 6.825851917266846, + "learning_rate": 9.444315148196318e-05, + "loss": 0.7484, + "step": 4810 + }, + { + "epoch": 0.3259705942137001, + "grad_norm": 6.475069999694824, + "learning_rate": 9.444178246286536e-05, + "loss": 0.9085, + "step": 4811 + }, + { + "epoch": 0.3260383494816722, + "grad_norm": 8.736098289489746, + "learning_rate": 9.444041344376755e-05, + "loss": 1.1817, + "step": 4812 + }, + { + "epoch": 0.3261061047496443, + "grad_norm": 7.0992608070373535, + "learning_rate": 9.443904442466973e-05, + "loss": 0.8654, + "step": 4813 + }, + { + "epoch": 0.3261738600176164, + "grad_norm": 6.594883441925049, + "learning_rate": 9.443767540557192e-05, + "loss": 1.0002, + "step": 4814 + }, + { + "epoch": 0.3262416152855885, + "grad_norm": 7.857585906982422, + "learning_rate": 9.44363063864741e-05, + "loss": 1.1711, + "step": 4815 + }, + { + "epoch": 0.3263093705535605, + "grad_norm": 6.884295463562012, + "learning_rate": 9.443493736737628e-05, + "loss": 0.8003, + "step": 4816 + }, + { + "epoch": 0.3263771258215326, + "grad_norm": 7.344529628753662, + "learning_rate": 9.443356834827847e-05, + "loss": 1.0009, + "step": 4817 + }, + { + "epoch": 0.3264448810895047, + "grad_norm": 6.892088413238525, + "learning_rate": 9.443219932918065e-05, + "loss": 0.9797, + "step": 4818 + }, + { + "epoch": 0.3265126363574768, + "grad_norm": 12.833809852600098, + "learning_rate": 9.443083031008283e-05, + "loss": 1.1143, + "step": 4819 + }, + { + "epoch": 0.3265803916254489, + "grad_norm": 7.343682765960693, + "learning_rate": 9.442946129098501e-05, + "loss": 0.6302, + "step": 4820 + }, + { + "epoch": 0.326648146893421, + "grad_norm": 7.089320659637451, + "learning_rate": 9.442809227188719e-05, + "loss": 1.0129, + "step": 4821 + }, + { + "epoch": 0.32671590216139307, + "grad_norm": 6.60029935836792, + "learning_rate": 9.442672325278938e-05, + "loss": 0.9259, + "step": 4822 + }, + { + "epoch": 0.32678365742936516, + "grad_norm": 10.470613479614258, + "learning_rate": 9.442535423369157e-05, + "loss": 1.1124, + "step": 4823 + }, + { + "epoch": 0.3268514126973372, + "grad_norm": 8.491450309753418, + "learning_rate": 9.442398521459375e-05, + "loss": 0.9943, + "step": 4824 + }, + { + "epoch": 0.3269191679653093, + "grad_norm": 6.033353328704834, + "learning_rate": 9.442261619549593e-05, + "loss": 1.0003, + "step": 4825 + }, + { + "epoch": 0.3269869232332814, + "grad_norm": 7.0092549324035645, + "learning_rate": 9.442124717639812e-05, + "loss": 0.7843, + "step": 4826 + }, + { + "epoch": 0.32705467850125347, + "grad_norm": 8.52950382232666, + "learning_rate": 9.44198781573003e-05, + "loss": 1.1036, + "step": 4827 + }, + { + "epoch": 0.32712243376922556, + "grad_norm": 7.509829044342041, + "learning_rate": 9.441850913820248e-05, + "loss": 0.9969, + "step": 4828 + }, + { + "epoch": 0.32719018903719765, + "grad_norm": 7.235622406005859, + "learning_rate": 9.441714011910466e-05, + "loss": 0.9131, + "step": 4829 + }, + { + "epoch": 0.32725794430516975, + "grad_norm": 8.332746505737305, + "learning_rate": 9.441577110000684e-05, + "loss": 1.1663, + "step": 4830 + }, + { + "epoch": 0.32732569957314184, + "grad_norm": 8.321161270141602, + "learning_rate": 9.441440208090904e-05, + "loss": 1.0091, + "step": 4831 + }, + { + "epoch": 0.3273934548411139, + "grad_norm": 7.0502753257751465, + "learning_rate": 9.441303306181122e-05, + "loss": 1.0273, + "step": 4832 + }, + { + "epoch": 0.32746121010908597, + "grad_norm": 6.921250343322754, + "learning_rate": 9.44116640427134e-05, + "loss": 0.873, + "step": 4833 + }, + { + "epoch": 0.32752896537705806, + "grad_norm": 7.386787414550781, + "learning_rate": 9.441029502361558e-05, + "loss": 0.7415, + "step": 4834 + }, + { + "epoch": 0.32759672064503015, + "grad_norm": 6.814700603485107, + "learning_rate": 9.440892600451777e-05, + "loss": 0.8827, + "step": 4835 + }, + { + "epoch": 0.32766447591300224, + "grad_norm": 7.551968574523926, + "learning_rate": 9.440755698541995e-05, + "loss": 0.9752, + "step": 4836 + }, + { + "epoch": 0.32773223118097433, + "grad_norm": 8.299920082092285, + "learning_rate": 9.440618796632213e-05, + "loss": 1.0253, + "step": 4837 + }, + { + "epoch": 0.3277999864489464, + "grad_norm": 7.607963562011719, + "learning_rate": 9.440481894722431e-05, + "loss": 1.0562, + "step": 4838 + }, + { + "epoch": 0.3278677417169185, + "grad_norm": 8.098003387451172, + "learning_rate": 9.440344992812649e-05, + "loss": 0.9055, + "step": 4839 + }, + { + "epoch": 0.32793549698489055, + "grad_norm": 6.6647257804870605, + "learning_rate": 9.440208090902869e-05, + "loss": 0.8188, + "step": 4840 + }, + { + "epoch": 0.32800325225286264, + "grad_norm": 7.541131496429443, + "learning_rate": 9.440071188993087e-05, + "loss": 0.7095, + "step": 4841 + }, + { + "epoch": 0.32807100752083473, + "grad_norm": 10.203042984008789, + "learning_rate": 9.439934287083305e-05, + "loss": 0.8256, + "step": 4842 + }, + { + "epoch": 0.3281387627888068, + "grad_norm": 8.049646377563477, + "learning_rate": 9.439797385173524e-05, + "loss": 1.0244, + "step": 4843 + }, + { + "epoch": 0.3282065180567789, + "grad_norm": 6.727071285247803, + "learning_rate": 9.439660483263742e-05, + "loss": 0.9681, + "step": 4844 + }, + { + "epoch": 0.328274273324751, + "grad_norm": 8.106125831604004, + "learning_rate": 9.43952358135396e-05, + "loss": 1.0944, + "step": 4845 + }, + { + "epoch": 0.3283420285927231, + "grad_norm": 8.768218994140625, + "learning_rate": 9.43938667944418e-05, + "loss": 0.7493, + "step": 4846 + }, + { + "epoch": 0.3284097838606952, + "grad_norm": 6.333378791809082, + "learning_rate": 9.439249777534397e-05, + "loss": 0.9187, + "step": 4847 + }, + { + "epoch": 0.32847753912866723, + "grad_norm": 7.732221603393555, + "learning_rate": 9.439112875624616e-05, + "loss": 1.2522, + "step": 4848 + }, + { + "epoch": 0.3285452943966393, + "grad_norm": 7.67783784866333, + "learning_rate": 9.438975973714835e-05, + "loss": 0.8682, + "step": 4849 + }, + { + "epoch": 0.3286130496646114, + "grad_norm": 7.557129859924316, + "learning_rate": 9.438839071805053e-05, + "loss": 0.7843, + "step": 4850 + }, + { + "epoch": 0.3286808049325835, + "grad_norm": 6.559933185577393, + "learning_rate": 9.438702169895271e-05, + "loss": 0.902, + "step": 4851 + }, + { + "epoch": 0.3287485602005556, + "grad_norm": 7.604280471801758, + "learning_rate": 9.438565267985489e-05, + "loss": 0.8905, + "step": 4852 + }, + { + "epoch": 0.3288163154685277, + "grad_norm": 7.508764266967773, + "learning_rate": 9.438428366075707e-05, + "loss": 0.932, + "step": 4853 + }, + { + "epoch": 0.3288840707364998, + "grad_norm": 6.650167942047119, + "learning_rate": 9.438291464165926e-05, + "loss": 1.0507, + "step": 4854 + }, + { + "epoch": 0.32895182600447187, + "grad_norm": 8.334061622619629, + "learning_rate": 9.438154562256144e-05, + "loss": 0.7949, + "step": 4855 + }, + { + "epoch": 0.3290195812724439, + "grad_norm": 9.498878479003906, + "learning_rate": 9.438017660346362e-05, + "loss": 0.8487, + "step": 4856 + }, + { + "epoch": 0.329087336540416, + "grad_norm": 7.810210227966309, + "learning_rate": 9.43788075843658e-05, + "loss": 0.9917, + "step": 4857 + }, + { + "epoch": 0.3291550918083881, + "grad_norm": 7.337824821472168, + "learning_rate": 9.4377438565268e-05, + "loss": 0.8683, + "step": 4858 + }, + { + "epoch": 0.3292228470763602, + "grad_norm": 8.331491470336914, + "learning_rate": 9.437606954617018e-05, + "loss": 1.178, + "step": 4859 + }, + { + "epoch": 0.3292906023443323, + "grad_norm": 6.5013227462768555, + "learning_rate": 9.437470052707236e-05, + "loss": 0.7093, + "step": 4860 + }, + { + "epoch": 0.32935835761230436, + "grad_norm": 8.814985275268555, + "learning_rate": 9.437333150797454e-05, + "loss": 0.9212, + "step": 4861 + }, + { + "epoch": 0.32942611288027646, + "grad_norm": 8.78661060333252, + "learning_rate": 9.437196248887672e-05, + "loss": 0.8827, + "step": 4862 + }, + { + "epoch": 0.32949386814824855, + "grad_norm": 8.912193298339844, + "learning_rate": 9.437059346977891e-05, + "loss": 1.133, + "step": 4863 + }, + { + "epoch": 0.3295616234162206, + "grad_norm": 7.901734352111816, + "learning_rate": 9.43692244506811e-05, + "loss": 0.8293, + "step": 4864 + }, + { + "epoch": 0.3296293786841927, + "grad_norm": 8.270600318908691, + "learning_rate": 9.436785543158328e-05, + "loss": 1.1584, + "step": 4865 + }, + { + "epoch": 0.32969713395216477, + "grad_norm": 6.765751838684082, + "learning_rate": 9.436648641248546e-05, + "loss": 0.7705, + "step": 4866 + }, + { + "epoch": 0.32976488922013686, + "grad_norm": 7.272820949554443, + "learning_rate": 9.436511739338765e-05, + "loss": 1.2151, + "step": 4867 + }, + { + "epoch": 0.32983264448810895, + "grad_norm": 6.639741897583008, + "learning_rate": 9.436374837428983e-05, + "loss": 0.8868, + "step": 4868 + }, + { + "epoch": 0.32990039975608104, + "grad_norm": 7.6254143714904785, + "learning_rate": 9.436237935519201e-05, + "loss": 1.0342, + "step": 4869 + }, + { + "epoch": 0.32996815502405313, + "grad_norm": 6.546008586883545, + "learning_rate": 9.436101033609419e-05, + "loss": 0.7223, + "step": 4870 + }, + { + "epoch": 0.3300359102920252, + "grad_norm": 6.215434551239014, + "learning_rate": 9.435964131699637e-05, + "loss": 0.8506, + "step": 4871 + }, + { + "epoch": 0.3301036655599973, + "grad_norm": 7.897797584533691, + "learning_rate": 9.435827229789856e-05, + "loss": 0.8699, + "step": 4872 + }, + { + "epoch": 0.33017142082796935, + "grad_norm": 8.3803129196167, + "learning_rate": 9.435690327880074e-05, + "loss": 1.0922, + "step": 4873 + }, + { + "epoch": 0.33023917609594144, + "grad_norm": 6.675380706787109, + "learning_rate": 9.435553425970293e-05, + "loss": 0.8401, + "step": 4874 + }, + { + "epoch": 0.33030693136391354, + "grad_norm": 7.380627632141113, + "learning_rate": 9.43541652406051e-05, + "loss": 0.615, + "step": 4875 + }, + { + "epoch": 0.33037468663188563, + "grad_norm": 8.120410919189453, + "learning_rate": 9.435279622150729e-05, + "loss": 1.2622, + "step": 4876 + }, + { + "epoch": 0.3304424418998577, + "grad_norm": 7.170663356781006, + "learning_rate": 9.435142720240948e-05, + "loss": 0.893, + "step": 4877 + }, + { + "epoch": 0.3305101971678298, + "grad_norm": 12.477727890014648, + "learning_rate": 9.435005818331166e-05, + "loss": 0.9654, + "step": 4878 + }, + { + "epoch": 0.3305779524358019, + "grad_norm": 6.562829971313477, + "learning_rate": 9.434868916421384e-05, + "loss": 0.7042, + "step": 4879 + }, + { + "epoch": 0.330645707703774, + "grad_norm": 8.406328201293945, + "learning_rate": 9.434732014511602e-05, + "loss": 1.0807, + "step": 4880 + }, + { + "epoch": 0.33071346297174603, + "grad_norm": 9.06248950958252, + "learning_rate": 9.434595112601821e-05, + "loss": 0.8586, + "step": 4881 + }, + { + "epoch": 0.3307812182397181, + "grad_norm": 8.743175506591797, + "learning_rate": 9.43445821069204e-05, + "loss": 0.9501, + "step": 4882 + }, + { + "epoch": 0.3308489735076902, + "grad_norm": 7.666022777557373, + "learning_rate": 9.434321308782258e-05, + "loss": 0.9411, + "step": 4883 + }, + { + "epoch": 0.3309167287756623, + "grad_norm": 6.989424705505371, + "learning_rate": 9.434184406872476e-05, + "loss": 0.9511, + "step": 4884 + }, + { + "epoch": 0.3309844840436344, + "grad_norm": 6.9908905029296875, + "learning_rate": 9.434047504962694e-05, + "loss": 0.9769, + "step": 4885 + }, + { + "epoch": 0.3310522393116065, + "grad_norm": 7.985763072967529, + "learning_rate": 9.433910603052913e-05, + "loss": 1.1041, + "step": 4886 + }, + { + "epoch": 0.3311199945795786, + "grad_norm": 7.9841694831848145, + "learning_rate": 9.433773701143131e-05, + "loss": 0.8341, + "step": 4887 + }, + { + "epoch": 0.33118774984755067, + "grad_norm": 6.9602952003479, + "learning_rate": 9.433636799233349e-05, + "loss": 0.9388, + "step": 4888 + }, + { + "epoch": 0.3312555051155227, + "grad_norm": 7.2530012130737305, + "learning_rate": 9.433499897323567e-05, + "loss": 0.9396, + "step": 4889 + }, + { + "epoch": 0.3313232603834948, + "grad_norm": 7.338861465454102, + "learning_rate": 9.433362995413786e-05, + "loss": 0.9168, + "step": 4890 + }, + { + "epoch": 0.3313910156514669, + "grad_norm": 7.08353853225708, + "learning_rate": 9.433226093504005e-05, + "loss": 0.9332, + "step": 4891 + }, + { + "epoch": 0.331458770919439, + "grad_norm": 7.379842758178711, + "learning_rate": 9.433089191594223e-05, + "loss": 0.7139, + "step": 4892 + }, + { + "epoch": 0.3315265261874111, + "grad_norm": 8.599993705749512, + "learning_rate": 9.432952289684442e-05, + "loss": 1.0023, + "step": 4893 + }, + { + "epoch": 0.33159428145538317, + "grad_norm": 7.096752643585205, + "learning_rate": 9.43281538777466e-05, + "loss": 0.9052, + "step": 4894 + }, + { + "epoch": 0.33166203672335526, + "grad_norm": 6.28333044052124, + "learning_rate": 9.432678485864878e-05, + "loss": 0.8721, + "step": 4895 + }, + { + "epoch": 0.33172979199132735, + "grad_norm": 8.48362922668457, + "learning_rate": 9.432541583955097e-05, + "loss": 0.9774, + "step": 4896 + }, + { + "epoch": 0.3317975472592994, + "grad_norm": 8.214259147644043, + "learning_rate": 9.432404682045315e-05, + "loss": 0.999, + "step": 4897 + }, + { + "epoch": 0.3318653025272715, + "grad_norm": 9.603363990783691, + "learning_rate": 9.432267780135533e-05, + "loss": 0.7848, + "step": 4898 + }, + { + "epoch": 0.33193305779524357, + "grad_norm": 7.982606887817383, + "learning_rate": 9.432130878225753e-05, + "loss": 0.7167, + "step": 4899 + }, + { + "epoch": 0.33200081306321566, + "grad_norm": 6.8360915184021, + "learning_rate": 9.431993976315971e-05, + "loss": 1.1884, + "step": 4900 + }, + { + "epoch": 0.33206856833118775, + "grad_norm": 9.70743179321289, + "learning_rate": 9.431857074406189e-05, + "loss": 1.0198, + "step": 4901 + }, + { + "epoch": 0.33213632359915984, + "grad_norm": 7.7961554527282715, + "learning_rate": 9.431720172496407e-05, + "loss": 0.8637, + "step": 4902 + }, + { + "epoch": 0.33220407886713194, + "grad_norm": 7.809814453125, + "learning_rate": 9.431583270586625e-05, + "loss": 1.0737, + "step": 4903 + }, + { + "epoch": 0.332271834135104, + "grad_norm": 6.5765862464904785, + "learning_rate": 9.431446368676844e-05, + "loss": 0.6574, + "step": 4904 + }, + { + "epoch": 0.33233958940307606, + "grad_norm": 6.912911891937256, + "learning_rate": 9.431309466767062e-05, + "loss": 0.9647, + "step": 4905 + }, + { + "epoch": 0.33240734467104815, + "grad_norm": 7.2532877922058105, + "learning_rate": 9.43117256485728e-05, + "loss": 1.098, + "step": 4906 + }, + { + "epoch": 0.33247509993902025, + "grad_norm": 6.589138984680176, + "learning_rate": 9.431035662947498e-05, + "loss": 0.9948, + "step": 4907 + }, + { + "epoch": 0.33254285520699234, + "grad_norm": 9.1469144821167, + "learning_rate": 9.430898761037717e-05, + "loss": 0.9656, + "step": 4908 + }, + { + "epoch": 0.33261061047496443, + "grad_norm": 7.124277591705322, + "learning_rate": 9.430761859127936e-05, + "loss": 0.8704, + "step": 4909 + }, + { + "epoch": 0.3326783657429365, + "grad_norm": 6.862776279449463, + "learning_rate": 9.430624957218154e-05, + "loss": 0.8008, + "step": 4910 + }, + { + "epoch": 0.3327461210109086, + "grad_norm": 6.245189666748047, + "learning_rate": 9.430488055308372e-05, + "loss": 0.8529, + "step": 4911 + }, + { + "epoch": 0.3328138762788807, + "grad_norm": 8.546821594238281, + "learning_rate": 9.43035115339859e-05, + "loss": 1.1747, + "step": 4912 + }, + { + "epoch": 0.33288163154685274, + "grad_norm": 7.5279765129089355, + "learning_rate": 9.43021425148881e-05, + "loss": 0.8725, + "step": 4913 + }, + { + "epoch": 0.33294938681482483, + "grad_norm": 7.987123966217041, + "learning_rate": 9.430077349579027e-05, + "loss": 0.8752, + "step": 4914 + }, + { + "epoch": 0.3330171420827969, + "grad_norm": 10.1973876953125, + "learning_rate": 9.429940447669245e-05, + "loss": 1.0813, + "step": 4915 + }, + { + "epoch": 0.333084897350769, + "grad_norm": 7.406893253326416, + "learning_rate": 9.429803545759464e-05, + "loss": 0.7064, + "step": 4916 + }, + { + "epoch": 0.3331526526187411, + "grad_norm": 6.545569896697998, + "learning_rate": 9.429666643849682e-05, + "loss": 0.9005, + "step": 4917 + }, + { + "epoch": 0.3332204078867132, + "grad_norm": 8.624588012695312, + "learning_rate": 9.429529741939901e-05, + "loss": 0.939, + "step": 4918 + }, + { + "epoch": 0.3332881631546853, + "grad_norm": 6.811842441558838, + "learning_rate": 9.429392840030119e-05, + "loss": 0.8367, + "step": 4919 + }, + { + "epoch": 0.3333559184226574, + "grad_norm": 7.256634712219238, + "learning_rate": 9.429255938120337e-05, + "loss": 1.0455, + "step": 4920 + }, + { + "epoch": 0.3334236736906295, + "grad_norm": 6.900022983551025, + "learning_rate": 9.429119036210555e-05, + "loss": 0.954, + "step": 4921 + }, + { + "epoch": 0.3334914289586015, + "grad_norm": 9.217741012573242, + "learning_rate": 9.428982134300774e-05, + "loss": 0.6392, + "step": 4922 + }, + { + "epoch": 0.3335591842265736, + "grad_norm": 7.729586124420166, + "learning_rate": 9.428845232390992e-05, + "loss": 0.9372, + "step": 4923 + }, + { + "epoch": 0.3336269394945457, + "grad_norm": 6.606268405914307, + "learning_rate": 9.42870833048121e-05, + "loss": 0.8127, + "step": 4924 + }, + { + "epoch": 0.3336946947625178, + "grad_norm": 8.078627586364746, + "learning_rate": 9.428571428571429e-05, + "loss": 0.847, + "step": 4925 + }, + { + "epoch": 0.3337624500304899, + "grad_norm": 8.212615966796875, + "learning_rate": 9.428434526661647e-05, + "loss": 1.0133, + "step": 4926 + }, + { + "epoch": 0.33383020529846197, + "grad_norm": 6.68360710144043, + "learning_rate": 9.428297624751866e-05, + "loss": 0.9796, + "step": 4927 + }, + { + "epoch": 0.33389796056643406, + "grad_norm": 7.115147590637207, + "learning_rate": 9.428160722842084e-05, + "loss": 0.9106, + "step": 4928 + }, + { + "epoch": 0.33396571583440615, + "grad_norm": 7.178501129150391, + "learning_rate": 9.428023820932302e-05, + "loss": 0.7531, + "step": 4929 + }, + { + "epoch": 0.3340334711023782, + "grad_norm": 7.303642749786377, + "learning_rate": 9.42788691902252e-05, + "loss": 0.9163, + "step": 4930 + }, + { + "epoch": 0.3341012263703503, + "grad_norm": 7.532678604125977, + "learning_rate": 9.427750017112738e-05, + "loss": 1.001, + "step": 4931 + }, + { + "epoch": 0.33416898163832237, + "grad_norm": 6.791550636291504, + "learning_rate": 9.427613115202957e-05, + "loss": 0.9005, + "step": 4932 + }, + { + "epoch": 0.33423673690629446, + "grad_norm": 7.264156341552734, + "learning_rate": 9.427476213293176e-05, + "loss": 0.8076, + "step": 4933 + }, + { + "epoch": 0.33430449217426655, + "grad_norm": 8.477554321289062, + "learning_rate": 9.427339311383394e-05, + "loss": 0.9363, + "step": 4934 + }, + { + "epoch": 0.33437224744223865, + "grad_norm": 7.020787239074707, + "learning_rate": 9.427202409473612e-05, + "loss": 0.6933, + "step": 4935 + }, + { + "epoch": 0.33444000271021074, + "grad_norm": 7.990420818328857, + "learning_rate": 9.427065507563831e-05, + "loss": 0.9759, + "step": 4936 + }, + { + "epoch": 0.33450775797818283, + "grad_norm": 8.950580596923828, + "learning_rate": 9.426928605654049e-05, + "loss": 1.2876, + "step": 4937 + }, + { + "epoch": 0.33457551324615487, + "grad_norm": 6.879635334014893, + "learning_rate": 9.426791703744267e-05, + "loss": 0.992, + "step": 4938 + }, + { + "epoch": 0.33464326851412696, + "grad_norm": 7.640803813934326, + "learning_rate": 9.426654801834486e-05, + "loss": 1.1523, + "step": 4939 + }, + { + "epoch": 0.33471102378209905, + "grad_norm": 7.156732559204102, + "learning_rate": 9.426517899924704e-05, + "loss": 0.6521, + "step": 4940 + }, + { + "epoch": 0.33477877905007114, + "grad_norm": 7.543867588043213, + "learning_rate": 9.426380998014922e-05, + "loss": 0.8495, + "step": 4941 + }, + { + "epoch": 0.33484653431804323, + "grad_norm": 8.554939270019531, + "learning_rate": 9.426244096105142e-05, + "loss": 1.0593, + "step": 4942 + }, + { + "epoch": 0.3349142895860153, + "grad_norm": 7.902100563049316, + "learning_rate": 9.42610719419536e-05, + "loss": 1.0271, + "step": 4943 + }, + { + "epoch": 0.3349820448539874, + "grad_norm": 7.945007801055908, + "learning_rate": 9.425970292285578e-05, + "loss": 0.7839, + "step": 4944 + }, + { + "epoch": 0.3350498001219595, + "grad_norm": 7.539274215698242, + "learning_rate": 9.425833390375797e-05, + "loss": 0.968, + "step": 4945 + }, + { + "epoch": 0.33511755538993154, + "grad_norm": 7.23228120803833, + "learning_rate": 9.425696488466015e-05, + "loss": 0.8319, + "step": 4946 + }, + { + "epoch": 0.33518531065790363, + "grad_norm": 7.251431941986084, + "learning_rate": 9.425559586556233e-05, + "loss": 0.9513, + "step": 4947 + }, + { + "epoch": 0.3352530659258757, + "grad_norm": 7.047804355621338, + "learning_rate": 9.425422684646451e-05, + "loss": 0.8677, + "step": 4948 + }, + { + "epoch": 0.3353208211938478, + "grad_norm": 7.7244696617126465, + "learning_rate": 9.42528578273667e-05, + "loss": 0.8705, + "step": 4949 + }, + { + "epoch": 0.3353885764618199, + "grad_norm": 6.078210830688477, + "learning_rate": 9.425148880826889e-05, + "loss": 0.9205, + "step": 4950 + }, + { + "epoch": 0.335456331729792, + "grad_norm": 7.856949329376221, + "learning_rate": 9.425011978917107e-05, + "loss": 0.8637, + "step": 4951 + }, + { + "epoch": 0.3355240869977641, + "grad_norm": 7.341653347015381, + "learning_rate": 9.424875077007325e-05, + "loss": 0.9582, + "step": 4952 + }, + { + "epoch": 0.3355918422657362, + "grad_norm": 7.456873893737793, + "learning_rate": 9.424738175097543e-05, + "loss": 0.8581, + "step": 4953 + }, + { + "epoch": 0.3356595975337082, + "grad_norm": 7.885173797607422, + "learning_rate": 9.424601273187761e-05, + "loss": 0.9551, + "step": 4954 + }, + { + "epoch": 0.3357273528016803, + "grad_norm": 7.016728401184082, + "learning_rate": 9.42446437127798e-05, + "loss": 0.9493, + "step": 4955 + }, + { + "epoch": 0.3357951080696524, + "grad_norm": 6.383670806884766, + "learning_rate": 9.424327469368198e-05, + "loss": 0.8178, + "step": 4956 + }, + { + "epoch": 0.3358628633376245, + "grad_norm": 6.930068492889404, + "learning_rate": 9.424190567458416e-05, + "loss": 1.1073, + "step": 4957 + }, + { + "epoch": 0.3359306186055966, + "grad_norm": 7.001153469085693, + "learning_rate": 9.424053665548634e-05, + "loss": 0.844, + "step": 4958 + }, + { + "epoch": 0.3359983738735687, + "grad_norm": 8.316204071044922, + "learning_rate": 9.423916763638854e-05, + "loss": 0.9363, + "step": 4959 + }, + { + "epoch": 0.33606612914154077, + "grad_norm": 7.931922912597656, + "learning_rate": 9.423779861729072e-05, + "loss": 1.1617, + "step": 4960 + }, + { + "epoch": 0.33613388440951286, + "grad_norm": 7.447847366333008, + "learning_rate": 9.42364295981929e-05, + "loss": 0.9137, + "step": 4961 + }, + { + "epoch": 0.3362016396774849, + "grad_norm": 6.582141876220703, + "learning_rate": 9.423506057909508e-05, + "loss": 1.0516, + "step": 4962 + }, + { + "epoch": 0.336269394945457, + "grad_norm": 6.278825283050537, + "learning_rate": 9.423369155999726e-05, + "loss": 0.9489, + "step": 4963 + }, + { + "epoch": 0.3363371502134291, + "grad_norm": 7.415517330169678, + "learning_rate": 9.423232254089945e-05, + "loss": 0.9275, + "step": 4964 + }, + { + "epoch": 0.3364049054814012, + "grad_norm": 6.073827266693115, + "learning_rate": 9.423095352180163e-05, + "loss": 0.7514, + "step": 4965 + }, + { + "epoch": 0.33647266074937326, + "grad_norm": 9.100361824035645, + "learning_rate": 9.422958450270381e-05, + "loss": 1.0882, + "step": 4966 + }, + { + "epoch": 0.33654041601734536, + "grad_norm": 8.23529052734375, + "learning_rate": 9.4228215483606e-05, + "loss": 0.8323, + "step": 4967 + }, + { + "epoch": 0.33660817128531745, + "grad_norm": 9.86119270324707, + "learning_rate": 9.422684646450819e-05, + "loss": 0.8997, + "step": 4968 + }, + { + "epoch": 0.33667592655328954, + "grad_norm": 7.328428268432617, + "learning_rate": 9.422547744541037e-05, + "loss": 0.8558, + "step": 4969 + }, + { + "epoch": 0.3367436818212616, + "grad_norm": 6.703019618988037, + "learning_rate": 9.422410842631255e-05, + "loss": 0.782, + "step": 4970 + }, + { + "epoch": 0.33681143708923367, + "grad_norm": 7.780190944671631, + "learning_rate": 9.422273940721473e-05, + "loss": 1.1681, + "step": 4971 + }, + { + "epoch": 0.33687919235720576, + "grad_norm": 6.972787380218506, + "learning_rate": 9.422137038811691e-05, + "loss": 1.0137, + "step": 4972 + }, + { + "epoch": 0.33694694762517785, + "grad_norm": 6.521172523498535, + "learning_rate": 9.42200013690191e-05, + "loss": 0.8866, + "step": 4973 + }, + { + "epoch": 0.33701470289314994, + "grad_norm": 6.414315223693848, + "learning_rate": 9.421863234992128e-05, + "loss": 0.7614, + "step": 4974 + }, + { + "epoch": 0.33708245816112203, + "grad_norm": 8.930048942565918, + "learning_rate": 9.421726333082346e-05, + "loss": 1.0757, + "step": 4975 + }, + { + "epoch": 0.3371502134290941, + "grad_norm": 6.710857391357422, + "learning_rate": 9.421589431172565e-05, + "loss": 0.8966, + "step": 4976 + }, + { + "epoch": 0.3372179686970662, + "grad_norm": 10.788898468017578, + "learning_rate": 9.421452529262784e-05, + "loss": 0.906, + "step": 4977 + }, + { + "epoch": 0.3372857239650383, + "grad_norm": 6.601465225219727, + "learning_rate": 9.421315627353002e-05, + "loss": 0.967, + "step": 4978 + }, + { + "epoch": 0.33735347923301034, + "grad_norm": 7.0998406410217285, + "learning_rate": 9.42117872544322e-05, + "loss": 1.0584, + "step": 4979 + }, + { + "epoch": 0.33742123450098244, + "grad_norm": 8.517425537109375, + "learning_rate": 9.421041823533438e-05, + "loss": 0.83, + "step": 4980 + }, + { + "epoch": 0.33748898976895453, + "grad_norm": 7.631463050842285, + "learning_rate": 9.420904921623656e-05, + "loss": 0.7431, + "step": 4981 + }, + { + "epoch": 0.3375567450369266, + "grad_norm": 7.006965637207031, + "learning_rate": 9.420768019713875e-05, + "loss": 1.0081, + "step": 4982 + }, + { + "epoch": 0.3376245003048987, + "grad_norm": 7.085225582122803, + "learning_rate": 9.420631117804093e-05, + "loss": 1.0094, + "step": 4983 + }, + { + "epoch": 0.3376922555728708, + "grad_norm": 8.096879959106445, + "learning_rate": 9.420494215894312e-05, + "loss": 1.0376, + "step": 4984 + }, + { + "epoch": 0.3377600108408429, + "grad_norm": 7.823955535888672, + "learning_rate": 9.420357313984531e-05, + "loss": 0.975, + "step": 4985 + }, + { + "epoch": 0.337827766108815, + "grad_norm": 6.340082168579102, + "learning_rate": 9.420220412074749e-05, + "loss": 0.6133, + "step": 4986 + }, + { + "epoch": 0.337895521376787, + "grad_norm": 9.930416107177734, + "learning_rate": 9.420083510164967e-05, + "loss": 1.0132, + "step": 4987 + }, + { + "epoch": 0.3379632766447591, + "grad_norm": 6.7059221267700195, + "learning_rate": 9.419946608255186e-05, + "loss": 0.7784, + "step": 4988 + }, + { + "epoch": 0.3380310319127312, + "grad_norm": 7.869287967681885, + "learning_rate": 9.419809706345404e-05, + "loss": 0.9114, + "step": 4989 + }, + { + "epoch": 0.3380987871807033, + "grad_norm": 8.061338424682617, + "learning_rate": 9.419672804435622e-05, + "loss": 1.0982, + "step": 4990 + }, + { + "epoch": 0.3381665424486754, + "grad_norm": 6.226239204406738, + "learning_rate": 9.419535902525842e-05, + "loss": 0.8251, + "step": 4991 + }, + { + "epoch": 0.3382342977166475, + "grad_norm": 6.236758232116699, + "learning_rate": 9.41939900061606e-05, + "loss": 1.0367, + "step": 4992 + }, + { + "epoch": 0.3383020529846196, + "grad_norm": 8.880475044250488, + "learning_rate": 9.419262098706278e-05, + "loss": 1.0941, + "step": 4993 + }, + { + "epoch": 0.33836980825259166, + "grad_norm": 7.7189621925354, + "learning_rate": 9.419125196796496e-05, + "loss": 1.2868, + "step": 4994 + }, + { + "epoch": 0.3384375635205637, + "grad_norm": 8.368658065795898, + "learning_rate": 9.418988294886714e-05, + "loss": 1.1495, + "step": 4995 + }, + { + "epoch": 0.3385053187885358, + "grad_norm": 6.596102237701416, + "learning_rate": 9.418851392976933e-05, + "loss": 0.9639, + "step": 4996 + }, + { + "epoch": 0.3385730740565079, + "grad_norm": 6.297356605529785, + "learning_rate": 9.418714491067151e-05, + "loss": 0.8802, + "step": 4997 + }, + { + "epoch": 0.33864082932448, + "grad_norm": 6.869907855987549, + "learning_rate": 9.41857758915737e-05, + "loss": 0.8247, + "step": 4998 + }, + { + "epoch": 0.33870858459245207, + "grad_norm": 8.147067070007324, + "learning_rate": 9.418440687247587e-05, + "loss": 0.7532, + "step": 4999 + }, + { + "epoch": 0.33877633986042416, + "grad_norm": 7.368498802185059, + "learning_rate": 9.418303785337807e-05, + "loss": 0.8697, + "step": 5000 + }, + { + "epoch": 0.33884409512839625, + "grad_norm": 6.064372539520264, + "learning_rate": 9.418166883428025e-05, + "loss": 0.8134, + "step": 5001 + }, + { + "epoch": 0.33891185039636834, + "grad_norm": 7.22601318359375, + "learning_rate": 9.418029981518243e-05, + "loss": 0.8224, + "step": 5002 + }, + { + "epoch": 0.3389796056643404, + "grad_norm": 7.832242012023926, + "learning_rate": 9.417893079608461e-05, + "loss": 0.9114, + "step": 5003 + }, + { + "epoch": 0.33904736093231247, + "grad_norm": 7.05675745010376, + "learning_rate": 9.417756177698679e-05, + "loss": 0.8475, + "step": 5004 + }, + { + "epoch": 0.33911511620028456, + "grad_norm": 6.385340690612793, + "learning_rate": 9.417619275788898e-05, + "loss": 0.8604, + "step": 5005 + }, + { + "epoch": 0.33918287146825665, + "grad_norm": 7.100057601928711, + "learning_rate": 9.417482373879116e-05, + "loss": 0.8418, + "step": 5006 + }, + { + "epoch": 0.33925062673622874, + "grad_norm": 8.983677864074707, + "learning_rate": 9.417345471969334e-05, + "loss": 1.1282, + "step": 5007 + }, + { + "epoch": 0.33931838200420084, + "grad_norm": 8.526215553283691, + "learning_rate": 9.417208570059552e-05, + "loss": 0.8759, + "step": 5008 + }, + { + "epoch": 0.3393861372721729, + "grad_norm": 7.278728485107422, + "learning_rate": 9.41707166814977e-05, + "loss": 1.0722, + "step": 5009 + }, + { + "epoch": 0.339453892540145, + "grad_norm": 9.66884994506836, + "learning_rate": 9.41693476623999e-05, + "loss": 0.8776, + "step": 5010 + }, + { + "epoch": 0.33952164780811706, + "grad_norm": 7.001797199249268, + "learning_rate": 9.416797864330208e-05, + "loss": 0.7863, + "step": 5011 + }, + { + "epoch": 0.33958940307608915, + "grad_norm": 7.707855224609375, + "learning_rate": 9.416660962420426e-05, + "loss": 0.9138, + "step": 5012 + }, + { + "epoch": 0.33965715834406124, + "grad_norm": 7.721848964691162, + "learning_rate": 9.416524060510644e-05, + "loss": 1.1121, + "step": 5013 + }, + { + "epoch": 0.33972491361203333, + "grad_norm": 8.91602611541748, + "learning_rate": 9.416387158600863e-05, + "loss": 0.9152, + "step": 5014 + }, + { + "epoch": 0.3397926688800054, + "grad_norm": 7.332693099975586, + "learning_rate": 9.416250256691081e-05, + "loss": 0.9559, + "step": 5015 + }, + { + "epoch": 0.3398604241479775, + "grad_norm": 7.198998928070068, + "learning_rate": 9.4161133547813e-05, + "loss": 0.9239, + "step": 5016 + }, + { + "epoch": 0.3399281794159496, + "grad_norm": 7.566831588745117, + "learning_rate": 9.415976452871517e-05, + "loss": 0.9281, + "step": 5017 + }, + { + "epoch": 0.3399959346839217, + "grad_norm": 6.832939624786377, + "learning_rate": 9.415839550961736e-05, + "loss": 0.9164, + "step": 5018 + }, + { + "epoch": 0.34006368995189373, + "grad_norm": 7.087849140167236, + "learning_rate": 9.415702649051955e-05, + "loss": 0.9314, + "step": 5019 + }, + { + "epoch": 0.3401314452198658, + "grad_norm": 7.417871952056885, + "learning_rate": 9.415565747142173e-05, + "loss": 0.9687, + "step": 5020 + }, + { + "epoch": 0.3401992004878379, + "grad_norm": 6.9238362312316895, + "learning_rate": 9.415428845232391e-05, + "loss": 1.0757, + "step": 5021 + }, + { + "epoch": 0.34026695575581, + "grad_norm": 7.4261698722839355, + "learning_rate": 9.415291943322609e-05, + "loss": 1.0636, + "step": 5022 + }, + { + "epoch": 0.3403347110237821, + "grad_norm": 6.1964030265808105, + "learning_rate": 9.415155041412828e-05, + "loss": 0.739, + "step": 5023 + }, + { + "epoch": 0.3404024662917542, + "grad_norm": 7.07076358795166, + "learning_rate": 9.415018139503046e-05, + "loss": 0.9435, + "step": 5024 + }, + { + "epoch": 0.3404702215597263, + "grad_norm": 8.069670677185059, + "learning_rate": 9.414881237593264e-05, + "loss": 0.9745, + "step": 5025 + }, + { + "epoch": 0.3405379768276984, + "grad_norm": 7.559169769287109, + "learning_rate": 9.414744335683482e-05, + "loss": 0.9329, + "step": 5026 + }, + { + "epoch": 0.34060573209567047, + "grad_norm": 8.438983917236328, + "learning_rate": 9.4146074337737e-05, + "loss": 1.0608, + "step": 5027 + }, + { + "epoch": 0.3406734873636425, + "grad_norm": 6.522724151611328, + "learning_rate": 9.41447053186392e-05, + "loss": 0.7912, + "step": 5028 + }, + { + "epoch": 0.3407412426316146, + "grad_norm": 8.31757926940918, + "learning_rate": 9.414333629954138e-05, + "loss": 0.7535, + "step": 5029 + }, + { + "epoch": 0.3408089978995867, + "grad_norm": 7.540639877319336, + "learning_rate": 9.414196728044356e-05, + "loss": 0.7239, + "step": 5030 + }, + { + "epoch": 0.3408767531675588, + "grad_norm": 7.016546726226807, + "learning_rate": 9.414059826134575e-05, + "loss": 1.0658, + "step": 5031 + }, + { + "epoch": 0.34094450843553087, + "grad_norm": 5.6253814697265625, + "learning_rate": 9.413922924224793e-05, + "loss": 0.8522, + "step": 5032 + }, + { + "epoch": 0.34101226370350296, + "grad_norm": 7.693328380584717, + "learning_rate": 9.413786022315011e-05, + "loss": 1.1011, + "step": 5033 + }, + { + "epoch": 0.34108001897147505, + "grad_norm": 8.77978229522705, + "learning_rate": 9.413649120405231e-05, + "loss": 1.0617, + "step": 5034 + }, + { + "epoch": 0.34114777423944714, + "grad_norm": 6.854990005493164, + "learning_rate": 9.413512218495449e-05, + "loss": 1.0232, + "step": 5035 + }, + { + "epoch": 0.3412155295074192, + "grad_norm": 8.380729675292969, + "learning_rate": 9.413375316585667e-05, + "loss": 0.7554, + "step": 5036 + }, + { + "epoch": 0.34128328477539127, + "grad_norm": 6.667641639709473, + "learning_rate": 9.413238414675886e-05, + "loss": 0.8992, + "step": 5037 + }, + { + "epoch": 0.34135104004336336, + "grad_norm": 7.5778489112854, + "learning_rate": 9.413101512766104e-05, + "loss": 0.9521, + "step": 5038 + }, + { + "epoch": 0.34141879531133545, + "grad_norm": 10.053051948547363, + "learning_rate": 9.412964610856322e-05, + "loss": 0.9595, + "step": 5039 + }, + { + "epoch": 0.34148655057930755, + "grad_norm": 7.460272789001465, + "learning_rate": 9.41282770894654e-05, + "loss": 0.683, + "step": 5040 + }, + { + "epoch": 0.34155430584727964, + "grad_norm": 7.724341869354248, + "learning_rate": 9.412690807036758e-05, + "loss": 0.9902, + "step": 5041 + }, + { + "epoch": 0.34162206111525173, + "grad_norm": 7.729767799377441, + "learning_rate": 9.412553905126978e-05, + "loss": 0.9249, + "step": 5042 + }, + { + "epoch": 0.3416898163832238, + "grad_norm": 6.388404846191406, + "learning_rate": 9.412417003217196e-05, + "loss": 1.0154, + "step": 5043 + }, + { + "epoch": 0.34175757165119586, + "grad_norm": 6.2206130027771, + "learning_rate": 9.412280101307414e-05, + "loss": 0.8169, + "step": 5044 + }, + { + "epoch": 0.34182532691916795, + "grad_norm": 7.383151054382324, + "learning_rate": 9.412143199397632e-05, + "loss": 1.0476, + "step": 5045 + }, + { + "epoch": 0.34189308218714004, + "grad_norm": 7.733860969543457, + "learning_rate": 9.412006297487851e-05, + "loss": 0.9861, + "step": 5046 + }, + { + "epoch": 0.34196083745511213, + "grad_norm": 7.823349475860596, + "learning_rate": 9.41186939557807e-05, + "loss": 0.8351, + "step": 5047 + }, + { + "epoch": 0.3420285927230842, + "grad_norm": 10.20052433013916, + "learning_rate": 9.411732493668287e-05, + "loss": 1.2573, + "step": 5048 + }, + { + "epoch": 0.3420963479910563, + "grad_norm": 7.878631114959717, + "learning_rate": 9.411595591758505e-05, + "loss": 1.0979, + "step": 5049 + }, + { + "epoch": 0.3421641032590284, + "grad_norm": 7.230095386505127, + "learning_rate": 9.411458689848723e-05, + "loss": 0.9929, + "step": 5050 + }, + { + "epoch": 0.3422318585270005, + "grad_norm": 7.354804992675781, + "learning_rate": 9.411321787938943e-05, + "loss": 1.0552, + "step": 5051 + }, + { + "epoch": 0.34229961379497253, + "grad_norm": 9.846439361572266, + "learning_rate": 9.411184886029161e-05, + "loss": 1.0616, + "step": 5052 + }, + { + "epoch": 0.3423673690629446, + "grad_norm": 6.999122142791748, + "learning_rate": 9.411047984119379e-05, + "loss": 0.9958, + "step": 5053 + }, + { + "epoch": 0.3424351243309167, + "grad_norm": 5.719442367553711, + "learning_rate": 9.410911082209597e-05, + "loss": 0.8239, + "step": 5054 + }, + { + "epoch": 0.3425028795988888, + "grad_norm": 7.650005340576172, + "learning_rate": 9.410774180299816e-05, + "loss": 0.9736, + "step": 5055 + }, + { + "epoch": 0.3425706348668609, + "grad_norm": 7.938320159912109, + "learning_rate": 9.410637278390034e-05, + "loss": 0.7809, + "step": 5056 + }, + { + "epoch": 0.342638390134833, + "grad_norm": 8.319568634033203, + "learning_rate": 9.410500376480252e-05, + "loss": 0.9162, + "step": 5057 + }, + { + "epoch": 0.3427061454028051, + "grad_norm": 8.151114463806152, + "learning_rate": 9.41036347457047e-05, + "loss": 1.1071, + "step": 5058 + }, + { + "epoch": 0.3427739006707772, + "grad_norm": 7.333863735198975, + "learning_rate": 9.410226572660688e-05, + "loss": 0.8425, + "step": 5059 + }, + { + "epoch": 0.3428416559387492, + "grad_norm": 6.848262310028076, + "learning_rate": 9.410089670750908e-05, + "loss": 0.9385, + "step": 5060 + }, + { + "epoch": 0.3429094112067213, + "grad_norm": 7.205636024475098, + "learning_rate": 9.409952768841126e-05, + "loss": 0.8448, + "step": 5061 + }, + { + "epoch": 0.3429771664746934, + "grad_norm": 7.133283615112305, + "learning_rate": 9.409815866931344e-05, + "loss": 0.8306, + "step": 5062 + }, + { + "epoch": 0.3430449217426655, + "grad_norm": 6.693148136138916, + "learning_rate": 9.409678965021562e-05, + "loss": 1.0321, + "step": 5063 + }, + { + "epoch": 0.3431126770106376, + "grad_norm": 7.183022975921631, + "learning_rate": 9.40954206311178e-05, + "loss": 1.0067, + "step": 5064 + }, + { + "epoch": 0.34318043227860967, + "grad_norm": 7.537884712219238, + "learning_rate": 9.409405161202e-05, + "loss": 0.9719, + "step": 5065 + }, + { + "epoch": 0.34324818754658176, + "grad_norm": 7.812978267669678, + "learning_rate": 9.409268259292217e-05, + "loss": 0.9936, + "step": 5066 + }, + { + "epoch": 0.34331594281455385, + "grad_norm": 7.93212366104126, + "learning_rate": 9.409131357382435e-05, + "loss": 1.1685, + "step": 5067 + }, + { + "epoch": 0.3433836980825259, + "grad_norm": 7.298057556152344, + "learning_rate": 9.408994455472653e-05, + "loss": 0.908, + "step": 5068 + }, + { + "epoch": 0.343451453350498, + "grad_norm": 8.108441352844238, + "learning_rate": 9.408857553562873e-05, + "loss": 0.9916, + "step": 5069 + }, + { + "epoch": 0.3435192086184701, + "grad_norm": 7.206883430480957, + "learning_rate": 9.408720651653091e-05, + "loss": 0.9431, + "step": 5070 + }, + { + "epoch": 0.34358696388644216, + "grad_norm": 8.54073715209961, + "learning_rate": 9.408583749743309e-05, + "loss": 0.9156, + "step": 5071 + }, + { + "epoch": 0.34365471915441426, + "grad_norm": 7.274294376373291, + "learning_rate": 9.408446847833527e-05, + "loss": 0.7436, + "step": 5072 + }, + { + "epoch": 0.34372247442238635, + "grad_norm": 7.734363079071045, + "learning_rate": 9.408309945923745e-05, + "loss": 1.054, + "step": 5073 + }, + { + "epoch": 0.34379022969035844, + "grad_norm": 7.952118873596191, + "learning_rate": 9.408173044013964e-05, + "loss": 0.9152, + "step": 5074 + }, + { + "epoch": 0.34385798495833053, + "grad_norm": 7.481127738952637, + "learning_rate": 9.408036142104182e-05, + "loss": 1.0134, + "step": 5075 + }, + { + "epoch": 0.34392574022630257, + "grad_norm": 8.52331256866455, + "learning_rate": 9.4078992401944e-05, + "loss": 0.9724, + "step": 5076 + }, + { + "epoch": 0.34399349549427466, + "grad_norm": 7.454250335693359, + "learning_rate": 9.40776233828462e-05, + "loss": 0.9535, + "step": 5077 + }, + { + "epoch": 0.34406125076224675, + "grad_norm": 7.192932605743408, + "learning_rate": 9.407625436374838e-05, + "loss": 0.8923, + "step": 5078 + }, + { + "epoch": 0.34412900603021884, + "grad_norm": 8.29765796661377, + "learning_rate": 9.407488534465056e-05, + "loss": 1.0374, + "step": 5079 + }, + { + "epoch": 0.34419676129819093, + "grad_norm": 7.1159820556640625, + "learning_rate": 9.407351632555275e-05, + "loss": 0.7876, + "step": 5080 + }, + { + "epoch": 0.344264516566163, + "grad_norm": 7.157162189483643, + "learning_rate": 9.407214730645493e-05, + "loss": 0.8747, + "step": 5081 + }, + { + "epoch": 0.3443322718341351, + "grad_norm": 6.686028003692627, + "learning_rate": 9.407077828735711e-05, + "loss": 0.8189, + "step": 5082 + }, + { + "epoch": 0.3444000271021072, + "grad_norm": 8.957246780395508, + "learning_rate": 9.406940926825931e-05, + "loss": 1.0617, + "step": 5083 + }, + { + "epoch": 0.3444677823700793, + "grad_norm": 7.599720001220703, + "learning_rate": 9.406804024916149e-05, + "loss": 1.0455, + "step": 5084 + }, + { + "epoch": 0.34453553763805134, + "grad_norm": 5.757546424865723, + "learning_rate": 9.406667123006367e-05, + "loss": 0.7654, + "step": 5085 + }, + { + "epoch": 0.34460329290602343, + "grad_norm": 6.592649936676025, + "learning_rate": 9.406530221096585e-05, + "loss": 0.9713, + "step": 5086 + }, + { + "epoch": 0.3446710481739955, + "grad_norm": 7.119808673858643, + "learning_rate": 9.406393319186803e-05, + "loss": 0.7413, + "step": 5087 + }, + { + "epoch": 0.3447388034419676, + "grad_norm": 6.288692951202393, + "learning_rate": 9.406256417277022e-05, + "loss": 0.9479, + "step": 5088 + }, + { + "epoch": 0.3448065587099397, + "grad_norm": 6.267573356628418, + "learning_rate": 9.40611951536724e-05, + "loss": 0.9745, + "step": 5089 + }, + { + "epoch": 0.3448743139779118, + "grad_norm": 7.639352798461914, + "learning_rate": 9.405982613457458e-05, + "loss": 0.9655, + "step": 5090 + }, + { + "epoch": 0.3449420692458839, + "grad_norm": 6.32698917388916, + "learning_rate": 9.405845711547676e-05, + "loss": 0.789, + "step": 5091 + }, + { + "epoch": 0.345009824513856, + "grad_norm": 8.769354820251465, + "learning_rate": 9.405708809637896e-05, + "loss": 1.1124, + "step": 5092 + }, + { + "epoch": 0.345077579781828, + "grad_norm": 7.179650783538818, + "learning_rate": 9.405571907728114e-05, + "loss": 0.7939, + "step": 5093 + }, + { + "epoch": 0.3451453350498001, + "grad_norm": 10.112159729003906, + "learning_rate": 9.405435005818332e-05, + "loss": 0.943, + "step": 5094 + }, + { + "epoch": 0.3452130903177722, + "grad_norm": 6.925206661224365, + "learning_rate": 9.40529810390855e-05, + "loss": 1.0444, + "step": 5095 + }, + { + "epoch": 0.3452808455857443, + "grad_norm": 7.8664398193359375, + "learning_rate": 9.405161201998768e-05, + "loss": 0.9643, + "step": 5096 + }, + { + "epoch": 0.3453486008537164, + "grad_norm": 7.060378551483154, + "learning_rate": 9.405024300088987e-05, + "loss": 0.9043, + "step": 5097 + }, + { + "epoch": 0.3454163561216885, + "grad_norm": 8.105093002319336, + "learning_rate": 9.404887398179205e-05, + "loss": 1.1106, + "step": 5098 + }, + { + "epoch": 0.34548411138966056, + "grad_norm": 7.8055739402771, + "learning_rate": 9.404750496269423e-05, + "loss": 0.9638, + "step": 5099 + }, + { + "epoch": 0.34555186665763266, + "grad_norm": 8.225363731384277, + "learning_rate": 9.404613594359641e-05, + "loss": 0.7878, + "step": 5100 + }, + { + "epoch": 0.3456196219256047, + "grad_norm": 7.809800148010254, + "learning_rate": 9.404476692449861e-05, + "loss": 0.8241, + "step": 5101 + }, + { + "epoch": 0.3456873771935768, + "grad_norm": 7.520929336547852, + "learning_rate": 9.404339790540079e-05, + "loss": 1.0689, + "step": 5102 + }, + { + "epoch": 0.3457551324615489, + "grad_norm": 5.996449947357178, + "learning_rate": 9.404202888630297e-05, + "loss": 0.9163, + "step": 5103 + }, + { + "epoch": 0.34582288772952097, + "grad_norm": 7.2040114402771, + "learning_rate": 9.404065986720515e-05, + "loss": 1.0615, + "step": 5104 + }, + { + "epoch": 0.34589064299749306, + "grad_norm": 7.268972873687744, + "learning_rate": 9.403929084810733e-05, + "loss": 0.8785, + "step": 5105 + }, + { + "epoch": 0.34595839826546515, + "grad_norm": 9.244166374206543, + "learning_rate": 9.403792182900952e-05, + "loss": 0.8714, + "step": 5106 + }, + { + "epoch": 0.34602615353343724, + "grad_norm": 6.766915321350098, + "learning_rate": 9.40365528099117e-05, + "loss": 0.8075, + "step": 5107 + }, + { + "epoch": 0.34609390880140933, + "grad_norm": 8.137444496154785, + "learning_rate": 9.403518379081388e-05, + "loss": 1.0687, + "step": 5108 + }, + { + "epoch": 0.34616166406938137, + "grad_norm": 7.488312244415283, + "learning_rate": 9.403381477171606e-05, + "loss": 0.8056, + "step": 5109 + }, + { + "epoch": 0.34622941933735346, + "grad_norm": 6.555777549743652, + "learning_rate": 9.403244575261826e-05, + "loss": 0.8761, + "step": 5110 + }, + { + "epoch": 0.34629717460532555, + "grad_norm": 11.631979942321777, + "learning_rate": 9.403107673352044e-05, + "loss": 0.8139, + "step": 5111 + }, + { + "epoch": 0.34636492987329764, + "grad_norm": 5.973453998565674, + "learning_rate": 9.402970771442262e-05, + "loss": 0.8816, + "step": 5112 + }, + { + "epoch": 0.34643268514126974, + "grad_norm": 5.817539215087891, + "learning_rate": 9.40283386953248e-05, + "loss": 0.6034, + "step": 5113 + }, + { + "epoch": 0.3465004404092418, + "grad_norm": 8.215511322021484, + "learning_rate": 9.402696967622698e-05, + "loss": 0.8876, + "step": 5114 + }, + { + "epoch": 0.3465681956772139, + "grad_norm": 5.848570346832275, + "learning_rate": 9.402560065712917e-05, + "loss": 0.7093, + "step": 5115 + }, + { + "epoch": 0.346635950945186, + "grad_norm": 7.996893405914307, + "learning_rate": 9.402423163803135e-05, + "loss": 0.8865, + "step": 5116 + }, + { + "epoch": 0.34670370621315805, + "grad_norm": 7.348632335662842, + "learning_rate": 9.402286261893353e-05, + "loss": 0.8042, + "step": 5117 + }, + { + "epoch": 0.34677146148113014, + "grad_norm": 6.820111274719238, + "learning_rate": 9.402149359983571e-05, + "loss": 0.8744, + "step": 5118 + }, + { + "epoch": 0.34683921674910223, + "grad_norm": 7.354914665222168, + "learning_rate": 9.40201245807379e-05, + "loss": 0.892, + "step": 5119 + }, + { + "epoch": 0.3469069720170743, + "grad_norm": 7.751887321472168, + "learning_rate": 9.401875556164009e-05, + "loss": 0.8752, + "step": 5120 + }, + { + "epoch": 0.3469747272850464, + "grad_norm": 9.630827903747559, + "learning_rate": 9.401738654254227e-05, + "loss": 0.999, + "step": 5121 + }, + { + "epoch": 0.3470424825530185, + "grad_norm": 9.546615600585938, + "learning_rate": 9.401601752344445e-05, + "loss": 1.341, + "step": 5122 + }, + { + "epoch": 0.3471102378209906, + "grad_norm": 7.031918048858643, + "learning_rate": 9.401464850434663e-05, + "loss": 1.1371, + "step": 5123 + }, + { + "epoch": 0.3471779930889627, + "grad_norm": 9.030802726745605, + "learning_rate": 9.401327948524882e-05, + "loss": 0.9883, + "step": 5124 + }, + { + "epoch": 0.3472457483569347, + "grad_norm": 6.786712646484375, + "learning_rate": 9.4011910466151e-05, + "loss": 0.8692, + "step": 5125 + }, + { + "epoch": 0.3473135036249068, + "grad_norm": 7.327836990356445, + "learning_rate": 9.401054144705318e-05, + "loss": 0.8157, + "step": 5126 + }, + { + "epoch": 0.3473812588928789, + "grad_norm": 6.41863489151001, + "learning_rate": 9.400917242795538e-05, + "loss": 0.8565, + "step": 5127 + }, + { + "epoch": 0.347449014160851, + "grad_norm": 5.737055778503418, + "learning_rate": 9.400780340885756e-05, + "loss": 0.7385, + "step": 5128 + }, + { + "epoch": 0.3475167694288231, + "grad_norm": 6.321473121643066, + "learning_rate": 9.400643438975974e-05, + "loss": 0.6459, + "step": 5129 + }, + { + "epoch": 0.3475845246967952, + "grad_norm": 9.07598876953125, + "learning_rate": 9.400506537066193e-05, + "loss": 1.3719, + "step": 5130 + }, + { + "epoch": 0.3476522799647673, + "grad_norm": 6.3348388671875, + "learning_rate": 9.400369635156411e-05, + "loss": 0.903, + "step": 5131 + }, + { + "epoch": 0.34772003523273937, + "grad_norm": 7.836405277252197, + "learning_rate": 9.40023273324663e-05, + "loss": 0.8854, + "step": 5132 + }, + { + "epoch": 0.34778779050071146, + "grad_norm": 7.712037086486816, + "learning_rate": 9.400095831336849e-05, + "loss": 0.8756, + "step": 5133 + }, + { + "epoch": 0.3478555457686835, + "grad_norm": 7.445090293884277, + "learning_rate": 9.399958929427067e-05, + "loss": 1.0776, + "step": 5134 + }, + { + "epoch": 0.3479233010366556, + "grad_norm": 6.7496724128723145, + "learning_rate": 9.399822027517285e-05, + "loss": 0.9425, + "step": 5135 + }, + { + "epoch": 0.3479910563046277, + "grad_norm": 7.703073501586914, + "learning_rate": 9.399685125607503e-05, + "loss": 0.9217, + "step": 5136 + }, + { + "epoch": 0.34805881157259977, + "grad_norm": 5.972830772399902, + "learning_rate": 9.399548223697721e-05, + "loss": 0.5922, + "step": 5137 + }, + { + "epoch": 0.34812656684057186, + "grad_norm": 7.165718078613281, + "learning_rate": 9.39941132178794e-05, + "loss": 0.9624, + "step": 5138 + }, + { + "epoch": 0.34819432210854395, + "grad_norm": 8.589313507080078, + "learning_rate": 9.399274419878158e-05, + "loss": 1.0485, + "step": 5139 + }, + { + "epoch": 0.34826207737651604, + "grad_norm": 6.503042697906494, + "learning_rate": 9.399137517968376e-05, + "loss": 0.7833, + "step": 5140 + }, + { + "epoch": 0.34832983264448814, + "grad_norm": 9.673978805541992, + "learning_rate": 9.399000616058594e-05, + "loss": 1.2266, + "step": 5141 + }, + { + "epoch": 0.34839758791246017, + "grad_norm": 6.86154842376709, + "learning_rate": 9.398863714148812e-05, + "loss": 0.8709, + "step": 5142 + }, + { + "epoch": 0.34846534318043226, + "grad_norm": 8.111627578735352, + "learning_rate": 9.398726812239032e-05, + "loss": 0.9657, + "step": 5143 + }, + { + "epoch": 0.34853309844840435, + "grad_norm": 7.725754737854004, + "learning_rate": 9.39858991032925e-05, + "loss": 0.9551, + "step": 5144 + }, + { + "epoch": 0.34860085371637645, + "grad_norm": 8.093070030212402, + "learning_rate": 9.398453008419468e-05, + "loss": 0.9814, + "step": 5145 + }, + { + "epoch": 0.34866860898434854, + "grad_norm": 8.375652313232422, + "learning_rate": 9.398316106509686e-05, + "loss": 0.9232, + "step": 5146 + }, + { + "epoch": 0.34873636425232063, + "grad_norm": 7.012859344482422, + "learning_rate": 9.398179204599905e-05, + "loss": 0.8322, + "step": 5147 + }, + { + "epoch": 0.3488041195202927, + "grad_norm": 8.437539100646973, + "learning_rate": 9.398042302690123e-05, + "loss": 0.852, + "step": 5148 + }, + { + "epoch": 0.3488718747882648, + "grad_norm": 9.844721794128418, + "learning_rate": 9.397905400780341e-05, + "loss": 0.8601, + "step": 5149 + }, + { + "epoch": 0.34893963005623685, + "grad_norm": 7.359288215637207, + "learning_rate": 9.39776849887056e-05, + "loss": 0.9313, + "step": 5150 + }, + { + "epoch": 0.34900738532420894, + "grad_norm": 7.528818607330322, + "learning_rate": 9.397631596960777e-05, + "loss": 0.923, + "step": 5151 + }, + { + "epoch": 0.34907514059218103, + "grad_norm": 7.8577399253845215, + "learning_rate": 9.397494695050997e-05, + "loss": 0.9569, + "step": 5152 + }, + { + "epoch": 0.3491428958601531, + "grad_norm": 6.960932731628418, + "learning_rate": 9.397357793141215e-05, + "loss": 0.9025, + "step": 5153 + }, + { + "epoch": 0.3492106511281252, + "grad_norm": 6.392679691314697, + "learning_rate": 9.397220891231433e-05, + "loss": 0.7948, + "step": 5154 + }, + { + "epoch": 0.3492784063960973, + "grad_norm": 8.22850227355957, + "learning_rate": 9.397083989321651e-05, + "loss": 0.9932, + "step": 5155 + }, + { + "epoch": 0.3493461616640694, + "grad_norm": 6.1851277351379395, + "learning_rate": 9.39694708741187e-05, + "loss": 0.776, + "step": 5156 + }, + { + "epoch": 0.3494139169320415, + "grad_norm": 9.058121681213379, + "learning_rate": 9.396810185502088e-05, + "loss": 1.1036, + "step": 5157 + }, + { + "epoch": 0.3494816722000135, + "grad_norm": 7.11410665512085, + "learning_rate": 9.396673283592306e-05, + "loss": 0.8862, + "step": 5158 + }, + { + "epoch": 0.3495494274679856, + "grad_norm": 7.148082256317139, + "learning_rate": 9.396536381682524e-05, + "loss": 0.9796, + "step": 5159 + }, + { + "epoch": 0.3496171827359577, + "grad_norm": 7.258500576019287, + "learning_rate": 9.396399479772742e-05, + "loss": 0.9617, + "step": 5160 + }, + { + "epoch": 0.3496849380039298, + "grad_norm": 5.1490044593811035, + "learning_rate": 9.396262577862962e-05, + "loss": 0.6844, + "step": 5161 + }, + { + "epoch": 0.3497526932719019, + "grad_norm": 6.73121452331543, + "learning_rate": 9.39612567595318e-05, + "loss": 1.0669, + "step": 5162 + }, + { + "epoch": 0.349820448539874, + "grad_norm": 7.891530990600586, + "learning_rate": 9.395988774043398e-05, + "loss": 1.0439, + "step": 5163 + }, + { + "epoch": 0.3498882038078461, + "grad_norm": 7.256270885467529, + "learning_rate": 9.395851872133616e-05, + "loss": 0.9682, + "step": 5164 + }, + { + "epoch": 0.34995595907581817, + "grad_norm": 7.153442859649658, + "learning_rate": 9.395714970223835e-05, + "loss": 0.913, + "step": 5165 + }, + { + "epoch": 0.3500237143437902, + "grad_norm": 8.722851753234863, + "learning_rate": 9.395578068314053e-05, + "loss": 0.8749, + "step": 5166 + }, + { + "epoch": 0.3500914696117623, + "grad_norm": 6.8031005859375, + "learning_rate": 9.395441166404271e-05, + "loss": 0.8755, + "step": 5167 + }, + { + "epoch": 0.3501592248797344, + "grad_norm": 5.449770450592041, + "learning_rate": 9.39530426449449e-05, + "loss": 0.8813, + "step": 5168 + }, + { + "epoch": 0.3502269801477065, + "grad_norm": 7.74420690536499, + "learning_rate": 9.395167362584707e-05, + "loss": 0.8361, + "step": 5169 + }, + { + "epoch": 0.35029473541567857, + "grad_norm": 7.8701677322387695, + "learning_rate": 9.395030460674927e-05, + "loss": 0.9294, + "step": 5170 + }, + { + "epoch": 0.35036249068365066, + "grad_norm": 5.897029876708984, + "learning_rate": 9.394893558765145e-05, + "loss": 0.7739, + "step": 5171 + }, + { + "epoch": 0.35043024595162275, + "grad_norm": 6.7868781089782715, + "learning_rate": 9.394756656855363e-05, + "loss": 0.7252, + "step": 5172 + }, + { + "epoch": 0.35049800121959485, + "grad_norm": 10.289764404296875, + "learning_rate": 9.394619754945582e-05, + "loss": 1.0943, + "step": 5173 + }, + { + "epoch": 0.3505657564875669, + "grad_norm": 7.676051616668701, + "learning_rate": 9.3944828530358e-05, + "loss": 0.881, + "step": 5174 + }, + { + "epoch": 0.350633511755539, + "grad_norm": 7.702653408050537, + "learning_rate": 9.394345951126018e-05, + "loss": 1.0346, + "step": 5175 + }, + { + "epoch": 0.35070126702351107, + "grad_norm": 7.525374889373779, + "learning_rate": 9.394209049216238e-05, + "loss": 0.9955, + "step": 5176 + }, + { + "epoch": 0.35076902229148316, + "grad_norm": 7.942657470703125, + "learning_rate": 9.394072147306456e-05, + "loss": 0.9715, + "step": 5177 + }, + { + "epoch": 0.35083677755945525, + "grad_norm": 9.328110694885254, + "learning_rate": 9.393935245396674e-05, + "loss": 1.0753, + "step": 5178 + }, + { + "epoch": 0.35090453282742734, + "grad_norm": 8.149508476257324, + "learning_rate": 9.393798343486893e-05, + "loss": 0.9818, + "step": 5179 + }, + { + "epoch": 0.35097228809539943, + "grad_norm": 6.882593154907227, + "learning_rate": 9.393661441577111e-05, + "loss": 1.105, + "step": 5180 + }, + { + "epoch": 0.3510400433633715, + "grad_norm": 7.821188926696777, + "learning_rate": 9.393524539667329e-05, + "loss": 0.8106, + "step": 5181 + }, + { + "epoch": 0.35110779863134356, + "grad_norm": 7.235447406768799, + "learning_rate": 9.393387637757547e-05, + "loss": 0.9304, + "step": 5182 + }, + { + "epoch": 0.35117555389931565, + "grad_norm": 6.859196662902832, + "learning_rate": 9.393250735847765e-05, + "loss": 0.7797, + "step": 5183 + }, + { + "epoch": 0.35124330916728774, + "grad_norm": 8.33879566192627, + "learning_rate": 9.393113833937985e-05, + "loss": 1.0075, + "step": 5184 + }, + { + "epoch": 0.35131106443525983, + "grad_norm": 6.733922004699707, + "learning_rate": 9.392976932028203e-05, + "loss": 0.7304, + "step": 5185 + }, + { + "epoch": 0.3513788197032319, + "grad_norm": 8.839714050292969, + "learning_rate": 9.392840030118421e-05, + "loss": 0.8852, + "step": 5186 + }, + { + "epoch": 0.351446574971204, + "grad_norm": 9.954099655151367, + "learning_rate": 9.392703128208639e-05, + "loss": 1.1631, + "step": 5187 + }, + { + "epoch": 0.3515143302391761, + "grad_norm": 8.042675971984863, + "learning_rate": 9.392566226298858e-05, + "loss": 0.8741, + "step": 5188 + }, + { + "epoch": 0.3515820855071482, + "grad_norm": 6.661304950714111, + "learning_rate": 9.392429324389076e-05, + "loss": 0.8879, + "step": 5189 + }, + { + "epoch": 0.3516498407751203, + "grad_norm": 6.568962097167969, + "learning_rate": 9.392292422479294e-05, + "loss": 0.6823, + "step": 5190 + }, + { + "epoch": 0.35171759604309233, + "grad_norm": 6.836343288421631, + "learning_rate": 9.392155520569512e-05, + "loss": 0.8174, + "step": 5191 + }, + { + "epoch": 0.3517853513110644, + "grad_norm": 7.555830478668213, + "learning_rate": 9.39201861865973e-05, + "loss": 0.803, + "step": 5192 + }, + { + "epoch": 0.3518531065790365, + "grad_norm": 7.264036178588867, + "learning_rate": 9.39188171674995e-05, + "loss": 1.0786, + "step": 5193 + }, + { + "epoch": 0.3519208618470086, + "grad_norm": 5.6876420974731445, + "learning_rate": 9.391744814840168e-05, + "loss": 0.7553, + "step": 5194 + }, + { + "epoch": 0.3519886171149807, + "grad_norm": 8.890271186828613, + "learning_rate": 9.391607912930386e-05, + "loss": 1.0685, + "step": 5195 + }, + { + "epoch": 0.3520563723829528, + "grad_norm": 6.010801315307617, + "learning_rate": 9.391471011020604e-05, + "loss": 0.7915, + "step": 5196 + }, + { + "epoch": 0.3521241276509249, + "grad_norm": 8.496731758117676, + "learning_rate": 9.391334109110822e-05, + "loss": 1.2103, + "step": 5197 + }, + { + "epoch": 0.35219188291889697, + "grad_norm": 8.196046829223633, + "learning_rate": 9.391197207201041e-05, + "loss": 1.0173, + "step": 5198 + }, + { + "epoch": 0.352259638186869, + "grad_norm": 8.809300422668457, + "learning_rate": 9.391060305291259e-05, + "loss": 0.906, + "step": 5199 + }, + { + "epoch": 0.3523273934548411, + "grad_norm": 8.437650680541992, + "learning_rate": 9.390923403381477e-05, + "loss": 0.8865, + "step": 5200 + }, + { + "epoch": 0.3523951487228132, + "grad_norm": 8.659364700317383, + "learning_rate": 9.390786501471695e-05, + "loss": 0.8878, + "step": 5201 + }, + { + "epoch": 0.3524629039907853, + "grad_norm": 6.658010482788086, + "learning_rate": 9.390649599561915e-05, + "loss": 0.9141, + "step": 5202 + }, + { + "epoch": 0.3525306592587574, + "grad_norm": 6.702786445617676, + "learning_rate": 9.390512697652133e-05, + "loss": 0.8193, + "step": 5203 + }, + { + "epoch": 0.35259841452672946, + "grad_norm": 8.0852632522583, + "learning_rate": 9.390375795742351e-05, + "loss": 0.9655, + "step": 5204 + }, + { + "epoch": 0.35266616979470156, + "grad_norm": 6.598480224609375, + "learning_rate": 9.390238893832569e-05, + "loss": 0.8858, + "step": 5205 + }, + { + "epoch": 0.35273392506267365, + "grad_norm": 8.510173797607422, + "learning_rate": 9.390101991922787e-05, + "loss": 1.0543, + "step": 5206 + }, + { + "epoch": 0.3528016803306457, + "grad_norm": 7.498827934265137, + "learning_rate": 9.389965090013006e-05, + "loss": 1.138, + "step": 5207 + }, + { + "epoch": 0.3528694355986178, + "grad_norm": 6.370652675628662, + "learning_rate": 9.389828188103224e-05, + "loss": 0.996, + "step": 5208 + }, + { + "epoch": 0.35293719086658987, + "grad_norm": 6.830577373504639, + "learning_rate": 9.389691286193442e-05, + "loss": 0.7562, + "step": 5209 + }, + { + "epoch": 0.35300494613456196, + "grad_norm": 7.01290225982666, + "learning_rate": 9.38955438428366e-05, + "loss": 0.8141, + "step": 5210 + }, + { + "epoch": 0.35307270140253405, + "grad_norm": 6.957060813903809, + "learning_rate": 9.38941748237388e-05, + "loss": 1.1923, + "step": 5211 + }, + { + "epoch": 0.35314045667050614, + "grad_norm": 5.929959297180176, + "learning_rate": 9.389280580464098e-05, + "loss": 0.7928, + "step": 5212 + }, + { + "epoch": 0.35320821193847823, + "grad_norm": 8.654718399047852, + "learning_rate": 9.389143678554316e-05, + "loss": 1.0896, + "step": 5213 + }, + { + "epoch": 0.3532759672064503, + "grad_norm": 5.981748580932617, + "learning_rate": 9.389006776644534e-05, + "loss": 0.9777, + "step": 5214 + }, + { + "epoch": 0.35334372247442236, + "grad_norm": 6.251374244689941, + "learning_rate": 9.388869874734752e-05, + "loss": 0.8562, + "step": 5215 + }, + { + "epoch": 0.35341147774239445, + "grad_norm": 7.049698352813721, + "learning_rate": 9.388732972824971e-05, + "loss": 0.8297, + "step": 5216 + }, + { + "epoch": 0.35347923301036654, + "grad_norm": 8.04577350616455, + "learning_rate": 9.38859607091519e-05, + "loss": 1.2035, + "step": 5217 + }, + { + "epoch": 0.35354698827833864, + "grad_norm": 6.800088405609131, + "learning_rate": 9.388459169005407e-05, + "loss": 0.9551, + "step": 5218 + }, + { + "epoch": 0.35361474354631073, + "grad_norm": 7.368710517883301, + "learning_rate": 9.388322267095627e-05, + "loss": 0.8174, + "step": 5219 + }, + { + "epoch": 0.3536824988142828, + "grad_norm": 7.573550701141357, + "learning_rate": 9.388185365185845e-05, + "loss": 1.0921, + "step": 5220 + }, + { + "epoch": 0.3537502540822549, + "grad_norm": 8.282757759094238, + "learning_rate": 9.388048463276063e-05, + "loss": 0.9124, + "step": 5221 + }, + { + "epoch": 0.353818009350227, + "grad_norm": 8.21078872680664, + "learning_rate": 9.387911561366282e-05, + "loss": 1.0818, + "step": 5222 + }, + { + "epoch": 0.35388576461819904, + "grad_norm": 7.401734352111816, + "learning_rate": 9.3877746594565e-05, + "loss": 0.9906, + "step": 5223 + }, + { + "epoch": 0.35395351988617113, + "grad_norm": 8.078129768371582, + "learning_rate": 9.387637757546718e-05, + "loss": 1.1233, + "step": 5224 + }, + { + "epoch": 0.3540212751541432, + "grad_norm": 7.231998920440674, + "learning_rate": 9.387500855636938e-05, + "loss": 0.8412, + "step": 5225 + }, + { + "epoch": 0.3540890304221153, + "grad_norm": 8.709794044494629, + "learning_rate": 9.387363953727156e-05, + "loss": 0.8346, + "step": 5226 + }, + { + "epoch": 0.3541567856900874, + "grad_norm": 8.45758056640625, + "learning_rate": 9.387227051817374e-05, + "loss": 1.0201, + "step": 5227 + }, + { + "epoch": 0.3542245409580595, + "grad_norm": 5.156687259674072, + "learning_rate": 9.387090149907592e-05, + "loss": 0.8029, + "step": 5228 + }, + { + "epoch": 0.3542922962260316, + "grad_norm": 6.2013163566589355, + "learning_rate": 9.38695324799781e-05, + "loss": 0.8984, + "step": 5229 + }, + { + "epoch": 0.3543600514940037, + "grad_norm": 7.414056301116943, + "learning_rate": 9.386816346088029e-05, + "loss": 0.8606, + "step": 5230 + }, + { + "epoch": 0.3544278067619757, + "grad_norm": 8.167763710021973, + "learning_rate": 9.386679444178247e-05, + "loss": 0.973, + "step": 5231 + }, + { + "epoch": 0.3544955620299478, + "grad_norm": 7.290304183959961, + "learning_rate": 9.386542542268465e-05, + "loss": 0.8988, + "step": 5232 + }, + { + "epoch": 0.3545633172979199, + "grad_norm": 7.181061744689941, + "learning_rate": 9.386405640358683e-05, + "loss": 0.8689, + "step": 5233 + }, + { + "epoch": 0.354631072565892, + "grad_norm": 8.87985610961914, + "learning_rate": 9.386268738448903e-05, + "loss": 1.0076, + "step": 5234 + }, + { + "epoch": 0.3546988278338641, + "grad_norm": 6.205500602722168, + "learning_rate": 9.386131836539121e-05, + "loss": 0.8101, + "step": 5235 + }, + { + "epoch": 0.3547665831018362, + "grad_norm": 7.425187587738037, + "learning_rate": 9.385994934629339e-05, + "loss": 0.9122, + "step": 5236 + }, + { + "epoch": 0.35483433836980827, + "grad_norm": 9.541454315185547, + "learning_rate": 9.385858032719557e-05, + "loss": 0.9868, + "step": 5237 + }, + { + "epoch": 0.35490209363778036, + "grad_norm": 6.853835582733154, + "learning_rate": 9.385721130809775e-05, + "loss": 0.9506, + "step": 5238 + }, + { + "epoch": 0.35496984890575245, + "grad_norm": 9.07067584991455, + "learning_rate": 9.385584228899994e-05, + "loss": 0.9146, + "step": 5239 + }, + { + "epoch": 0.3550376041737245, + "grad_norm": 7.5352373123168945, + "learning_rate": 9.385447326990212e-05, + "loss": 1.0089, + "step": 5240 + }, + { + "epoch": 0.3551053594416966, + "grad_norm": 5.626684188842773, + "learning_rate": 9.38531042508043e-05, + "loss": 0.7057, + "step": 5241 + }, + { + "epoch": 0.35517311470966867, + "grad_norm": 6.1156768798828125, + "learning_rate": 9.385173523170648e-05, + "loss": 0.808, + "step": 5242 + }, + { + "epoch": 0.35524086997764076, + "grad_norm": 7.227247714996338, + "learning_rate": 9.385036621260868e-05, + "loss": 0.9477, + "step": 5243 + }, + { + "epoch": 0.35530862524561285, + "grad_norm": 7.034331798553467, + "learning_rate": 9.384899719351086e-05, + "loss": 0.7929, + "step": 5244 + }, + { + "epoch": 0.35537638051358494, + "grad_norm": 5.943993091583252, + "learning_rate": 9.384762817441304e-05, + "loss": 0.8886, + "step": 5245 + }, + { + "epoch": 0.35544413578155704, + "grad_norm": 7.859194278717041, + "learning_rate": 9.384625915531522e-05, + "loss": 1.0028, + "step": 5246 + }, + { + "epoch": 0.3555118910495291, + "grad_norm": 6.8557448387146, + "learning_rate": 9.38448901362174e-05, + "loss": 0.8946, + "step": 5247 + }, + { + "epoch": 0.35557964631750116, + "grad_norm": 6.733648777008057, + "learning_rate": 9.384352111711959e-05, + "loss": 0.9186, + "step": 5248 + }, + { + "epoch": 0.35564740158547326, + "grad_norm": 5.929673194885254, + "learning_rate": 9.384215209802177e-05, + "loss": 0.8132, + "step": 5249 + }, + { + "epoch": 0.35571515685344535, + "grad_norm": 6.680620193481445, + "learning_rate": 9.384078307892395e-05, + "loss": 0.9197, + "step": 5250 + }, + { + "epoch": 0.35578291212141744, + "grad_norm": 8.043455123901367, + "learning_rate": 9.383941405982613e-05, + "loss": 1.0841, + "step": 5251 + }, + { + "epoch": 0.35585066738938953, + "grad_norm": 9.225475311279297, + "learning_rate": 9.383804504072831e-05, + "loss": 1.3691, + "step": 5252 + }, + { + "epoch": 0.3559184226573616, + "grad_norm": 8.588724136352539, + "learning_rate": 9.383667602163051e-05, + "loss": 1.1282, + "step": 5253 + }, + { + "epoch": 0.3559861779253337, + "grad_norm": 8.21908950805664, + "learning_rate": 9.383530700253269e-05, + "loss": 0.7606, + "step": 5254 + }, + { + "epoch": 0.3560539331933058, + "grad_norm": 6.107995510101318, + "learning_rate": 9.383393798343487e-05, + "loss": 1.1818, + "step": 5255 + }, + { + "epoch": 0.35612168846127784, + "grad_norm": 6.612033367156982, + "learning_rate": 9.383256896433705e-05, + "loss": 0.8892, + "step": 5256 + }, + { + "epoch": 0.35618944372924993, + "grad_norm": 6.935641288757324, + "learning_rate": 9.383119994523924e-05, + "loss": 0.8565, + "step": 5257 + }, + { + "epoch": 0.356257198997222, + "grad_norm": 7.328373908996582, + "learning_rate": 9.382983092614142e-05, + "loss": 0.9409, + "step": 5258 + }, + { + "epoch": 0.3563249542651941, + "grad_norm": 7.016412734985352, + "learning_rate": 9.38284619070436e-05, + "loss": 0.8768, + "step": 5259 + }, + { + "epoch": 0.3563927095331662, + "grad_norm": 7.122356414794922, + "learning_rate": 9.382709288794578e-05, + "loss": 0.9048, + "step": 5260 + }, + { + "epoch": 0.3564604648011383, + "grad_norm": 7.590730667114258, + "learning_rate": 9.382572386884796e-05, + "loss": 0.8154, + "step": 5261 + }, + { + "epoch": 0.3565282200691104, + "grad_norm": 5.5446858406066895, + "learning_rate": 9.382435484975016e-05, + "loss": 0.8783, + "step": 5262 + }, + { + "epoch": 0.3565959753370825, + "grad_norm": 7.153842449188232, + "learning_rate": 9.382298583065234e-05, + "loss": 1.2412, + "step": 5263 + }, + { + "epoch": 0.3566637306050545, + "grad_norm": 7.115203857421875, + "learning_rate": 9.382161681155452e-05, + "loss": 1.0302, + "step": 5264 + }, + { + "epoch": 0.3567314858730266, + "grad_norm": 5.7724833488464355, + "learning_rate": 9.382024779245671e-05, + "loss": 0.5839, + "step": 5265 + }, + { + "epoch": 0.3567992411409987, + "grad_norm": 7.793944358825684, + "learning_rate": 9.381887877335889e-05, + "loss": 1.0893, + "step": 5266 + }, + { + "epoch": 0.3568669964089708, + "grad_norm": 6.561145782470703, + "learning_rate": 9.381750975426107e-05, + "loss": 0.911, + "step": 5267 + }, + { + "epoch": 0.3569347516769429, + "grad_norm": 6.053153038024902, + "learning_rate": 9.381614073516327e-05, + "loss": 0.9239, + "step": 5268 + }, + { + "epoch": 0.357002506944915, + "grad_norm": 7.162718772888184, + "learning_rate": 9.381477171606545e-05, + "loss": 0.7193, + "step": 5269 + }, + { + "epoch": 0.35707026221288707, + "grad_norm": 7.634250640869141, + "learning_rate": 9.381340269696763e-05, + "loss": 0.8826, + "step": 5270 + }, + { + "epoch": 0.35713801748085916, + "grad_norm": 6.502168655395508, + "learning_rate": 9.381203367786982e-05, + "loss": 0.8278, + "step": 5271 + }, + { + "epoch": 0.3572057727488312, + "grad_norm": 7.339065074920654, + "learning_rate": 9.3810664658772e-05, + "loss": 0.8818, + "step": 5272 + }, + { + "epoch": 0.3572735280168033, + "grad_norm": 8.079582214355469, + "learning_rate": 9.380929563967418e-05, + "loss": 0.8596, + "step": 5273 + }, + { + "epoch": 0.3573412832847754, + "grad_norm": 6.368169784545898, + "learning_rate": 9.380792662057636e-05, + "loss": 0.7352, + "step": 5274 + }, + { + "epoch": 0.35740903855274747, + "grad_norm": 7.813303470611572, + "learning_rate": 9.380655760147854e-05, + "loss": 0.876, + "step": 5275 + }, + { + "epoch": 0.35747679382071956, + "grad_norm": 8.180051803588867, + "learning_rate": 9.380518858238074e-05, + "loss": 0.8893, + "step": 5276 + }, + { + "epoch": 0.35754454908869165, + "grad_norm": 7.618046283721924, + "learning_rate": 9.380381956328292e-05, + "loss": 0.9647, + "step": 5277 + }, + { + "epoch": 0.35761230435666375, + "grad_norm": 7.507559776306152, + "learning_rate": 9.38024505441851e-05, + "loss": 0.9524, + "step": 5278 + }, + { + "epoch": 0.35768005962463584, + "grad_norm": 9.470221519470215, + "learning_rate": 9.380108152508728e-05, + "loss": 1.0057, + "step": 5279 + }, + { + "epoch": 0.3577478148926079, + "grad_norm": 6.612621784210205, + "learning_rate": 9.379971250598947e-05, + "loss": 0.8489, + "step": 5280 + }, + { + "epoch": 0.35781557016057997, + "grad_norm": 7.146090030670166, + "learning_rate": 9.379834348689165e-05, + "loss": 0.9585, + "step": 5281 + }, + { + "epoch": 0.35788332542855206, + "grad_norm": 6.300119400024414, + "learning_rate": 9.379697446779383e-05, + "loss": 0.881, + "step": 5282 + }, + { + "epoch": 0.35795108069652415, + "grad_norm": 7.285689353942871, + "learning_rate": 9.379560544869601e-05, + "loss": 1.0182, + "step": 5283 + }, + { + "epoch": 0.35801883596449624, + "grad_norm": 8.943527221679688, + "learning_rate": 9.379423642959819e-05, + "loss": 0.9782, + "step": 5284 + }, + { + "epoch": 0.35808659123246833, + "grad_norm": 8.551790237426758, + "learning_rate": 9.379286741050039e-05, + "loss": 0.8513, + "step": 5285 + }, + { + "epoch": 0.3581543465004404, + "grad_norm": 8.670862197875977, + "learning_rate": 9.379149839140257e-05, + "loss": 0.9026, + "step": 5286 + }, + { + "epoch": 0.3582221017684125, + "grad_norm": 8.31614875793457, + "learning_rate": 9.379012937230475e-05, + "loss": 0.7666, + "step": 5287 + }, + { + "epoch": 0.35828985703638455, + "grad_norm": 6.347659111022949, + "learning_rate": 9.378876035320693e-05, + "loss": 0.953, + "step": 5288 + }, + { + "epoch": 0.35835761230435664, + "grad_norm": 9.098658561706543, + "learning_rate": 9.378739133410912e-05, + "loss": 1.3235, + "step": 5289 + }, + { + "epoch": 0.35842536757232873, + "grad_norm": 7.218830585479736, + "learning_rate": 9.37860223150113e-05, + "loss": 1.0008, + "step": 5290 + }, + { + "epoch": 0.3584931228403008, + "grad_norm": 7.346166133880615, + "learning_rate": 9.378465329591348e-05, + "loss": 0.8441, + "step": 5291 + }, + { + "epoch": 0.3585608781082729, + "grad_norm": 6.566136360168457, + "learning_rate": 9.378328427681566e-05, + "loss": 1.0381, + "step": 5292 + }, + { + "epoch": 0.358628633376245, + "grad_norm": 7.599377632141113, + "learning_rate": 9.378191525771784e-05, + "loss": 0.8131, + "step": 5293 + }, + { + "epoch": 0.3586963886442171, + "grad_norm": 6.4648284912109375, + "learning_rate": 9.378054623862004e-05, + "loss": 0.9599, + "step": 5294 + }, + { + "epoch": 0.3587641439121892, + "grad_norm": 6.155094146728516, + "learning_rate": 9.377917721952222e-05, + "loss": 1.0911, + "step": 5295 + }, + { + "epoch": 0.3588318991801613, + "grad_norm": 5.981008529663086, + "learning_rate": 9.37778082004244e-05, + "loss": 0.7692, + "step": 5296 + }, + { + "epoch": 0.3588996544481333, + "grad_norm": 7.424664497375488, + "learning_rate": 9.377643918132658e-05, + "loss": 0.9248, + "step": 5297 + }, + { + "epoch": 0.3589674097161054, + "grad_norm": 8.310460090637207, + "learning_rate": 9.377507016222877e-05, + "loss": 0.9659, + "step": 5298 + }, + { + "epoch": 0.3590351649840775, + "grad_norm": 7.189448833465576, + "learning_rate": 9.377370114313095e-05, + "loss": 0.9844, + "step": 5299 + }, + { + "epoch": 0.3591029202520496, + "grad_norm": 6.688578128814697, + "learning_rate": 9.377233212403313e-05, + "loss": 0.8488, + "step": 5300 + }, + { + "epoch": 0.3591706755200217, + "grad_norm": 8.917186737060547, + "learning_rate": 9.377096310493531e-05, + "loss": 1.0579, + "step": 5301 + }, + { + "epoch": 0.3592384307879938, + "grad_norm": 8.004783630371094, + "learning_rate": 9.37695940858375e-05, + "loss": 0.8347, + "step": 5302 + }, + { + "epoch": 0.35930618605596587, + "grad_norm": 8.461618423461914, + "learning_rate": 9.376822506673969e-05, + "loss": 1.1304, + "step": 5303 + }, + { + "epoch": 0.35937394132393796, + "grad_norm": 7.35521936416626, + "learning_rate": 9.376685604764187e-05, + "loss": 0.8222, + "step": 5304 + }, + { + "epoch": 0.35944169659191, + "grad_norm": 7.697165012359619, + "learning_rate": 9.376548702854405e-05, + "loss": 0.8885, + "step": 5305 + }, + { + "epoch": 0.3595094518598821, + "grad_norm": 8.730046272277832, + "learning_rate": 9.376411800944623e-05, + "loss": 0.9201, + "step": 5306 + }, + { + "epoch": 0.3595772071278542, + "grad_norm": 7.738508701324463, + "learning_rate": 9.376274899034841e-05, + "loss": 0.828, + "step": 5307 + }, + { + "epoch": 0.3596449623958263, + "grad_norm": 6.961246013641357, + "learning_rate": 9.37613799712506e-05, + "loss": 1.2298, + "step": 5308 + }, + { + "epoch": 0.35971271766379836, + "grad_norm": 7.723383903503418, + "learning_rate": 9.376001095215278e-05, + "loss": 0.9714, + "step": 5309 + }, + { + "epoch": 0.35978047293177046, + "grad_norm": 6.885270595550537, + "learning_rate": 9.375864193305496e-05, + "loss": 0.8352, + "step": 5310 + }, + { + "epoch": 0.35984822819974255, + "grad_norm": 8.623848915100098, + "learning_rate": 9.375727291395716e-05, + "loss": 0.7216, + "step": 5311 + }, + { + "epoch": 0.35991598346771464, + "grad_norm": 7.042871952056885, + "learning_rate": 9.375590389485934e-05, + "loss": 0.8098, + "step": 5312 + }, + { + "epoch": 0.3599837387356867, + "grad_norm": 6.125864505767822, + "learning_rate": 9.375453487576152e-05, + "loss": 0.7628, + "step": 5313 + }, + { + "epoch": 0.36005149400365877, + "grad_norm": 7.560012340545654, + "learning_rate": 9.375316585666371e-05, + "loss": 0.9114, + "step": 5314 + }, + { + "epoch": 0.36011924927163086, + "grad_norm": 6.125675678253174, + "learning_rate": 9.375179683756589e-05, + "loss": 0.7894, + "step": 5315 + }, + { + "epoch": 0.36018700453960295, + "grad_norm": 7.076975345611572, + "learning_rate": 9.375042781846807e-05, + "loss": 1.1084, + "step": 5316 + }, + { + "epoch": 0.36025475980757504, + "grad_norm": 6.712325096130371, + "learning_rate": 9.374905879937027e-05, + "loss": 0.9326, + "step": 5317 + }, + { + "epoch": 0.36032251507554713, + "grad_norm": 5.849967956542969, + "learning_rate": 9.374768978027245e-05, + "loss": 0.7884, + "step": 5318 + }, + { + "epoch": 0.3603902703435192, + "grad_norm": 7.5400614738464355, + "learning_rate": 9.374632076117463e-05, + "loss": 0.9844, + "step": 5319 + }, + { + "epoch": 0.3604580256114913, + "grad_norm": 5.720027446746826, + "learning_rate": 9.374495174207681e-05, + "loss": 0.8233, + "step": 5320 + }, + { + "epoch": 0.36052578087946335, + "grad_norm": 6.872372627258301, + "learning_rate": 9.3743582722979e-05, + "loss": 0.6808, + "step": 5321 + }, + { + "epoch": 0.36059353614743545, + "grad_norm": 6.98433780670166, + "learning_rate": 9.374221370388118e-05, + "loss": 0.642, + "step": 5322 + }, + { + "epoch": 0.36066129141540754, + "grad_norm": 7.231049537658691, + "learning_rate": 9.374084468478336e-05, + "loss": 1.1379, + "step": 5323 + }, + { + "epoch": 0.36072904668337963, + "grad_norm": 8.301416397094727, + "learning_rate": 9.373947566568554e-05, + "loss": 0.9184, + "step": 5324 + }, + { + "epoch": 0.3607968019513517, + "grad_norm": 6.9752068519592285, + "learning_rate": 9.373810664658772e-05, + "loss": 0.9984, + "step": 5325 + }, + { + "epoch": 0.3608645572193238, + "grad_norm": 6.826324939727783, + "learning_rate": 9.373673762748992e-05, + "loss": 0.8923, + "step": 5326 + }, + { + "epoch": 0.3609323124872959, + "grad_norm": 6.605769634246826, + "learning_rate": 9.37353686083921e-05, + "loss": 0.8809, + "step": 5327 + }, + { + "epoch": 0.361000067755268, + "grad_norm": 7.791666030883789, + "learning_rate": 9.373399958929428e-05, + "loss": 0.9199, + "step": 5328 + }, + { + "epoch": 0.36106782302324003, + "grad_norm": 7.015176773071289, + "learning_rate": 9.373263057019646e-05, + "loss": 0.8271, + "step": 5329 + }, + { + "epoch": 0.3611355782912121, + "grad_norm": 6.63329553604126, + "learning_rate": 9.373126155109864e-05, + "loss": 0.9052, + "step": 5330 + }, + { + "epoch": 0.3612033335591842, + "grad_norm": 9.516122817993164, + "learning_rate": 9.372989253200083e-05, + "loss": 1.1517, + "step": 5331 + }, + { + "epoch": 0.3612710888271563, + "grad_norm": 7.7722978591918945, + "learning_rate": 9.372852351290301e-05, + "loss": 1.0195, + "step": 5332 + }, + { + "epoch": 0.3613388440951284, + "grad_norm": 7.2577338218688965, + "learning_rate": 9.372715449380519e-05, + "loss": 0.9093, + "step": 5333 + }, + { + "epoch": 0.3614065993631005, + "grad_norm": 6.668231010437012, + "learning_rate": 9.372578547470737e-05, + "loss": 0.7677, + "step": 5334 + }, + { + "epoch": 0.3614743546310726, + "grad_norm": 8.366500854492188, + "learning_rate": 9.372441645560957e-05, + "loss": 1.0869, + "step": 5335 + }, + { + "epoch": 0.3615421098990447, + "grad_norm": 7.495104789733887, + "learning_rate": 9.372304743651175e-05, + "loss": 0.8074, + "step": 5336 + }, + { + "epoch": 0.3616098651670167, + "grad_norm": 6.546140670776367, + "learning_rate": 9.372167841741393e-05, + "loss": 0.8792, + "step": 5337 + }, + { + "epoch": 0.3616776204349888, + "grad_norm": 7.664621353149414, + "learning_rate": 9.372030939831611e-05, + "loss": 0.8177, + "step": 5338 + }, + { + "epoch": 0.3617453757029609, + "grad_norm": 7.482497215270996, + "learning_rate": 9.371894037921829e-05, + "loss": 0.7624, + "step": 5339 + }, + { + "epoch": 0.361813130970933, + "grad_norm": 11.148695945739746, + "learning_rate": 9.371757136012048e-05, + "loss": 1.0792, + "step": 5340 + }, + { + "epoch": 0.3618808862389051, + "grad_norm": 9.08121109008789, + "learning_rate": 9.371620234102266e-05, + "loss": 1.0027, + "step": 5341 + }, + { + "epoch": 0.36194864150687717, + "grad_norm": 8.424308776855469, + "learning_rate": 9.371483332192484e-05, + "loss": 1.2137, + "step": 5342 + }, + { + "epoch": 0.36201639677484926, + "grad_norm": 8.55742359161377, + "learning_rate": 9.371346430282702e-05, + "loss": 0.8023, + "step": 5343 + }, + { + "epoch": 0.36208415204282135, + "grad_norm": 7.197443008422852, + "learning_rate": 9.371209528372922e-05, + "loss": 1.0458, + "step": 5344 + }, + { + "epoch": 0.36215190731079344, + "grad_norm": 8.317964553833008, + "learning_rate": 9.37107262646314e-05, + "loss": 0.9829, + "step": 5345 + }, + { + "epoch": 0.3622196625787655, + "grad_norm": 8.819951057434082, + "learning_rate": 9.370935724553358e-05, + "loss": 1.0024, + "step": 5346 + }, + { + "epoch": 0.36228741784673757, + "grad_norm": 6.795368671417236, + "learning_rate": 9.370798822643576e-05, + "loss": 1.0216, + "step": 5347 + }, + { + "epoch": 0.36235517311470966, + "grad_norm": 5.370431900024414, + "learning_rate": 9.370661920733794e-05, + "loss": 0.8693, + "step": 5348 + }, + { + "epoch": 0.36242292838268175, + "grad_norm": 7.330100059509277, + "learning_rate": 9.370525018824013e-05, + "loss": 1.0576, + "step": 5349 + }, + { + "epoch": 0.36249068365065384, + "grad_norm": 6.56719446182251, + "learning_rate": 9.370388116914231e-05, + "loss": 0.8367, + "step": 5350 + }, + { + "epoch": 0.36255843891862594, + "grad_norm": 7.462427616119385, + "learning_rate": 9.370251215004449e-05, + "loss": 0.8284, + "step": 5351 + }, + { + "epoch": 0.362626194186598, + "grad_norm": 10.382340431213379, + "learning_rate": 9.370114313094667e-05, + "loss": 1.1184, + "step": 5352 + }, + { + "epoch": 0.3626939494545701, + "grad_norm": 6.872713088989258, + "learning_rate": 9.369977411184885e-05, + "loss": 0.9656, + "step": 5353 + }, + { + "epoch": 0.36276170472254216, + "grad_norm": 6.2565741539001465, + "learning_rate": 9.369840509275105e-05, + "loss": 0.9206, + "step": 5354 + }, + { + "epoch": 0.36282945999051425, + "grad_norm": 8.036307334899902, + "learning_rate": 9.369703607365323e-05, + "loss": 1.2556, + "step": 5355 + }, + { + "epoch": 0.36289721525848634, + "grad_norm": 7.401971340179443, + "learning_rate": 9.369566705455541e-05, + "loss": 0.9584, + "step": 5356 + }, + { + "epoch": 0.36296497052645843, + "grad_norm": 7.890522003173828, + "learning_rate": 9.369429803545759e-05, + "loss": 1.0172, + "step": 5357 + }, + { + "epoch": 0.3630327257944305, + "grad_norm": 7.4986348152160645, + "learning_rate": 9.369292901635978e-05, + "loss": 0.9985, + "step": 5358 + }, + { + "epoch": 0.3631004810624026, + "grad_norm": 6.275259494781494, + "learning_rate": 9.369155999726196e-05, + "loss": 0.9051, + "step": 5359 + }, + { + "epoch": 0.3631682363303747, + "grad_norm": 6.2667365074157715, + "learning_rate": 9.369019097816416e-05, + "loss": 0.9137, + "step": 5360 + }, + { + "epoch": 0.3632359915983468, + "grad_norm": 7.5247483253479, + "learning_rate": 9.368882195906634e-05, + "loss": 0.9157, + "step": 5361 + }, + { + "epoch": 0.36330374686631883, + "grad_norm": 8.757854461669922, + "learning_rate": 9.368745293996852e-05, + "loss": 1.2596, + "step": 5362 + }, + { + "epoch": 0.3633715021342909, + "grad_norm": 7.257226943969727, + "learning_rate": 9.368608392087071e-05, + "loss": 0.8211, + "step": 5363 + }, + { + "epoch": 0.363439257402263, + "grad_norm": 6.194060325622559, + "learning_rate": 9.368471490177289e-05, + "loss": 0.8816, + "step": 5364 + }, + { + "epoch": 0.3635070126702351, + "grad_norm": 10.445760726928711, + "learning_rate": 9.368334588267507e-05, + "loss": 0.858, + "step": 5365 + }, + { + "epoch": 0.3635747679382072, + "grad_norm": 6.156910419464111, + "learning_rate": 9.368197686357725e-05, + "loss": 1.0348, + "step": 5366 + }, + { + "epoch": 0.3636425232061793, + "grad_norm": 6.490479946136475, + "learning_rate": 9.368060784447945e-05, + "loss": 0.8834, + "step": 5367 + }, + { + "epoch": 0.3637102784741514, + "grad_norm": 6.535744667053223, + "learning_rate": 9.367923882538163e-05, + "loss": 0.6612, + "step": 5368 + }, + { + "epoch": 0.3637780337421235, + "grad_norm": 8.068840980529785, + "learning_rate": 9.36778698062838e-05, + "loss": 1.0348, + "step": 5369 + }, + { + "epoch": 0.3638457890100955, + "grad_norm": 6.666348934173584, + "learning_rate": 9.367650078718599e-05, + "loss": 0.8346, + "step": 5370 + }, + { + "epoch": 0.3639135442780676, + "grad_norm": 7.482582092285156, + "learning_rate": 9.367513176808817e-05, + "loss": 0.7356, + "step": 5371 + }, + { + "epoch": 0.3639812995460397, + "grad_norm": 6.928842067718506, + "learning_rate": 9.367376274899036e-05, + "loss": 0.7963, + "step": 5372 + }, + { + "epoch": 0.3640490548140118, + "grad_norm": 6.574978351593018, + "learning_rate": 9.367239372989254e-05, + "loss": 0.7893, + "step": 5373 + }, + { + "epoch": 0.3641168100819839, + "grad_norm": 6.267110824584961, + "learning_rate": 9.367102471079472e-05, + "loss": 0.6282, + "step": 5374 + }, + { + "epoch": 0.36418456534995597, + "grad_norm": 6.368738651275635, + "learning_rate": 9.36696556916969e-05, + "loss": 0.8983, + "step": 5375 + }, + { + "epoch": 0.36425232061792806, + "grad_norm": 7.211498737335205, + "learning_rate": 9.36682866725991e-05, + "loss": 0.9471, + "step": 5376 + }, + { + "epoch": 0.36432007588590015, + "grad_norm": 6.124608516693115, + "learning_rate": 9.366691765350128e-05, + "loss": 0.8649, + "step": 5377 + }, + { + "epoch": 0.3643878311538722, + "grad_norm": 7.325745582580566, + "learning_rate": 9.366554863440346e-05, + "loss": 0.9811, + "step": 5378 + }, + { + "epoch": 0.3644555864218443, + "grad_norm": 5.563783645629883, + "learning_rate": 9.366417961530564e-05, + "loss": 0.833, + "step": 5379 + }, + { + "epoch": 0.36452334168981637, + "grad_norm": 7.581454753875732, + "learning_rate": 9.366281059620782e-05, + "loss": 0.9078, + "step": 5380 + }, + { + "epoch": 0.36459109695778846, + "grad_norm": 7.746644020080566, + "learning_rate": 9.366144157711001e-05, + "loss": 0.9143, + "step": 5381 + }, + { + "epoch": 0.36465885222576055, + "grad_norm": 7.729243278503418, + "learning_rate": 9.366007255801219e-05, + "loss": 0.856, + "step": 5382 + }, + { + "epoch": 0.36472660749373265, + "grad_norm": 7.337301731109619, + "learning_rate": 9.365870353891437e-05, + "loss": 0.9059, + "step": 5383 + }, + { + "epoch": 0.36479436276170474, + "grad_norm": 7.622936248779297, + "learning_rate": 9.365733451981655e-05, + "loss": 1.0493, + "step": 5384 + }, + { + "epoch": 0.36486211802967683, + "grad_norm": 7.159453392028809, + "learning_rate": 9.365596550071873e-05, + "loss": 0.824, + "step": 5385 + }, + { + "epoch": 0.36492987329764887, + "grad_norm": 7.111423492431641, + "learning_rate": 9.365459648162093e-05, + "loss": 0.7201, + "step": 5386 + }, + { + "epoch": 0.36499762856562096, + "grad_norm": 8.7478666305542, + "learning_rate": 9.365322746252311e-05, + "loss": 1.2497, + "step": 5387 + }, + { + "epoch": 0.36506538383359305, + "grad_norm": 9.522677421569824, + "learning_rate": 9.365185844342529e-05, + "loss": 0.9175, + "step": 5388 + }, + { + "epoch": 0.36513313910156514, + "grad_norm": 7.881449222564697, + "learning_rate": 9.365048942432747e-05, + "loss": 1.1123, + "step": 5389 + }, + { + "epoch": 0.36520089436953723, + "grad_norm": 6.60174560546875, + "learning_rate": 9.364912040522966e-05, + "loss": 0.9148, + "step": 5390 + }, + { + "epoch": 0.3652686496375093, + "grad_norm": 7.549520492553711, + "learning_rate": 9.364775138613184e-05, + "loss": 0.991, + "step": 5391 + }, + { + "epoch": 0.3653364049054814, + "grad_norm": 7.287946701049805, + "learning_rate": 9.364638236703402e-05, + "loss": 0.984, + "step": 5392 + }, + { + "epoch": 0.3654041601734535, + "grad_norm": 9.351056098937988, + "learning_rate": 9.36450133479362e-05, + "loss": 1.1052, + "step": 5393 + }, + { + "epoch": 0.36547191544142554, + "grad_norm": 8.443605422973633, + "learning_rate": 9.364364432883838e-05, + "loss": 0.941, + "step": 5394 + }, + { + "epoch": 0.36553967070939763, + "grad_norm": 6.416137218475342, + "learning_rate": 9.364227530974058e-05, + "loss": 0.9359, + "step": 5395 + }, + { + "epoch": 0.3656074259773697, + "grad_norm": 7.125934600830078, + "learning_rate": 9.364090629064276e-05, + "loss": 1.0006, + "step": 5396 + }, + { + "epoch": 0.3656751812453418, + "grad_norm": 8.161697387695312, + "learning_rate": 9.363953727154494e-05, + "loss": 0.8284, + "step": 5397 + }, + { + "epoch": 0.3657429365133139, + "grad_norm": 6.955849647521973, + "learning_rate": 9.363816825244712e-05, + "loss": 0.7467, + "step": 5398 + }, + { + "epoch": 0.365810691781286, + "grad_norm": 6.489741325378418, + "learning_rate": 9.363679923334931e-05, + "loss": 0.7611, + "step": 5399 + }, + { + "epoch": 0.3658784470492581, + "grad_norm": 7.332844257354736, + "learning_rate": 9.363543021425149e-05, + "loss": 0.7666, + "step": 5400 + }, + { + "epoch": 0.3659462023172302, + "grad_norm": 9.001418113708496, + "learning_rate": 9.363406119515367e-05, + "loss": 1.0219, + "step": 5401 + }, + { + "epoch": 0.3660139575852023, + "grad_norm": 8.972160339355469, + "learning_rate": 9.363269217605585e-05, + "loss": 1.0122, + "step": 5402 + }, + { + "epoch": 0.3660817128531743, + "grad_norm": 7.487551212310791, + "learning_rate": 9.363132315695803e-05, + "loss": 1.2551, + "step": 5403 + }, + { + "epoch": 0.3661494681211464, + "grad_norm": 5.9293904304504395, + "learning_rate": 9.362995413786023e-05, + "loss": 0.7594, + "step": 5404 + }, + { + "epoch": 0.3662172233891185, + "grad_norm": 7.69260311126709, + "learning_rate": 9.362858511876241e-05, + "loss": 0.9297, + "step": 5405 + }, + { + "epoch": 0.3662849786570906, + "grad_norm": 6.723849296569824, + "learning_rate": 9.362721609966459e-05, + "loss": 1.0034, + "step": 5406 + }, + { + "epoch": 0.3663527339250627, + "grad_norm": 8.467827796936035, + "learning_rate": 9.362584708056678e-05, + "loss": 0.9831, + "step": 5407 + }, + { + "epoch": 0.36642048919303477, + "grad_norm": 6.563052654266357, + "learning_rate": 9.362447806146896e-05, + "loss": 0.8464, + "step": 5408 + }, + { + "epoch": 0.36648824446100686, + "grad_norm": 7.124545574188232, + "learning_rate": 9.362310904237114e-05, + "loss": 1.1, + "step": 5409 + }, + { + "epoch": 0.36655599972897895, + "grad_norm": 8.016179084777832, + "learning_rate": 9.362174002327334e-05, + "loss": 0.9962, + "step": 5410 + }, + { + "epoch": 0.366623754996951, + "grad_norm": 5.937708854675293, + "learning_rate": 9.362037100417552e-05, + "loss": 0.5908, + "step": 5411 + }, + { + "epoch": 0.3666915102649231, + "grad_norm": 6.945435523986816, + "learning_rate": 9.36190019850777e-05, + "loss": 0.7576, + "step": 5412 + }, + { + "epoch": 0.3667592655328952, + "grad_norm": 6.756751537322998, + "learning_rate": 9.361763296597989e-05, + "loss": 0.7966, + "step": 5413 + }, + { + "epoch": 0.36682702080086727, + "grad_norm": 6.434855937957764, + "learning_rate": 9.361626394688207e-05, + "loss": 0.8401, + "step": 5414 + }, + { + "epoch": 0.36689477606883936, + "grad_norm": 6.420316696166992, + "learning_rate": 9.361489492778425e-05, + "loss": 0.8031, + "step": 5415 + }, + { + "epoch": 0.36696253133681145, + "grad_norm": 5.286546230316162, + "learning_rate": 9.361352590868643e-05, + "loss": 0.6665, + "step": 5416 + }, + { + "epoch": 0.36703028660478354, + "grad_norm": 8.99885368347168, + "learning_rate": 9.361215688958861e-05, + "loss": 0.7545, + "step": 5417 + }, + { + "epoch": 0.36709804187275563, + "grad_norm": 7.546742916107178, + "learning_rate": 9.36107878704908e-05, + "loss": 1.0391, + "step": 5418 + }, + { + "epoch": 0.36716579714072767, + "grad_norm": 6.452430248260498, + "learning_rate": 9.360941885139299e-05, + "loss": 0.8458, + "step": 5419 + }, + { + "epoch": 0.36723355240869976, + "grad_norm": 7.125503063201904, + "learning_rate": 9.360804983229517e-05, + "loss": 0.8061, + "step": 5420 + }, + { + "epoch": 0.36730130767667185, + "grad_norm": 7.046570301055908, + "learning_rate": 9.360668081319735e-05, + "loss": 1.0968, + "step": 5421 + }, + { + "epoch": 0.36736906294464394, + "grad_norm": 7.032671928405762, + "learning_rate": 9.360531179409954e-05, + "loss": 0.9834, + "step": 5422 + }, + { + "epoch": 0.36743681821261603, + "grad_norm": 8.430721282958984, + "learning_rate": 9.360394277500172e-05, + "loss": 0.8875, + "step": 5423 + }, + { + "epoch": 0.3675045734805881, + "grad_norm": 8.67082405090332, + "learning_rate": 9.36025737559039e-05, + "loss": 0.9981, + "step": 5424 + }, + { + "epoch": 0.3675723287485602, + "grad_norm": 6.5781707763671875, + "learning_rate": 9.360120473680608e-05, + "loss": 0.9702, + "step": 5425 + }, + { + "epoch": 0.3676400840165323, + "grad_norm": 9.087675094604492, + "learning_rate": 9.359983571770826e-05, + "loss": 1.1141, + "step": 5426 + }, + { + "epoch": 0.36770783928450435, + "grad_norm": 6.949502468109131, + "learning_rate": 9.359846669861046e-05, + "loss": 0.9167, + "step": 5427 + }, + { + "epoch": 0.36777559455247644, + "grad_norm": 9.335396766662598, + "learning_rate": 9.359709767951264e-05, + "loss": 0.8119, + "step": 5428 + }, + { + "epoch": 0.36784334982044853, + "grad_norm": 8.041236877441406, + "learning_rate": 9.359572866041482e-05, + "loss": 1.0112, + "step": 5429 + }, + { + "epoch": 0.3679111050884206, + "grad_norm": 6.435655117034912, + "learning_rate": 9.3594359641317e-05, + "loss": 0.9589, + "step": 5430 + }, + { + "epoch": 0.3679788603563927, + "grad_norm": 8.047952651977539, + "learning_rate": 9.359299062221919e-05, + "loss": 0.7143, + "step": 5431 + }, + { + "epoch": 0.3680466156243648, + "grad_norm": 6.986575126647949, + "learning_rate": 9.359162160312137e-05, + "loss": 0.8857, + "step": 5432 + }, + { + "epoch": 0.3681143708923369, + "grad_norm": 5.447303771972656, + "learning_rate": 9.359025258402355e-05, + "loss": 0.7229, + "step": 5433 + }, + { + "epoch": 0.368182126160309, + "grad_norm": 6.337569713592529, + "learning_rate": 9.358888356492573e-05, + "loss": 0.8595, + "step": 5434 + }, + { + "epoch": 0.368249881428281, + "grad_norm": 7.318792819976807, + "learning_rate": 9.358751454582791e-05, + "loss": 0.8131, + "step": 5435 + }, + { + "epoch": 0.3683176366962531, + "grad_norm": 6.816128730773926, + "learning_rate": 9.35861455267301e-05, + "loss": 0.6972, + "step": 5436 + }, + { + "epoch": 0.3683853919642252, + "grad_norm": 6.886023998260498, + "learning_rate": 9.358477650763229e-05, + "loss": 0.9075, + "step": 5437 + }, + { + "epoch": 0.3684531472321973, + "grad_norm": 7.4018049240112305, + "learning_rate": 9.358340748853447e-05, + "loss": 0.7913, + "step": 5438 + }, + { + "epoch": 0.3685209025001694, + "grad_norm": 6.976071834564209, + "learning_rate": 9.358203846943665e-05, + "loss": 0.8306, + "step": 5439 + }, + { + "epoch": 0.3685886577681415, + "grad_norm": 8.005870819091797, + "learning_rate": 9.358066945033883e-05, + "loss": 1.0109, + "step": 5440 + }, + { + "epoch": 0.3686564130361136, + "grad_norm": 8.600711822509766, + "learning_rate": 9.357930043124102e-05, + "loss": 1.022, + "step": 5441 + }, + { + "epoch": 0.36872416830408566, + "grad_norm": 9.857340812683105, + "learning_rate": 9.35779314121432e-05, + "loss": 1.1266, + "step": 5442 + }, + { + "epoch": 0.3687919235720577, + "grad_norm": 8.064130783081055, + "learning_rate": 9.357656239304538e-05, + "loss": 0.9224, + "step": 5443 + }, + { + "epoch": 0.3688596788400298, + "grad_norm": 9.217019081115723, + "learning_rate": 9.357519337394756e-05, + "loss": 1.0041, + "step": 5444 + }, + { + "epoch": 0.3689274341080019, + "grad_norm": 7.042741298675537, + "learning_rate": 9.357382435484976e-05, + "loss": 0.6798, + "step": 5445 + }, + { + "epoch": 0.368995189375974, + "grad_norm": 6.679181098937988, + "learning_rate": 9.357245533575194e-05, + "loss": 0.9641, + "step": 5446 + }, + { + "epoch": 0.36906294464394607, + "grad_norm": 8.608474731445312, + "learning_rate": 9.357108631665412e-05, + "loss": 0.8224, + "step": 5447 + }, + { + "epoch": 0.36913069991191816, + "grad_norm": 8.975968360900879, + "learning_rate": 9.35697172975563e-05, + "loss": 0.9103, + "step": 5448 + }, + { + "epoch": 0.36919845517989025, + "grad_norm": 7.249898910522461, + "learning_rate": 9.356834827845848e-05, + "loss": 1.0178, + "step": 5449 + }, + { + "epoch": 0.36926621044786234, + "grad_norm": 6.6788787841796875, + "learning_rate": 9.356697925936067e-05, + "loss": 0.7786, + "step": 5450 + }, + { + "epoch": 0.36933396571583443, + "grad_norm": 7.834086894989014, + "learning_rate": 9.356561024026285e-05, + "loss": 1.0282, + "step": 5451 + }, + { + "epoch": 0.36940172098380647, + "grad_norm": 7.36384391784668, + "learning_rate": 9.356424122116503e-05, + "loss": 1.0448, + "step": 5452 + }, + { + "epoch": 0.36946947625177856, + "grad_norm": 7.843540191650391, + "learning_rate": 9.356287220206723e-05, + "loss": 1.0646, + "step": 5453 + }, + { + "epoch": 0.36953723151975065, + "grad_norm": 8.009191513061523, + "learning_rate": 9.35615031829694e-05, + "loss": 1.0279, + "step": 5454 + }, + { + "epoch": 0.36960498678772274, + "grad_norm": 7.594022274017334, + "learning_rate": 9.356013416387159e-05, + "loss": 1.0235, + "step": 5455 + }, + { + "epoch": 0.36967274205569484, + "grad_norm": 6.167095184326172, + "learning_rate": 9.355876514477378e-05, + "loss": 0.7107, + "step": 5456 + }, + { + "epoch": 0.36974049732366693, + "grad_norm": 8.07007884979248, + "learning_rate": 9.355739612567596e-05, + "loss": 0.9667, + "step": 5457 + }, + { + "epoch": 0.369808252591639, + "grad_norm": 7.6446099281311035, + "learning_rate": 9.355602710657814e-05, + "loss": 0.9758, + "step": 5458 + }, + { + "epoch": 0.3698760078596111, + "grad_norm": 7.282294273376465, + "learning_rate": 9.355465808748034e-05, + "loss": 0.929, + "step": 5459 + }, + { + "epoch": 0.36994376312758315, + "grad_norm": 9.878494262695312, + "learning_rate": 9.355328906838252e-05, + "loss": 0.9909, + "step": 5460 + }, + { + "epoch": 0.37001151839555524, + "grad_norm": 7.331822395324707, + "learning_rate": 9.35519200492847e-05, + "loss": 0.9693, + "step": 5461 + }, + { + "epoch": 0.37007927366352733, + "grad_norm": 7.3209404945373535, + "learning_rate": 9.355055103018688e-05, + "loss": 0.8587, + "step": 5462 + }, + { + "epoch": 0.3701470289314994, + "grad_norm": 8.261675834655762, + "learning_rate": 9.354918201108906e-05, + "loss": 0.9391, + "step": 5463 + }, + { + "epoch": 0.3702147841994715, + "grad_norm": 7.214691162109375, + "learning_rate": 9.354781299199125e-05, + "loss": 0.8805, + "step": 5464 + }, + { + "epoch": 0.3702825394674436, + "grad_norm": 7.409287452697754, + "learning_rate": 9.354644397289343e-05, + "loss": 0.6792, + "step": 5465 + }, + { + "epoch": 0.3703502947354157, + "grad_norm": 6.368542194366455, + "learning_rate": 9.354507495379561e-05, + "loss": 0.6464, + "step": 5466 + }, + { + "epoch": 0.3704180500033878, + "grad_norm": 7.106698513031006, + "learning_rate": 9.354370593469779e-05, + "loss": 1.0956, + "step": 5467 + }, + { + "epoch": 0.3704858052713598, + "grad_norm": 7.591500759124756, + "learning_rate": 9.354233691559999e-05, + "loss": 0.9643, + "step": 5468 + }, + { + "epoch": 0.3705535605393319, + "grad_norm": 7.255499839782715, + "learning_rate": 9.354096789650217e-05, + "loss": 0.7782, + "step": 5469 + }, + { + "epoch": 0.370621315807304, + "grad_norm": 8.414693832397461, + "learning_rate": 9.353959887740435e-05, + "loss": 0.9433, + "step": 5470 + }, + { + "epoch": 0.3706890710752761, + "grad_norm": 7.294419765472412, + "learning_rate": 9.353822985830653e-05, + "loss": 0.9643, + "step": 5471 + }, + { + "epoch": 0.3707568263432482, + "grad_norm": 7.249752998352051, + "learning_rate": 9.353686083920871e-05, + "loss": 0.8529, + "step": 5472 + }, + { + "epoch": 0.3708245816112203, + "grad_norm": 9.345986366271973, + "learning_rate": 9.35354918201109e-05, + "loss": 0.9641, + "step": 5473 + }, + { + "epoch": 0.3708923368791924, + "grad_norm": 7.828972816467285, + "learning_rate": 9.353412280101308e-05, + "loss": 1.1, + "step": 5474 + }, + { + "epoch": 0.37096009214716447, + "grad_norm": 7.739612579345703, + "learning_rate": 9.353275378191526e-05, + "loss": 0.8228, + "step": 5475 + }, + { + "epoch": 0.3710278474151365, + "grad_norm": 7.853143215179443, + "learning_rate": 9.353138476281744e-05, + "loss": 0.9088, + "step": 5476 + }, + { + "epoch": 0.3710956026831086, + "grad_norm": 7.996474742889404, + "learning_rate": 9.353001574371964e-05, + "loss": 0.9291, + "step": 5477 + }, + { + "epoch": 0.3711633579510807, + "grad_norm": 7.374247074127197, + "learning_rate": 9.352864672462182e-05, + "loss": 1.0619, + "step": 5478 + }, + { + "epoch": 0.3712311132190528, + "grad_norm": 8.037210464477539, + "learning_rate": 9.3527277705524e-05, + "loss": 0.9426, + "step": 5479 + }, + { + "epoch": 0.37129886848702487, + "grad_norm": 5.515965938568115, + "learning_rate": 9.352590868642618e-05, + "loss": 0.6838, + "step": 5480 + }, + { + "epoch": 0.37136662375499696, + "grad_norm": 8.530059814453125, + "learning_rate": 9.352453966732836e-05, + "loss": 0.9221, + "step": 5481 + }, + { + "epoch": 0.37143437902296905, + "grad_norm": 8.07000732421875, + "learning_rate": 9.352317064823055e-05, + "loss": 0.9321, + "step": 5482 + }, + { + "epoch": 0.37150213429094114, + "grad_norm": 8.79816722869873, + "learning_rate": 9.352180162913273e-05, + "loss": 1.0875, + "step": 5483 + }, + { + "epoch": 0.3715698895589132, + "grad_norm": 8.476999282836914, + "learning_rate": 9.352043261003491e-05, + "loss": 1.0144, + "step": 5484 + }, + { + "epoch": 0.37163764482688527, + "grad_norm": 5.7845540046691895, + "learning_rate": 9.351906359093709e-05, + "loss": 0.6802, + "step": 5485 + }, + { + "epoch": 0.37170540009485736, + "grad_norm": 6.311066627502441, + "learning_rate": 9.351769457183929e-05, + "loss": 0.7673, + "step": 5486 + }, + { + "epoch": 0.37177315536282945, + "grad_norm": 6.929514408111572, + "learning_rate": 9.351632555274147e-05, + "loss": 1.1287, + "step": 5487 + }, + { + "epoch": 0.37184091063080155, + "grad_norm": 7.7819132804870605, + "learning_rate": 9.351495653364365e-05, + "loss": 0.8828, + "step": 5488 + }, + { + "epoch": 0.37190866589877364, + "grad_norm": 7.906317710876465, + "learning_rate": 9.351358751454583e-05, + "loss": 0.7961, + "step": 5489 + }, + { + "epoch": 0.37197642116674573, + "grad_norm": 8.190624237060547, + "learning_rate": 9.351221849544801e-05, + "loss": 0.8106, + "step": 5490 + }, + { + "epoch": 0.3720441764347178, + "grad_norm": 6.0358734130859375, + "learning_rate": 9.35108494763502e-05, + "loss": 1.0342, + "step": 5491 + }, + { + "epoch": 0.37211193170268986, + "grad_norm": 7.096914291381836, + "learning_rate": 9.350948045725238e-05, + "loss": 0.6652, + "step": 5492 + }, + { + "epoch": 0.37217968697066195, + "grad_norm": 7.488151550292969, + "learning_rate": 9.350811143815456e-05, + "loss": 0.9283, + "step": 5493 + }, + { + "epoch": 0.37224744223863404, + "grad_norm": 7.084420680999756, + "learning_rate": 9.350674241905674e-05, + "loss": 1.0424, + "step": 5494 + }, + { + "epoch": 0.37231519750660613, + "grad_norm": 7.483108997344971, + "learning_rate": 9.350537339995892e-05, + "loss": 1.0657, + "step": 5495 + }, + { + "epoch": 0.3723829527745782, + "grad_norm": 7.312635898590088, + "learning_rate": 9.350400438086112e-05, + "loss": 0.7831, + "step": 5496 + }, + { + "epoch": 0.3724507080425503, + "grad_norm": 6.901159763336182, + "learning_rate": 9.35026353617633e-05, + "loss": 0.9832, + "step": 5497 + }, + { + "epoch": 0.3725184633105224, + "grad_norm": 7.787426471710205, + "learning_rate": 9.350126634266548e-05, + "loss": 0.9112, + "step": 5498 + }, + { + "epoch": 0.3725862185784945, + "grad_norm": 7.2583746910095215, + "learning_rate": 9.349989732356767e-05, + "loss": 0.757, + "step": 5499 + }, + { + "epoch": 0.37265397384646654, + "grad_norm": 7.068667411804199, + "learning_rate": 9.349852830446985e-05, + "loss": 0.8516, + "step": 5500 + }, + { + "epoch": 0.3727217291144386, + "grad_norm": 8.109786987304688, + "learning_rate": 9.349715928537203e-05, + "loss": 0.7925, + "step": 5501 + }, + { + "epoch": 0.3727894843824107, + "grad_norm": 7.539520740509033, + "learning_rate": 9.349579026627423e-05, + "loss": 1.283, + "step": 5502 + }, + { + "epoch": 0.3728572396503828, + "grad_norm": 8.608511924743652, + "learning_rate": 9.34944212471764e-05, + "loss": 1.2367, + "step": 5503 + }, + { + "epoch": 0.3729249949183549, + "grad_norm": 7.932346820831299, + "learning_rate": 9.349305222807859e-05, + "loss": 0.894, + "step": 5504 + }, + { + "epoch": 0.372992750186327, + "grad_norm": 6.520984172821045, + "learning_rate": 9.349168320898078e-05, + "loss": 0.884, + "step": 5505 + }, + { + "epoch": 0.3730605054542991, + "grad_norm": 6.876826286315918, + "learning_rate": 9.349031418988296e-05, + "loss": 0.9447, + "step": 5506 + }, + { + "epoch": 0.3731282607222712, + "grad_norm": 8.540080070495605, + "learning_rate": 9.348894517078514e-05, + "loss": 1.0709, + "step": 5507 + }, + { + "epoch": 0.37319601599024327, + "grad_norm": 7.760754585266113, + "learning_rate": 9.348757615168732e-05, + "loss": 1.0666, + "step": 5508 + }, + { + "epoch": 0.3732637712582153, + "grad_norm": 6.1746134757995605, + "learning_rate": 9.348620713258951e-05, + "loss": 0.8367, + "step": 5509 + }, + { + "epoch": 0.3733315265261874, + "grad_norm": 8.053776741027832, + "learning_rate": 9.34848381134917e-05, + "loss": 1.253, + "step": 5510 + }, + { + "epoch": 0.3733992817941595, + "grad_norm": 10.046021461486816, + "learning_rate": 9.348346909439388e-05, + "loss": 0.9086, + "step": 5511 + }, + { + "epoch": 0.3734670370621316, + "grad_norm": 7.247015953063965, + "learning_rate": 9.348210007529606e-05, + "loss": 1.3036, + "step": 5512 + }, + { + "epoch": 0.37353479233010367, + "grad_norm": 6.103085517883301, + "learning_rate": 9.348073105619824e-05, + "loss": 0.9778, + "step": 5513 + }, + { + "epoch": 0.37360254759807576, + "grad_norm": 7.330771446228027, + "learning_rate": 9.347936203710043e-05, + "loss": 0.8335, + "step": 5514 + }, + { + "epoch": 0.37367030286604785, + "grad_norm": 7.571770668029785, + "learning_rate": 9.347799301800261e-05, + "loss": 0.9963, + "step": 5515 + }, + { + "epoch": 0.37373805813401995, + "grad_norm": 8.018990516662598, + "learning_rate": 9.347662399890479e-05, + "loss": 0.7925, + "step": 5516 + }, + { + "epoch": 0.373805813401992, + "grad_norm": 7.156185626983643, + "learning_rate": 9.347525497980697e-05, + "loss": 0.7821, + "step": 5517 + }, + { + "epoch": 0.3738735686699641, + "grad_norm": 8.189929008483887, + "learning_rate": 9.347388596070915e-05, + "loss": 0.8997, + "step": 5518 + }, + { + "epoch": 0.37394132393793617, + "grad_norm": 6.826793670654297, + "learning_rate": 9.347251694161135e-05, + "loss": 0.7392, + "step": 5519 + }, + { + "epoch": 0.37400907920590826, + "grad_norm": 7.931643962860107, + "learning_rate": 9.347114792251353e-05, + "loss": 1.1607, + "step": 5520 + }, + { + "epoch": 0.37407683447388035, + "grad_norm": 6.973018646240234, + "learning_rate": 9.34697789034157e-05, + "loss": 0.7419, + "step": 5521 + }, + { + "epoch": 0.37414458974185244, + "grad_norm": 7.43549919128418, + "learning_rate": 9.346840988431789e-05, + "loss": 1.0035, + "step": 5522 + }, + { + "epoch": 0.37421234500982453, + "grad_norm": 6.700629234313965, + "learning_rate": 9.346704086522008e-05, + "loss": 1.0953, + "step": 5523 + }, + { + "epoch": 0.3742801002777966, + "grad_norm": 6.039200782775879, + "learning_rate": 9.346567184612226e-05, + "loss": 0.6512, + "step": 5524 + }, + { + "epoch": 0.37434785554576866, + "grad_norm": 8.45660400390625, + "learning_rate": 9.346430282702444e-05, + "loss": 1.0725, + "step": 5525 + }, + { + "epoch": 0.37441561081374075, + "grad_norm": 8.787965774536133, + "learning_rate": 9.346293380792662e-05, + "loss": 1.0101, + "step": 5526 + }, + { + "epoch": 0.37448336608171284, + "grad_norm": 8.229619026184082, + "learning_rate": 9.34615647888288e-05, + "loss": 1.2651, + "step": 5527 + }, + { + "epoch": 0.37455112134968493, + "grad_norm": 6.913321018218994, + "learning_rate": 9.3460195769731e-05, + "loss": 0.9196, + "step": 5528 + }, + { + "epoch": 0.374618876617657, + "grad_norm": 5.944606781005859, + "learning_rate": 9.345882675063318e-05, + "loss": 0.857, + "step": 5529 + }, + { + "epoch": 0.3746866318856291, + "grad_norm": 9.863933563232422, + "learning_rate": 9.345745773153536e-05, + "loss": 1.0566, + "step": 5530 + }, + { + "epoch": 0.3747543871536012, + "grad_norm": 6.494686603546143, + "learning_rate": 9.345608871243754e-05, + "loss": 0.8839, + "step": 5531 + }, + { + "epoch": 0.3748221424215733, + "grad_norm": 6.558131694793701, + "learning_rate": 9.345471969333973e-05, + "loss": 0.9802, + "step": 5532 + }, + { + "epoch": 0.37488989768954534, + "grad_norm": 7.316765308380127, + "learning_rate": 9.345335067424191e-05, + "loss": 1.1635, + "step": 5533 + }, + { + "epoch": 0.37495765295751743, + "grad_norm": 6.615121841430664, + "learning_rate": 9.345198165514409e-05, + "loss": 0.919, + "step": 5534 + }, + { + "epoch": 0.3750254082254895, + "grad_norm": 8.071523666381836, + "learning_rate": 9.345061263604627e-05, + "loss": 1.0023, + "step": 5535 + }, + { + "epoch": 0.3750931634934616, + "grad_norm": 7.563895225524902, + "learning_rate": 9.344924361694845e-05, + "loss": 0.963, + "step": 5536 + }, + { + "epoch": 0.3751609187614337, + "grad_norm": 6.013591766357422, + "learning_rate": 9.344787459785065e-05, + "loss": 0.7862, + "step": 5537 + }, + { + "epoch": 0.3752286740294058, + "grad_norm": 7.296882629394531, + "learning_rate": 9.344650557875283e-05, + "loss": 1.0106, + "step": 5538 + }, + { + "epoch": 0.3752964292973779, + "grad_norm": 7.192817211151123, + "learning_rate": 9.3445136559655e-05, + "loss": 0.8491, + "step": 5539 + }, + { + "epoch": 0.37536418456535, + "grad_norm": 7.962235927581787, + "learning_rate": 9.344376754055719e-05, + "loss": 1.0711, + "step": 5540 + }, + { + "epoch": 0.375431939833322, + "grad_norm": 6.515812873840332, + "learning_rate": 9.344239852145937e-05, + "loss": 0.8779, + "step": 5541 + }, + { + "epoch": 0.3754996951012941, + "grad_norm": 8.563304901123047, + "learning_rate": 9.344102950236156e-05, + "loss": 1.3236, + "step": 5542 + }, + { + "epoch": 0.3755674503692662, + "grad_norm": 8.214926719665527, + "learning_rate": 9.343966048326374e-05, + "loss": 0.9231, + "step": 5543 + }, + { + "epoch": 0.3756352056372383, + "grad_norm": 6.213039398193359, + "learning_rate": 9.343829146416592e-05, + "loss": 0.8446, + "step": 5544 + }, + { + "epoch": 0.3757029609052104, + "grad_norm": 6.392467021942139, + "learning_rate": 9.343692244506812e-05, + "loss": 0.6742, + "step": 5545 + }, + { + "epoch": 0.3757707161731825, + "grad_norm": 6.8930206298828125, + "learning_rate": 9.34355534259703e-05, + "loss": 0.9973, + "step": 5546 + }, + { + "epoch": 0.37583847144115456, + "grad_norm": 8.074743270874023, + "learning_rate": 9.343418440687248e-05, + "loss": 0.8808, + "step": 5547 + }, + { + "epoch": 0.37590622670912666, + "grad_norm": 8.443988800048828, + "learning_rate": 9.343281538777467e-05, + "loss": 1.0066, + "step": 5548 + }, + { + "epoch": 0.3759739819770987, + "grad_norm": 6.530252933502197, + "learning_rate": 9.343144636867685e-05, + "loss": 1.0203, + "step": 5549 + }, + { + "epoch": 0.3760417372450708, + "grad_norm": 6.693957328796387, + "learning_rate": 9.343007734957903e-05, + "loss": 0.9484, + "step": 5550 + }, + { + "epoch": 0.3761094925130429, + "grad_norm": 7.103133201599121, + "learning_rate": 9.342870833048122e-05, + "loss": 1.0442, + "step": 5551 + }, + { + "epoch": 0.37617724778101497, + "grad_norm": 6.089579105377197, + "learning_rate": 9.34273393113834e-05, + "loss": 0.8828, + "step": 5552 + }, + { + "epoch": 0.37624500304898706, + "grad_norm": 5.47523307800293, + "learning_rate": 9.342597029228559e-05, + "loss": 0.8302, + "step": 5553 + }, + { + "epoch": 0.37631275831695915, + "grad_norm": 7.930117130279541, + "learning_rate": 9.342460127318777e-05, + "loss": 0.9565, + "step": 5554 + }, + { + "epoch": 0.37638051358493124, + "grad_norm": 6.38248348236084, + "learning_rate": 9.342323225408996e-05, + "loss": 0.8815, + "step": 5555 + }, + { + "epoch": 0.37644826885290333, + "grad_norm": 6.382977485656738, + "learning_rate": 9.342186323499214e-05, + "loss": 0.8472, + "step": 5556 + }, + { + "epoch": 0.3765160241208754, + "grad_norm": 6.029202461242676, + "learning_rate": 9.342049421589432e-05, + "loss": 0.8102, + "step": 5557 + }, + { + "epoch": 0.37658377938884746, + "grad_norm": 9.764986991882324, + "learning_rate": 9.34191251967965e-05, + "loss": 0.7955, + "step": 5558 + }, + { + "epoch": 0.37665153465681955, + "grad_norm": 8.568723678588867, + "learning_rate": 9.341775617769868e-05, + "loss": 1.0919, + "step": 5559 + }, + { + "epoch": 0.37671928992479164, + "grad_norm": 13.348160743713379, + "learning_rate": 9.341638715860087e-05, + "loss": 1.202, + "step": 5560 + }, + { + "epoch": 0.37678704519276374, + "grad_norm": 7.161466121673584, + "learning_rate": 9.341501813950306e-05, + "loss": 0.909, + "step": 5561 + }, + { + "epoch": 0.37685480046073583, + "grad_norm": 8.446759223937988, + "learning_rate": 9.341364912040524e-05, + "loss": 0.8871, + "step": 5562 + }, + { + "epoch": 0.3769225557287079, + "grad_norm": 7.228363037109375, + "learning_rate": 9.341228010130742e-05, + "loss": 0.961, + "step": 5563 + }, + { + "epoch": 0.37699031099668, + "grad_norm": 7.427947998046875, + "learning_rate": 9.341091108220961e-05, + "loss": 1.1086, + "step": 5564 + }, + { + "epoch": 0.3770580662646521, + "grad_norm": 8.64295482635498, + "learning_rate": 9.340954206311179e-05, + "loss": 0.9684, + "step": 5565 + }, + { + "epoch": 0.37712582153262414, + "grad_norm": 10.287007331848145, + "learning_rate": 9.340817304401397e-05, + "loss": 1.0202, + "step": 5566 + }, + { + "epoch": 0.37719357680059623, + "grad_norm": 7.803440570831299, + "learning_rate": 9.340680402491615e-05, + "loss": 0.8545, + "step": 5567 + }, + { + "epoch": 0.3772613320685683, + "grad_norm": 7.74205207824707, + "learning_rate": 9.340543500581833e-05, + "loss": 1.0585, + "step": 5568 + }, + { + "epoch": 0.3773290873365404, + "grad_norm": 6.5861053466796875, + "learning_rate": 9.340406598672053e-05, + "loss": 0.8838, + "step": 5569 + }, + { + "epoch": 0.3773968426045125, + "grad_norm": 7.1810431480407715, + "learning_rate": 9.34026969676227e-05, + "loss": 0.8669, + "step": 5570 + }, + { + "epoch": 0.3774645978724846, + "grad_norm": 7.8471808433532715, + "learning_rate": 9.340132794852489e-05, + "loss": 1.0412, + "step": 5571 + }, + { + "epoch": 0.3775323531404567, + "grad_norm": 7.740540027618408, + "learning_rate": 9.339995892942707e-05, + "loss": 0.944, + "step": 5572 + }, + { + "epoch": 0.3776001084084288, + "grad_norm": 7.580658912658691, + "learning_rate": 9.339858991032925e-05, + "loss": 0.8676, + "step": 5573 + }, + { + "epoch": 0.3776678636764008, + "grad_norm": 9.143624305725098, + "learning_rate": 9.339722089123144e-05, + "loss": 0.9752, + "step": 5574 + }, + { + "epoch": 0.3777356189443729, + "grad_norm": 8.142240524291992, + "learning_rate": 9.339585187213362e-05, + "loss": 0.9733, + "step": 5575 + }, + { + "epoch": 0.377803374212345, + "grad_norm": 8.034200668334961, + "learning_rate": 9.33944828530358e-05, + "loss": 0.9867, + "step": 5576 + }, + { + "epoch": 0.3778711294803171, + "grad_norm": 7.345114231109619, + "learning_rate": 9.339311383393798e-05, + "loss": 0.9401, + "step": 5577 + }, + { + "epoch": 0.3779388847482892, + "grad_norm": 7.522977828979492, + "learning_rate": 9.339174481484018e-05, + "loss": 0.965, + "step": 5578 + }, + { + "epoch": 0.3780066400162613, + "grad_norm": 6.02646541595459, + "learning_rate": 9.339037579574236e-05, + "loss": 0.9446, + "step": 5579 + }, + { + "epoch": 0.37807439528423337, + "grad_norm": 6.161006450653076, + "learning_rate": 9.338900677664454e-05, + "loss": 0.8803, + "step": 5580 + }, + { + "epoch": 0.37814215055220546, + "grad_norm": 6.318620681762695, + "learning_rate": 9.338763775754672e-05, + "loss": 0.646, + "step": 5581 + }, + { + "epoch": 0.3782099058201775, + "grad_norm": 5.793397903442383, + "learning_rate": 9.33862687384489e-05, + "loss": 0.7483, + "step": 5582 + }, + { + "epoch": 0.3782776610881496, + "grad_norm": 7.235496997833252, + "learning_rate": 9.338489971935109e-05, + "loss": 0.9768, + "step": 5583 + }, + { + "epoch": 0.3783454163561217, + "grad_norm": 6.682423114776611, + "learning_rate": 9.338353070025327e-05, + "loss": 0.8466, + "step": 5584 + }, + { + "epoch": 0.37841317162409377, + "grad_norm": 9.257969856262207, + "learning_rate": 9.338216168115545e-05, + "loss": 1.1546, + "step": 5585 + }, + { + "epoch": 0.37848092689206586, + "grad_norm": 7.223050594329834, + "learning_rate": 9.338079266205763e-05, + "loss": 0.7594, + "step": 5586 + }, + { + "epoch": 0.37854868216003795, + "grad_norm": 7.735831260681152, + "learning_rate": 9.337942364295983e-05, + "loss": 1.0392, + "step": 5587 + }, + { + "epoch": 0.37861643742801004, + "grad_norm": 6.439917087554932, + "learning_rate": 9.3378054623862e-05, + "loss": 1.0901, + "step": 5588 + }, + { + "epoch": 0.37868419269598214, + "grad_norm": 7.018877983093262, + "learning_rate": 9.337668560476419e-05, + "loss": 0.9918, + "step": 5589 + }, + { + "epoch": 0.37875194796395417, + "grad_norm": 5.858334541320801, + "learning_rate": 9.337531658566637e-05, + "loss": 0.6788, + "step": 5590 + }, + { + "epoch": 0.37881970323192626, + "grad_norm": 8.445877075195312, + "learning_rate": 9.337394756656856e-05, + "loss": 0.9777, + "step": 5591 + }, + { + "epoch": 0.37888745849989836, + "grad_norm": 6.7996439933776855, + "learning_rate": 9.337257854747074e-05, + "loss": 1.0086, + "step": 5592 + }, + { + "epoch": 0.37895521376787045, + "grad_norm": 6.983299255371094, + "learning_rate": 9.337120952837292e-05, + "loss": 0.9981, + "step": 5593 + }, + { + "epoch": 0.37902296903584254, + "grad_norm": 7.999194622039795, + "learning_rate": 9.336984050927511e-05, + "loss": 0.9166, + "step": 5594 + }, + { + "epoch": 0.37909072430381463, + "grad_norm": 7.698132038116455, + "learning_rate": 9.33684714901773e-05, + "loss": 0.8333, + "step": 5595 + }, + { + "epoch": 0.3791584795717867, + "grad_norm": 5.9128217697143555, + "learning_rate": 9.336710247107948e-05, + "loss": 0.7822, + "step": 5596 + }, + { + "epoch": 0.3792262348397588, + "grad_norm": 7.200125217437744, + "learning_rate": 9.336573345198167e-05, + "loss": 0.906, + "step": 5597 + }, + { + "epoch": 0.37929399010773085, + "grad_norm": 5.82244348526001, + "learning_rate": 9.336436443288385e-05, + "loss": 0.7376, + "step": 5598 + }, + { + "epoch": 0.37936174537570294, + "grad_norm": 6.775420188903809, + "learning_rate": 9.336299541378603e-05, + "loss": 0.6975, + "step": 5599 + }, + { + "epoch": 0.37942950064367503, + "grad_norm": 6.542332649230957, + "learning_rate": 9.336162639468821e-05, + "loss": 0.9788, + "step": 5600 + }, + { + "epoch": 0.3794972559116471, + "grad_norm": 6.677700519561768, + "learning_rate": 9.33602573755904e-05, + "loss": 0.79, + "step": 5601 + }, + { + "epoch": 0.3795650111796192, + "grad_norm": 8.561968803405762, + "learning_rate": 9.335888835649258e-05, + "loss": 0.9499, + "step": 5602 + }, + { + "epoch": 0.3796327664475913, + "grad_norm": 7.0320234298706055, + "learning_rate": 9.335751933739477e-05, + "loss": 0.865, + "step": 5603 + }, + { + "epoch": 0.3797005217155634, + "grad_norm": 7.31481409072876, + "learning_rate": 9.335615031829695e-05, + "loss": 1.0356, + "step": 5604 + }, + { + "epoch": 0.3797682769835355, + "grad_norm": 8.205849647521973, + "learning_rate": 9.335478129919913e-05, + "loss": 0.7836, + "step": 5605 + }, + { + "epoch": 0.3798360322515075, + "grad_norm": 6.882634162902832, + "learning_rate": 9.335341228010132e-05, + "loss": 0.8879, + "step": 5606 + }, + { + "epoch": 0.3799037875194796, + "grad_norm": 8.171550750732422, + "learning_rate": 9.33520432610035e-05, + "loss": 0.8849, + "step": 5607 + }, + { + "epoch": 0.3799715427874517, + "grad_norm": 7.184536457061768, + "learning_rate": 9.335067424190568e-05, + "loss": 0.8206, + "step": 5608 + }, + { + "epoch": 0.3800392980554238, + "grad_norm": 6.50775146484375, + "learning_rate": 9.334930522280786e-05, + "loss": 0.7802, + "step": 5609 + }, + { + "epoch": 0.3801070533233959, + "grad_norm": 7.537467002868652, + "learning_rate": 9.334793620371005e-05, + "loss": 1.1643, + "step": 5610 + }, + { + "epoch": 0.380174808591368, + "grad_norm": 6.740983486175537, + "learning_rate": 9.334656718461223e-05, + "loss": 0.686, + "step": 5611 + }, + { + "epoch": 0.3802425638593401, + "grad_norm": 6.01100492477417, + "learning_rate": 9.334519816551442e-05, + "loss": 0.8931, + "step": 5612 + }, + { + "epoch": 0.38031031912731217, + "grad_norm": 6.056909084320068, + "learning_rate": 9.33438291464166e-05, + "loss": 0.7861, + "step": 5613 + }, + { + "epoch": 0.38037807439528426, + "grad_norm": 6.271320343017578, + "learning_rate": 9.334246012731878e-05, + "loss": 0.8277, + "step": 5614 + }, + { + "epoch": 0.3804458296632563, + "grad_norm": 5.80530309677124, + "learning_rate": 9.334109110822097e-05, + "loss": 0.6765, + "step": 5615 + }, + { + "epoch": 0.3805135849312284, + "grad_norm": 6.031884670257568, + "learning_rate": 9.333972208912315e-05, + "loss": 0.6482, + "step": 5616 + }, + { + "epoch": 0.3805813401992005, + "grad_norm": 7.629202842712402, + "learning_rate": 9.333835307002533e-05, + "loss": 1.0782, + "step": 5617 + }, + { + "epoch": 0.38064909546717257, + "grad_norm": 8.182783126831055, + "learning_rate": 9.333698405092751e-05, + "loss": 0.8355, + "step": 5618 + }, + { + "epoch": 0.38071685073514466, + "grad_norm": 7.317597389221191, + "learning_rate": 9.33356150318297e-05, + "loss": 0.9035, + "step": 5619 + }, + { + "epoch": 0.38078460600311675, + "grad_norm": 8.783252716064453, + "learning_rate": 9.333424601273189e-05, + "loss": 1.1105, + "step": 5620 + }, + { + "epoch": 0.38085236127108885, + "grad_norm": 6.5179290771484375, + "learning_rate": 9.333287699363407e-05, + "loss": 0.7859, + "step": 5621 + }, + { + "epoch": 0.38092011653906094, + "grad_norm": 7.544989585876465, + "learning_rate": 9.333150797453625e-05, + "loss": 0.8601, + "step": 5622 + }, + { + "epoch": 0.380987871807033, + "grad_norm": 7.307798862457275, + "learning_rate": 9.333013895543843e-05, + "loss": 1.071, + "step": 5623 + }, + { + "epoch": 0.38105562707500507, + "grad_norm": 6.946296215057373, + "learning_rate": 9.332876993634062e-05, + "loss": 0.8445, + "step": 5624 + }, + { + "epoch": 0.38112338234297716, + "grad_norm": 8.099409103393555, + "learning_rate": 9.33274009172428e-05, + "loss": 0.9821, + "step": 5625 + }, + { + "epoch": 0.38119113761094925, + "grad_norm": 7.492844104766846, + "learning_rate": 9.332603189814498e-05, + "loss": 0.9696, + "step": 5626 + }, + { + "epoch": 0.38125889287892134, + "grad_norm": 6.632567882537842, + "learning_rate": 9.332466287904716e-05, + "loss": 0.8853, + "step": 5627 + }, + { + "epoch": 0.38132664814689343, + "grad_norm": 5.6510748863220215, + "learning_rate": 9.332329385994934e-05, + "loss": 0.8686, + "step": 5628 + }, + { + "epoch": 0.3813944034148655, + "grad_norm": 6.428233623504639, + "learning_rate": 9.332192484085154e-05, + "loss": 0.8912, + "step": 5629 + }, + { + "epoch": 0.3814621586828376, + "grad_norm": 8.161954879760742, + "learning_rate": 9.332055582175372e-05, + "loss": 0.888, + "step": 5630 + }, + { + "epoch": 0.38152991395080965, + "grad_norm": 7.743470191955566, + "learning_rate": 9.33191868026559e-05, + "loss": 0.8759, + "step": 5631 + }, + { + "epoch": 0.38159766921878174, + "grad_norm": 7.648897647857666, + "learning_rate": 9.331781778355808e-05, + "loss": 1.2331, + "step": 5632 + }, + { + "epoch": 0.38166542448675383, + "grad_norm": 7.748523235321045, + "learning_rate": 9.331644876446027e-05, + "loss": 1.1732, + "step": 5633 + }, + { + "epoch": 0.3817331797547259, + "grad_norm": 7.337912082672119, + "learning_rate": 9.331507974536245e-05, + "loss": 0.8043, + "step": 5634 + }, + { + "epoch": 0.381800935022698, + "grad_norm": 7.525491714477539, + "learning_rate": 9.331371072626463e-05, + "loss": 0.8762, + "step": 5635 + }, + { + "epoch": 0.3818686902906701, + "grad_norm": 7.477416515350342, + "learning_rate": 9.331234170716681e-05, + "loss": 0.9053, + "step": 5636 + }, + { + "epoch": 0.3819364455586422, + "grad_norm": 6.855381965637207, + "learning_rate": 9.331097268806899e-05, + "loss": 0.9124, + "step": 5637 + }, + { + "epoch": 0.3820042008266143, + "grad_norm": 7.663267135620117, + "learning_rate": 9.330960366897119e-05, + "loss": 0.8139, + "step": 5638 + }, + { + "epoch": 0.38207195609458633, + "grad_norm": 6.08014440536499, + "learning_rate": 9.330823464987337e-05, + "loss": 0.7084, + "step": 5639 + }, + { + "epoch": 0.3821397113625584, + "grad_norm": 7.804579734802246, + "learning_rate": 9.330686563077555e-05, + "loss": 1.206, + "step": 5640 + }, + { + "epoch": 0.3822074666305305, + "grad_norm": 7.1148552894592285, + "learning_rate": 9.330549661167774e-05, + "loss": 1.1221, + "step": 5641 + }, + { + "epoch": 0.3822752218985026, + "grad_norm": 6.727654457092285, + "learning_rate": 9.330412759257992e-05, + "loss": 1.025, + "step": 5642 + }, + { + "epoch": 0.3823429771664747, + "grad_norm": 7.059203147888184, + "learning_rate": 9.33027585734821e-05, + "loss": 1.0819, + "step": 5643 + }, + { + "epoch": 0.3824107324344468, + "grad_norm": 8.003962516784668, + "learning_rate": 9.33013895543843e-05, + "loss": 0.95, + "step": 5644 + }, + { + "epoch": 0.3824784877024189, + "grad_norm": 7.414921283721924, + "learning_rate": 9.330002053528647e-05, + "loss": 0.9279, + "step": 5645 + }, + { + "epoch": 0.38254624297039097, + "grad_norm": 5.462070465087891, + "learning_rate": 9.329865151618866e-05, + "loss": 0.8003, + "step": 5646 + }, + { + "epoch": 0.382613998238363, + "grad_norm": 7.23457145690918, + "learning_rate": 9.329728249709085e-05, + "loss": 0.8939, + "step": 5647 + }, + { + "epoch": 0.3826817535063351, + "grad_norm": 6.371642112731934, + "learning_rate": 9.329591347799303e-05, + "loss": 0.7125, + "step": 5648 + }, + { + "epoch": 0.3827495087743072, + "grad_norm": 6.855792999267578, + "learning_rate": 9.329454445889521e-05, + "loss": 0.745, + "step": 5649 + }, + { + "epoch": 0.3828172640422793, + "grad_norm": 6.945821762084961, + "learning_rate": 9.329317543979739e-05, + "loss": 1.1931, + "step": 5650 + }, + { + "epoch": 0.3828850193102514, + "grad_norm": 7.453825950622559, + "learning_rate": 9.329180642069957e-05, + "loss": 0.9513, + "step": 5651 + }, + { + "epoch": 0.38295277457822346, + "grad_norm": 7.040378093719482, + "learning_rate": 9.329043740160176e-05, + "loss": 0.8896, + "step": 5652 + }, + { + "epoch": 0.38302052984619556, + "grad_norm": 8.740388870239258, + "learning_rate": 9.328906838250394e-05, + "loss": 0.771, + "step": 5653 + }, + { + "epoch": 0.38308828511416765, + "grad_norm": 6.791045665740967, + "learning_rate": 9.328769936340613e-05, + "loss": 0.8835, + "step": 5654 + }, + { + "epoch": 0.3831560403821397, + "grad_norm": 6.560173511505127, + "learning_rate": 9.32863303443083e-05, + "loss": 0.8516, + "step": 5655 + }, + { + "epoch": 0.3832237956501118, + "grad_norm": 6.763086795806885, + "learning_rate": 9.32849613252105e-05, + "loss": 0.7976, + "step": 5656 + }, + { + "epoch": 0.38329155091808387, + "grad_norm": 5.9037370681762695, + "learning_rate": 9.328359230611268e-05, + "loss": 0.943, + "step": 5657 + }, + { + "epoch": 0.38335930618605596, + "grad_norm": 4.944889545440674, + "learning_rate": 9.328222328701486e-05, + "loss": 0.7913, + "step": 5658 + }, + { + "epoch": 0.38342706145402805, + "grad_norm": 8.701375007629395, + "learning_rate": 9.328085426791704e-05, + "loss": 0.8, + "step": 5659 + }, + { + "epoch": 0.38349481672200014, + "grad_norm": 7.1927289962768555, + "learning_rate": 9.327948524881922e-05, + "loss": 0.8888, + "step": 5660 + }, + { + "epoch": 0.38356257198997223, + "grad_norm": 8.781030654907227, + "learning_rate": 9.327811622972141e-05, + "loss": 0.951, + "step": 5661 + }, + { + "epoch": 0.3836303272579443, + "grad_norm": 7.27484655380249, + "learning_rate": 9.32767472106236e-05, + "loss": 0.8034, + "step": 5662 + }, + { + "epoch": 0.3836980825259164, + "grad_norm": 7.739286422729492, + "learning_rate": 9.327537819152578e-05, + "loss": 1.0769, + "step": 5663 + }, + { + "epoch": 0.38376583779388845, + "grad_norm": 7.0264973640441895, + "learning_rate": 9.327400917242796e-05, + "loss": 1.0652, + "step": 5664 + }, + { + "epoch": 0.38383359306186055, + "grad_norm": 6.887399673461914, + "learning_rate": 9.327264015333015e-05, + "loss": 0.7986, + "step": 5665 + }, + { + "epoch": 0.38390134832983264, + "grad_norm": 9.081385612487793, + "learning_rate": 9.327127113423233e-05, + "loss": 1.2249, + "step": 5666 + }, + { + "epoch": 0.38396910359780473, + "grad_norm": 7.966154098510742, + "learning_rate": 9.326990211513451e-05, + "loss": 0.8924, + "step": 5667 + }, + { + "epoch": 0.3840368588657768, + "grad_norm": 7.321074962615967, + "learning_rate": 9.326853309603669e-05, + "loss": 1.0022, + "step": 5668 + }, + { + "epoch": 0.3841046141337489, + "grad_norm": 7.060865879058838, + "learning_rate": 9.326716407693887e-05, + "loss": 0.8849, + "step": 5669 + }, + { + "epoch": 0.384172369401721, + "grad_norm": 7.009364604949951, + "learning_rate": 9.326579505784106e-05, + "loss": 0.9032, + "step": 5670 + }, + { + "epoch": 0.3842401246696931, + "grad_norm": 8.417223930358887, + "learning_rate": 9.326442603874325e-05, + "loss": 1.1162, + "step": 5671 + }, + { + "epoch": 0.38430787993766513, + "grad_norm": 6.774781227111816, + "learning_rate": 9.326305701964543e-05, + "loss": 0.773, + "step": 5672 + }, + { + "epoch": 0.3843756352056372, + "grad_norm": 6.5464911460876465, + "learning_rate": 9.32616880005476e-05, + "loss": 1.0374, + "step": 5673 + }, + { + "epoch": 0.3844433904736093, + "grad_norm": 6.618286609649658, + "learning_rate": 9.326031898144979e-05, + "loss": 0.9538, + "step": 5674 + }, + { + "epoch": 0.3845111457415814, + "grad_norm": 7.598459720611572, + "learning_rate": 9.325894996235198e-05, + "loss": 0.7811, + "step": 5675 + }, + { + "epoch": 0.3845789010095535, + "grad_norm": 6.456278324127197, + "learning_rate": 9.325758094325416e-05, + "loss": 1.055, + "step": 5676 + }, + { + "epoch": 0.3846466562775256, + "grad_norm": 7.940021991729736, + "learning_rate": 9.325621192415634e-05, + "loss": 1.1605, + "step": 5677 + }, + { + "epoch": 0.3847144115454977, + "grad_norm": 8.113789558410645, + "learning_rate": 9.325484290505852e-05, + "loss": 1.0405, + "step": 5678 + }, + { + "epoch": 0.3847821668134698, + "grad_norm": 7.918695449829102, + "learning_rate": 9.325347388596071e-05, + "loss": 0.8019, + "step": 5679 + }, + { + "epoch": 0.3848499220814418, + "grad_norm": 8.129600524902344, + "learning_rate": 9.32521048668629e-05, + "loss": 0.9581, + "step": 5680 + }, + { + "epoch": 0.3849176773494139, + "grad_norm": 6.6353230476379395, + "learning_rate": 9.325073584776508e-05, + "loss": 0.8932, + "step": 5681 + }, + { + "epoch": 0.384985432617386, + "grad_norm": 6.153687477111816, + "learning_rate": 9.324936682866726e-05, + "loss": 0.7646, + "step": 5682 + }, + { + "epoch": 0.3850531878853581, + "grad_norm": 6.837560653686523, + "learning_rate": 9.324799780956944e-05, + "loss": 0.8998, + "step": 5683 + }, + { + "epoch": 0.3851209431533302, + "grad_norm": 8.48747444152832, + "learning_rate": 9.324662879047163e-05, + "loss": 0.7761, + "step": 5684 + }, + { + "epoch": 0.38518869842130227, + "grad_norm": 10.725845336914062, + "learning_rate": 9.324525977137381e-05, + "loss": 1.1755, + "step": 5685 + }, + { + "epoch": 0.38525645368927436, + "grad_norm": 5.731265544891357, + "learning_rate": 9.324389075227599e-05, + "loss": 0.8974, + "step": 5686 + }, + { + "epoch": 0.38532420895724645, + "grad_norm": 8.479190826416016, + "learning_rate": 9.324252173317818e-05, + "loss": 0.9307, + "step": 5687 + }, + { + "epoch": 0.3853919642252185, + "grad_norm": 6.539626598358154, + "learning_rate": 9.324115271408037e-05, + "loss": 0.9463, + "step": 5688 + }, + { + "epoch": 0.3854597194931906, + "grad_norm": 7.225162029266357, + "learning_rate": 9.323978369498255e-05, + "loss": 0.8206, + "step": 5689 + }, + { + "epoch": 0.38552747476116267, + "grad_norm": 8.779760360717773, + "learning_rate": 9.323841467588474e-05, + "loss": 1.1208, + "step": 5690 + }, + { + "epoch": 0.38559523002913476, + "grad_norm": 7.580684185028076, + "learning_rate": 9.323704565678692e-05, + "loss": 0.7993, + "step": 5691 + }, + { + "epoch": 0.38566298529710685, + "grad_norm": 8.699392318725586, + "learning_rate": 9.32356766376891e-05, + "loss": 1.073, + "step": 5692 + }, + { + "epoch": 0.38573074056507894, + "grad_norm": 7.190006256103516, + "learning_rate": 9.32343076185913e-05, + "loss": 1.0204, + "step": 5693 + }, + { + "epoch": 0.38579849583305104, + "grad_norm": 9.289970397949219, + "learning_rate": 9.323293859949347e-05, + "loss": 0.7091, + "step": 5694 + }, + { + "epoch": 0.3858662511010231, + "grad_norm": 9.533041000366211, + "learning_rate": 9.323156958039565e-05, + "loss": 1.103, + "step": 5695 + }, + { + "epoch": 0.38593400636899516, + "grad_norm": 7.375948905944824, + "learning_rate": 9.323020056129783e-05, + "loss": 0.9815, + "step": 5696 + }, + { + "epoch": 0.38600176163696726, + "grad_norm": 8.580230712890625, + "learning_rate": 9.322883154220003e-05, + "loss": 1.1056, + "step": 5697 + }, + { + "epoch": 0.38606951690493935, + "grad_norm": 8.332324981689453, + "learning_rate": 9.322746252310221e-05, + "loss": 0.8809, + "step": 5698 + }, + { + "epoch": 0.38613727217291144, + "grad_norm": 6.298001289367676, + "learning_rate": 9.322609350400439e-05, + "loss": 0.9255, + "step": 5699 + }, + { + "epoch": 0.38620502744088353, + "grad_norm": 7.031383514404297, + "learning_rate": 9.322472448490657e-05, + "loss": 0.9196, + "step": 5700 + }, + { + "epoch": 0.3862727827088556, + "grad_norm": 6.791995048522949, + "learning_rate": 9.322335546580875e-05, + "loss": 0.8184, + "step": 5701 + }, + { + "epoch": 0.3863405379768277, + "grad_norm": 7.81817102432251, + "learning_rate": 9.322198644671094e-05, + "loss": 0.9331, + "step": 5702 + }, + { + "epoch": 0.3864082932447998, + "grad_norm": 7.933851718902588, + "learning_rate": 9.322061742761312e-05, + "loss": 0.872, + "step": 5703 + }, + { + "epoch": 0.38647604851277184, + "grad_norm": 7.805744171142578, + "learning_rate": 9.32192484085153e-05, + "loss": 0.8378, + "step": 5704 + }, + { + "epoch": 0.38654380378074393, + "grad_norm": 9.205484390258789, + "learning_rate": 9.321787938941749e-05, + "loss": 0.9853, + "step": 5705 + }, + { + "epoch": 0.386611559048716, + "grad_norm": 7.313584804534912, + "learning_rate": 9.321651037031967e-05, + "loss": 0.7576, + "step": 5706 + }, + { + "epoch": 0.3866793143166881, + "grad_norm": 10.125847816467285, + "learning_rate": 9.321514135122186e-05, + "loss": 0.8497, + "step": 5707 + }, + { + "epoch": 0.3867470695846602, + "grad_norm": 7.392860412597656, + "learning_rate": 9.321377233212404e-05, + "loss": 0.9151, + "step": 5708 + }, + { + "epoch": 0.3868148248526323, + "grad_norm": 8.305766105651855, + "learning_rate": 9.321240331302622e-05, + "loss": 1.1604, + "step": 5709 + }, + { + "epoch": 0.3868825801206044, + "grad_norm": 5.897716522216797, + "learning_rate": 9.32110342939284e-05, + "loss": 0.6471, + "step": 5710 + }, + { + "epoch": 0.3869503353885765, + "grad_norm": 6.7423200607299805, + "learning_rate": 9.32096652748306e-05, + "loss": 0.7968, + "step": 5711 + }, + { + "epoch": 0.3870180906565485, + "grad_norm": 8.511382102966309, + "learning_rate": 9.320829625573277e-05, + "loss": 1.1875, + "step": 5712 + }, + { + "epoch": 0.3870858459245206, + "grad_norm": 8.105764389038086, + "learning_rate": 9.320692723663495e-05, + "loss": 1.0717, + "step": 5713 + }, + { + "epoch": 0.3871536011924927, + "grad_norm": 6.555315971374512, + "learning_rate": 9.320555821753714e-05, + "loss": 0.7491, + "step": 5714 + }, + { + "epoch": 0.3872213564604648, + "grad_norm": 7.481291770935059, + "learning_rate": 9.320418919843932e-05, + "loss": 1.0057, + "step": 5715 + }, + { + "epoch": 0.3872891117284369, + "grad_norm": 8.126254081726074, + "learning_rate": 9.320282017934151e-05, + "loss": 0.94, + "step": 5716 + }, + { + "epoch": 0.387356866996409, + "grad_norm": 5.760578632354736, + "learning_rate": 9.320145116024369e-05, + "loss": 0.8861, + "step": 5717 + }, + { + "epoch": 0.38742462226438107, + "grad_norm": 7.664496421813965, + "learning_rate": 9.320008214114587e-05, + "loss": 0.8005, + "step": 5718 + }, + { + "epoch": 0.38749237753235316, + "grad_norm": 8.473989486694336, + "learning_rate": 9.319871312204805e-05, + "loss": 0.8196, + "step": 5719 + }, + { + "epoch": 0.38756013280032525, + "grad_norm": 8.553350448608398, + "learning_rate": 9.319734410295024e-05, + "loss": 0.8356, + "step": 5720 + }, + { + "epoch": 0.3876278880682973, + "grad_norm": 7.713801860809326, + "learning_rate": 9.319597508385242e-05, + "loss": 1.0326, + "step": 5721 + }, + { + "epoch": 0.3876956433362694, + "grad_norm": 7.013209819793701, + "learning_rate": 9.31946060647546e-05, + "loss": 1.1149, + "step": 5722 + }, + { + "epoch": 0.38776339860424147, + "grad_norm": 8.171217918395996, + "learning_rate": 9.319323704565679e-05, + "loss": 1.141, + "step": 5723 + }, + { + "epoch": 0.38783115387221356, + "grad_norm": 8.459320068359375, + "learning_rate": 9.319186802655897e-05, + "loss": 0.8859, + "step": 5724 + }, + { + "epoch": 0.38789890914018565, + "grad_norm": 6.681031227111816, + "learning_rate": 9.319049900746116e-05, + "loss": 0.8746, + "step": 5725 + }, + { + "epoch": 0.38796666440815775, + "grad_norm": 6.796359539031982, + "learning_rate": 9.318912998836334e-05, + "loss": 1.0891, + "step": 5726 + }, + { + "epoch": 0.38803441967612984, + "grad_norm": 6.819052696228027, + "learning_rate": 9.318776096926552e-05, + "loss": 0.8087, + "step": 5727 + }, + { + "epoch": 0.38810217494410193, + "grad_norm": 6.746981620788574, + "learning_rate": 9.31863919501677e-05, + "loss": 1.1591, + "step": 5728 + }, + { + "epoch": 0.38816993021207397, + "grad_norm": 7.767449855804443, + "learning_rate": 9.318502293106988e-05, + "loss": 1.1012, + "step": 5729 + }, + { + "epoch": 0.38823768548004606, + "grad_norm": 7.967437744140625, + "learning_rate": 9.318365391197207e-05, + "loss": 1.0262, + "step": 5730 + }, + { + "epoch": 0.38830544074801815, + "grad_norm": 6.053138256072998, + "learning_rate": 9.318228489287426e-05, + "loss": 0.6821, + "step": 5731 + }, + { + "epoch": 0.38837319601599024, + "grad_norm": 7.298278331756592, + "learning_rate": 9.318091587377644e-05, + "loss": 1.0869, + "step": 5732 + }, + { + "epoch": 0.38844095128396233, + "grad_norm": 6.621678352355957, + "learning_rate": 9.317954685467863e-05, + "loss": 0.9388, + "step": 5733 + }, + { + "epoch": 0.3885087065519344, + "grad_norm": 7.588876247406006, + "learning_rate": 9.317817783558081e-05, + "loss": 0.8775, + "step": 5734 + }, + { + "epoch": 0.3885764618199065, + "grad_norm": 6.0856242179870605, + "learning_rate": 9.317680881648299e-05, + "loss": 0.8057, + "step": 5735 + }, + { + "epoch": 0.3886442170878786, + "grad_norm": 6.144415378570557, + "learning_rate": 9.317543979738518e-05, + "loss": 0.9205, + "step": 5736 + }, + { + "epoch": 0.38871197235585064, + "grad_norm": 8.25857925415039, + "learning_rate": 9.317407077828736e-05, + "loss": 0.9608, + "step": 5737 + }, + { + "epoch": 0.38877972762382274, + "grad_norm": 9.107572555541992, + "learning_rate": 9.317270175918954e-05, + "loss": 0.7823, + "step": 5738 + }, + { + "epoch": 0.3888474828917948, + "grad_norm": 6.879073619842529, + "learning_rate": 9.317133274009174e-05, + "loss": 0.9137, + "step": 5739 + }, + { + "epoch": 0.3889152381597669, + "grad_norm": 6.114928245544434, + "learning_rate": 9.316996372099392e-05, + "loss": 0.9713, + "step": 5740 + }, + { + "epoch": 0.388982993427739, + "grad_norm": 6.454460620880127, + "learning_rate": 9.31685947018961e-05, + "loss": 0.9224, + "step": 5741 + }, + { + "epoch": 0.3890507486957111, + "grad_norm": 7.231433868408203, + "learning_rate": 9.316722568279828e-05, + "loss": 0.9087, + "step": 5742 + }, + { + "epoch": 0.3891185039636832, + "grad_norm": 7.442675590515137, + "learning_rate": 9.316585666370047e-05, + "loss": 0.9038, + "step": 5743 + }, + { + "epoch": 0.3891862592316553, + "grad_norm": 6.472715854644775, + "learning_rate": 9.316448764460265e-05, + "loss": 0.7429, + "step": 5744 + }, + { + "epoch": 0.3892540144996273, + "grad_norm": 7.20242977142334, + "learning_rate": 9.316311862550483e-05, + "loss": 0.9592, + "step": 5745 + }, + { + "epoch": 0.3893217697675994, + "grad_norm": 5.8578715324401855, + "learning_rate": 9.316174960640701e-05, + "loss": 0.901, + "step": 5746 + }, + { + "epoch": 0.3893895250355715, + "grad_norm": 5.1851983070373535, + "learning_rate": 9.31603805873092e-05, + "loss": 0.6872, + "step": 5747 + }, + { + "epoch": 0.3894572803035436, + "grad_norm": 5.5100531578063965, + "learning_rate": 9.315901156821139e-05, + "loss": 0.8329, + "step": 5748 + }, + { + "epoch": 0.3895250355715157, + "grad_norm": 7.71047830581665, + "learning_rate": 9.315764254911357e-05, + "loss": 0.8726, + "step": 5749 + }, + { + "epoch": 0.3895927908394878, + "grad_norm": 5.928194522857666, + "learning_rate": 9.315627353001575e-05, + "loss": 0.6503, + "step": 5750 + }, + { + "epoch": 0.38966054610745987, + "grad_norm": 6.636775016784668, + "learning_rate": 9.315490451091793e-05, + "loss": 0.8905, + "step": 5751 + }, + { + "epoch": 0.38972830137543196, + "grad_norm": 7.5196027755737305, + "learning_rate": 9.315353549182012e-05, + "loss": 0.9333, + "step": 5752 + }, + { + "epoch": 0.389796056643404, + "grad_norm": 7.1420087814331055, + "learning_rate": 9.31521664727223e-05, + "loss": 0.777, + "step": 5753 + }, + { + "epoch": 0.3898638119113761, + "grad_norm": 6.713923931121826, + "learning_rate": 9.315079745362448e-05, + "loss": 0.7667, + "step": 5754 + }, + { + "epoch": 0.3899315671793482, + "grad_norm": 6.81540584564209, + "learning_rate": 9.314942843452666e-05, + "loss": 0.8838, + "step": 5755 + }, + { + "epoch": 0.3899993224473203, + "grad_norm": 7.069910049438477, + "learning_rate": 9.314805941542885e-05, + "loss": 0.9638, + "step": 5756 + }, + { + "epoch": 0.39006707771529237, + "grad_norm": 6.350069999694824, + "learning_rate": 9.314669039633104e-05, + "loss": 0.8163, + "step": 5757 + }, + { + "epoch": 0.39013483298326446, + "grad_norm": 5.552088737487793, + "learning_rate": 9.314532137723322e-05, + "loss": 0.7061, + "step": 5758 + }, + { + "epoch": 0.39020258825123655, + "grad_norm": 7.8301167488098145, + "learning_rate": 9.31439523581354e-05, + "loss": 1.0415, + "step": 5759 + }, + { + "epoch": 0.39027034351920864, + "grad_norm": 7.079575538635254, + "learning_rate": 9.314258333903758e-05, + "loss": 0.7247, + "step": 5760 + }, + { + "epoch": 0.3903380987871807, + "grad_norm": 7.282689571380615, + "learning_rate": 9.314121431993976e-05, + "loss": 0.8361, + "step": 5761 + }, + { + "epoch": 0.39040585405515277, + "grad_norm": 7.18900203704834, + "learning_rate": 9.313984530084195e-05, + "loss": 1.0206, + "step": 5762 + }, + { + "epoch": 0.39047360932312486, + "grad_norm": 5.926472187042236, + "learning_rate": 9.313847628174413e-05, + "loss": 0.8551, + "step": 5763 + }, + { + "epoch": 0.39054136459109695, + "grad_norm": 8.176214218139648, + "learning_rate": 9.313710726264631e-05, + "loss": 0.8106, + "step": 5764 + }, + { + "epoch": 0.39060911985906904, + "grad_norm": 7.093206882476807, + "learning_rate": 9.31357382435485e-05, + "loss": 0.9986, + "step": 5765 + }, + { + "epoch": 0.39067687512704113, + "grad_norm": 8.531807899475098, + "learning_rate": 9.313436922445069e-05, + "loss": 0.9368, + "step": 5766 + }, + { + "epoch": 0.3907446303950132, + "grad_norm": 6.479072570800781, + "learning_rate": 9.313300020535287e-05, + "loss": 0.858, + "step": 5767 + }, + { + "epoch": 0.3908123856629853, + "grad_norm": 6.749851226806641, + "learning_rate": 9.313163118625505e-05, + "loss": 0.638, + "step": 5768 + }, + { + "epoch": 0.3908801409309574, + "grad_norm": 8.79270076751709, + "learning_rate": 9.313026216715723e-05, + "loss": 0.7734, + "step": 5769 + }, + { + "epoch": 0.39094789619892945, + "grad_norm": 6.972496509552002, + "learning_rate": 9.312889314805941e-05, + "loss": 0.8557, + "step": 5770 + }, + { + "epoch": 0.39101565146690154, + "grad_norm": 6.48140811920166, + "learning_rate": 9.31275241289616e-05, + "loss": 0.8183, + "step": 5771 + }, + { + "epoch": 0.39108340673487363, + "grad_norm": 6.5230536460876465, + "learning_rate": 9.312615510986378e-05, + "loss": 0.7863, + "step": 5772 + }, + { + "epoch": 0.3911511620028457, + "grad_norm": 6.982974052429199, + "learning_rate": 9.312478609076597e-05, + "loss": 0.9604, + "step": 5773 + }, + { + "epoch": 0.3912189172708178, + "grad_norm": 7.524245262145996, + "learning_rate": 9.312341707166815e-05, + "loss": 0.8145, + "step": 5774 + }, + { + "epoch": 0.3912866725387899, + "grad_norm": 7.770455837249756, + "learning_rate": 9.312204805257034e-05, + "loss": 0.858, + "step": 5775 + }, + { + "epoch": 0.391354427806762, + "grad_norm": 6.145147800445557, + "learning_rate": 9.312067903347252e-05, + "loss": 0.6833, + "step": 5776 + }, + { + "epoch": 0.3914221830747341, + "grad_norm": 7.118679523468018, + "learning_rate": 9.31193100143747e-05, + "loss": 0.9945, + "step": 5777 + }, + { + "epoch": 0.3914899383427061, + "grad_norm": 7.793931007385254, + "learning_rate": 9.311794099527688e-05, + "loss": 1.0227, + "step": 5778 + }, + { + "epoch": 0.3915576936106782, + "grad_norm": 6.189599514007568, + "learning_rate": 9.311657197617907e-05, + "loss": 1.0355, + "step": 5779 + }, + { + "epoch": 0.3916254488786503, + "grad_norm": 8.16638469696045, + "learning_rate": 9.311520295708125e-05, + "loss": 0.998, + "step": 5780 + }, + { + "epoch": 0.3916932041466224, + "grad_norm": 7.059429168701172, + "learning_rate": 9.311383393798343e-05, + "loss": 0.8641, + "step": 5781 + }, + { + "epoch": 0.3917609594145945, + "grad_norm": 9.555390357971191, + "learning_rate": 9.311246491888563e-05, + "loss": 0.9243, + "step": 5782 + }, + { + "epoch": 0.3918287146825666, + "grad_norm": 7.566288948059082, + "learning_rate": 9.311109589978781e-05, + "loss": 0.8025, + "step": 5783 + }, + { + "epoch": 0.3918964699505387, + "grad_norm": 8.842116355895996, + "learning_rate": 9.310972688068999e-05, + "loss": 0.8875, + "step": 5784 + }, + { + "epoch": 0.39196422521851076, + "grad_norm": 7.426934719085693, + "learning_rate": 9.310835786159218e-05, + "loss": 0.8116, + "step": 5785 + }, + { + "epoch": 0.3920319804864828, + "grad_norm": 7.222875595092773, + "learning_rate": 9.310698884249436e-05, + "loss": 0.8286, + "step": 5786 + }, + { + "epoch": 0.3920997357544549, + "grad_norm": 8.364874839782715, + "learning_rate": 9.310561982339654e-05, + "loss": 1.1628, + "step": 5787 + }, + { + "epoch": 0.392167491022427, + "grad_norm": 6.4203104972839355, + "learning_rate": 9.310425080429872e-05, + "loss": 0.8501, + "step": 5788 + }, + { + "epoch": 0.3922352462903991, + "grad_norm": 9.124777793884277, + "learning_rate": 9.310288178520092e-05, + "loss": 1.1962, + "step": 5789 + }, + { + "epoch": 0.39230300155837117, + "grad_norm": 7.944338321685791, + "learning_rate": 9.31015127661031e-05, + "loss": 1.3275, + "step": 5790 + }, + { + "epoch": 0.39237075682634326, + "grad_norm": 7.256115913391113, + "learning_rate": 9.310014374700528e-05, + "loss": 0.8594, + "step": 5791 + }, + { + "epoch": 0.39243851209431535, + "grad_norm": 6.452229976654053, + "learning_rate": 9.309877472790746e-05, + "loss": 0.7744, + "step": 5792 + }, + { + "epoch": 0.39250626736228744, + "grad_norm": 8.207562446594238, + "learning_rate": 9.309740570880964e-05, + "loss": 1.0615, + "step": 5793 + }, + { + "epoch": 0.3925740226302595, + "grad_norm": 6.627633571624756, + "learning_rate": 9.309603668971183e-05, + "loss": 1.1951, + "step": 5794 + }, + { + "epoch": 0.39264177789823157, + "grad_norm": 6.888060092926025, + "learning_rate": 9.309466767061401e-05, + "loss": 0.7771, + "step": 5795 + }, + { + "epoch": 0.39270953316620366, + "grad_norm": 5.327685356140137, + "learning_rate": 9.30932986515162e-05, + "loss": 0.6568, + "step": 5796 + }, + { + "epoch": 0.39277728843417575, + "grad_norm": 7.033134460449219, + "learning_rate": 9.309192963241837e-05, + "loss": 0.8601, + "step": 5797 + }, + { + "epoch": 0.39284504370214784, + "grad_norm": 6.694526195526123, + "learning_rate": 9.309056061332057e-05, + "loss": 0.9084, + "step": 5798 + }, + { + "epoch": 0.39291279897011994, + "grad_norm": 6.9023027420043945, + "learning_rate": 9.308919159422275e-05, + "loss": 0.8983, + "step": 5799 + }, + { + "epoch": 0.39298055423809203, + "grad_norm": 9.527570724487305, + "learning_rate": 9.308782257512493e-05, + "loss": 0.9773, + "step": 5800 + }, + { + "epoch": 0.3930483095060641, + "grad_norm": 7.497427940368652, + "learning_rate": 9.308645355602711e-05, + "loss": 0.9713, + "step": 5801 + }, + { + "epoch": 0.39311606477403616, + "grad_norm": 8.860198020935059, + "learning_rate": 9.308508453692929e-05, + "loss": 0.8866, + "step": 5802 + }, + { + "epoch": 0.39318382004200825, + "grad_norm": 7.34425687789917, + "learning_rate": 9.308371551783148e-05, + "loss": 0.9442, + "step": 5803 + }, + { + "epoch": 0.39325157530998034, + "grad_norm": 10.30170726776123, + "learning_rate": 9.308234649873366e-05, + "loss": 0.8957, + "step": 5804 + }, + { + "epoch": 0.39331933057795243, + "grad_norm": 8.141131401062012, + "learning_rate": 9.308097747963584e-05, + "loss": 1.1858, + "step": 5805 + }, + { + "epoch": 0.3933870858459245, + "grad_norm": 7.154587268829346, + "learning_rate": 9.307960846053802e-05, + "loss": 0.9056, + "step": 5806 + }, + { + "epoch": 0.3934548411138966, + "grad_norm": 6.086984634399414, + "learning_rate": 9.30782394414402e-05, + "loss": 0.8943, + "step": 5807 + }, + { + "epoch": 0.3935225963818687, + "grad_norm": 6.48331880569458, + "learning_rate": 9.30768704223424e-05, + "loss": 0.8631, + "step": 5808 + }, + { + "epoch": 0.3935903516498408, + "grad_norm": 9.434244155883789, + "learning_rate": 9.307550140324458e-05, + "loss": 1.0951, + "step": 5809 + }, + { + "epoch": 0.39365810691781283, + "grad_norm": 7.943146228790283, + "learning_rate": 9.307413238414676e-05, + "loss": 0.9073, + "step": 5810 + }, + { + "epoch": 0.3937258621857849, + "grad_norm": 6.200165271759033, + "learning_rate": 9.307276336504894e-05, + "loss": 0.8406, + "step": 5811 + }, + { + "epoch": 0.393793617453757, + "grad_norm": 6.221752643585205, + "learning_rate": 9.307139434595113e-05, + "loss": 0.9181, + "step": 5812 + }, + { + "epoch": 0.3938613727217291, + "grad_norm": 6.645714282989502, + "learning_rate": 9.307002532685331e-05, + "loss": 0.8508, + "step": 5813 + }, + { + "epoch": 0.3939291279897012, + "grad_norm": 7.978506088256836, + "learning_rate": 9.30686563077555e-05, + "loss": 1.1544, + "step": 5814 + }, + { + "epoch": 0.3939968832576733, + "grad_norm": 6.967216968536377, + "learning_rate": 9.306728728865767e-05, + "loss": 0.7956, + "step": 5815 + }, + { + "epoch": 0.3940646385256454, + "grad_norm": 9.8863525390625, + "learning_rate": 9.306591826955986e-05, + "loss": 0.9111, + "step": 5816 + }, + { + "epoch": 0.3941323937936175, + "grad_norm": 7.202954292297363, + "learning_rate": 9.306454925046205e-05, + "loss": 0.9267, + "step": 5817 + }, + { + "epoch": 0.3942001490615895, + "grad_norm": 6.386566162109375, + "learning_rate": 9.306318023136423e-05, + "loss": 0.9494, + "step": 5818 + }, + { + "epoch": 0.3942679043295616, + "grad_norm": 6.505050182342529, + "learning_rate": 9.306181121226641e-05, + "loss": 0.904, + "step": 5819 + }, + { + "epoch": 0.3943356595975337, + "grad_norm": 7.091371536254883, + "learning_rate": 9.306044219316859e-05, + "loss": 0.7305, + "step": 5820 + }, + { + "epoch": 0.3944034148655058, + "grad_norm": 7.322198390960693, + "learning_rate": 9.305907317407078e-05, + "loss": 0.8101, + "step": 5821 + }, + { + "epoch": 0.3944711701334779, + "grad_norm": 6.345345497131348, + "learning_rate": 9.305770415497296e-05, + "loss": 0.7821, + "step": 5822 + }, + { + "epoch": 0.39453892540144997, + "grad_norm": 6.418498992919922, + "learning_rate": 9.305633513587514e-05, + "loss": 0.8844, + "step": 5823 + }, + { + "epoch": 0.39460668066942206, + "grad_norm": 7.375457763671875, + "learning_rate": 9.305496611677733e-05, + "loss": 1.0099, + "step": 5824 + }, + { + "epoch": 0.39467443593739415, + "grad_norm": 7.758962154388428, + "learning_rate": 9.305359709767952e-05, + "loss": 1.1459, + "step": 5825 + }, + { + "epoch": 0.39474219120536624, + "grad_norm": 9.552523612976074, + "learning_rate": 9.30522280785817e-05, + "loss": 1.0285, + "step": 5826 + }, + { + "epoch": 0.3948099464733383, + "grad_norm": 7.053111553192139, + "learning_rate": 9.305085905948388e-05, + "loss": 0.8006, + "step": 5827 + }, + { + "epoch": 0.39487770174131037, + "grad_norm": 7.239533424377441, + "learning_rate": 9.304949004038607e-05, + "loss": 0.9594, + "step": 5828 + }, + { + "epoch": 0.39494545700928246, + "grad_norm": 5.681763172149658, + "learning_rate": 9.304812102128825e-05, + "loss": 0.7517, + "step": 5829 + }, + { + "epoch": 0.39501321227725456, + "grad_norm": 7.394958019256592, + "learning_rate": 9.304675200219043e-05, + "loss": 0.9656, + "step": 5830 + }, + { + "epoch": 0.39508096754522665, + "grad_norm": 5.894937515258789, + "learning_rate": 9.304538298309263e-05, + "loss": 0.7568, + "step": 5831 + }, + { + "epoch": 0.39514872281319874, + "grad_norm": 7.866422653198242, + "learning_rate": 9.304401396399481e-05, + "loss": 1.1937, + "step": 5832 + }, + { + "epoch": 0.39521647808117083, + "grad_norm": 6.807196140289307, + "learning_rate": 9.304264494489699e-05, + "loss": 0.7785, + "step": 5833 + }, + { + "epoch": 0.3952842333491429, + "grad_norm": 7.117645740509033, + "learning_rate": 9.304127592579917e-05, + "loss": 0.8462, + "step": 5834 + }, + { + "epoch": 0.39535198861711496, + "grad_norm": 5.873225212097168, + "learning_rate": 9.303990690670136e-05, + "loss": 0.8372, + "step": 5835 + }, + { + "epoch": 0.39541974388508705, + "grad_norm": 6.9400410652160645, + "learning_rate": 9.303853788760354e-05, + "loss": 0.8154, + "step": 5836 + }, + { + "epoch": 0.39548749915305914, + "grad_norm": 5.7771100997924805, + "learning_rate": 9.303716886850572e-05, + "loss": 0.7411, + "step": 5837 + }, + { + "epoch": 0.39555525442103123, + "grad_norm": 6.784138202667236, + "learning_rate": 9.30357998494079e-05, + "loss": 1.062, + "step": 5838 + }, + { + "epoch": 0.3956230096890033, + "grad_norm": 7.726989269256592, + "learning_rate": 9.303443083031008e-05, + "loss": 0.9726, + "step": 5839 + }, + { + "epoch": 0.3956907649569754, + "grad_norm": 6.342170715332031, + "learning_rate": 9.303306181121228e-05, + "loss": 0.8388, + "step": 5840 + }, + { + "epoch": 0.3957585202249475, + "grad_norm": 8.047952651977539, + "learning_rate": 9.303169279211446e-05, + "loss": 1.0734, + "step": 5841 + }, + { + "epoch": 0.3958262754929196, + "grad_norm": 7.073848724365234, + "learning_rate": 9.303032377301664e-05, + "loss": 0.6968, + "step": 5842 + }, + { + "epoch": 0.39589403076089164, + "grad_norm": 7.159115791320801, + "learning_rate": 9.302895475391882e-05, + "loss": 0.7991, + "step": 5843 + }, + { + "epoch": 0.3959617860288637, + "grad_norm": 6.908319473266602, + "learning_rate": 9.302758573482101e-05, + "loss": 0.9283, + "step": 5844 + }, + { + "epoch": 0.3960295412968358, + "grad_norm": 6.5849690437316895, + "learning_rate": 9.30262167157232e-05, + "loss": 0.8258, + "step": 5845 + }, + { + "epoch": 0.3960972965648079, + "grad_norm": 8.763665199279785, + "learning_rate": 9.302484769662537e-05, + "loss": 0.7734, + "step": 5846 + }, + { + "epoch": 0.39616505183278, + "grad_norm": 7.134938716888428, + "learning_rate": 9.302347867752755e-05, + "loss": 0.8084, + "step": 5847 + }, + { + "epoch": 0.3962328071007521, + "grad_norm": 6.418683052062988, + "learning_rate": 9.302210965842973e-05, + "loss": 0.8225, + "step": 5848 + }, + { + "epoch": 0.3963005623687242, + "grad_norm": 7.124704837799072, + "learning_rate": 9.302074063933193e-05, + "loss": 0.6787, + "step": 5849 + }, + { + "epoch": 0.3963683176366963, + "grad_norm": 6.653177261352539, + "learning_rate": 9.301937162023411e-05, + "loss": 0.8026, + "step": 5850 + }, + { + "epoch": 0.3964360729046683, + "grad_norm": 7.981827735900879, + "learning_rate": 9.301800260113629e-05, + "loss": 0.9281, + "step": 5851 + }, + { + "epoch": 0.3965038281726404, + "grad_norm": 5.823386192321777, + "learning_rate": 9.301663358203847e-05, + "loss": 0.8158, + "step": 5852 + }, + { + "epoch": 0.3965715834406125, + "grad_norm": 8.96346664428711, + "learning_rate": 9.301526456294066e-05, + "loss": 1.0875, + "step": 5853 + }, + { + "epoch": 0.3966393387085846, + "grad_norm": 6.114203929901123, + "learning_rate": 9.301389554384284e-05, + "loss": 0.7896, + "step": 5854 + }, + { + "epoch": 0.3967070939765567, + "grad_norm": 6.386680603027344, + "learning_rate": 9.301252652474502e-05, + "loss": 1.0199, + "step": 5855 + }, + { + "epoch": 0.39677484924452877, + "grad_norm": 8.502519607543945, + "learning_rate": 9.30111575056472e-05, + "loss": 1.0127, + "step": 5856 + }, + { + "epoch": 0.39684260451250086, + "grad_norm": 6.875016689300537, + "learning_rate": 9.300978848654938e-05, + "loss": 0.8263, + "step": 5857 + }, + { + "epoch": 0.39691035978047295, + "grad_norm": 8.344440460205078, + "learning_rate": 9.300841946745158e-05, + "loss": 1.0351, + "step": 5858 + }, + { + "epoch": 0.396978115048445, + "grad_norm": 6.42828893661499, + "learning_rate": 9.300705044835376e-05, + "loss": 0.8503, + "step": 5859 + }, + { + "epoch": 0.3970458703164171, + "grad_norm": 6.0403547286987305, + "learning_rate": 9.300568142925594e-05, + "loss": 0.9078, + "step": 5860 + }, + { + "epoch": 0.3971136255843892, + "grad_norm": 5.010101795196533, + "learning_rate": 9.300431241015812e-05, + "loss": 0.8215, + "step": 5861 + }, + { + "epoch": 0.39718138085236127, + "grad_norm": 6.132750988006592, + "learning_rate": 9.30029433910603e-05, + "loss": 0.7887, + "step": 5862 + }, + { + "epoch": 0.39724913612033336, + "grad_norm": 6.075906753540039, + "learning_rate": 9.30015743719625e-05, + "loss": 0.822, + "step": 5863 + }, + { + "epoch": 0.39731689138830545, + "grad_norm": 8.225122451782227, + "learning_rate": 9.300020535286467e-05, + "loss": 0.7771, + "step": 5864 + }, + { + "epoch": 0.39738464665627754, + "grad_norm": 6.863472938537598, + "learning_rate": 9.299883633376685e-05, + "loss": 0.9185, + "step": 5865 + }, + { + "epoch": 0.39745240192424963, + "grad_norm": 7.280022144317627, + "learning_rate": 9.299746731466903e-05, + "loss": 0.8206, + "step": 5866 + }, + { + "epoch": 0.39752015719222167, + "grad_norm": 5.646960258483887, + "learning_rate": 9.299609829557123e-05, + "loss": 0.7897, + "step": 5867 + }, + { + "epoch": 0.39758791246019376, + "grad_norm": 8.248296737670898, + "learning_rate": 9.299472927647341e-05, + "loss": 0.802, + "step": 5868 + }, + { + "epoch": 0.39765566772816585, + "grad_norm": 6.389570236206055, + "learning_rate": 9.299336025737559e-05, + "loss": 0.7744, + "step": 5869 + }, + { + "epoch": 0.39772342299613794, + "grad_norm": 6.452336311340332, + "learning_rate": 9.299199123827777e-05, + "loss": 0.9003, + "step": 5870 + }, + { + "epoch": 0.39779117826411003, + "grad_norm": 6.798274040222168, + "learning_rate": 9.299062221917995e-05, + "loss": 0.9585, + "step": 5871 + }, + { + "epoch": 0.3978589335320821, + "grad_norm": 6.267078876495361, + "learning_rate": 9.298925320008214e-05, + "loss": 0.7971, + "step": 5872 + }, + { + "epoch": 0.3979266888000542, + "grad_norm": 7.0529069900512695, + "learning_rate": 9.298788418098432e-05, + "loss": 0.9384, + "step": 5873 + }, + { + "epoch": 0.3979944440680263, + "grad_norm": 6.686244964599609, + "learning_rate": 9.298651516188652e-05, + "loss": 0.8589, + "step": 5874 + }, + { + "epoch": 0.3980621993359984, + "grad_norm": 7.230234622955322, + "learning_rate": 9.29851461427887e-05, + "loss": 0.8638, + "step": 5875 + }, + { + "epoch": 0.39812995460397044, + "grad_norm": 8.14789867401123, + "learning_rate": 9.298377712369088e-05, + "loss": 0.7231, + "step": 5876 + }, + { + "epoch": 0.39819770987194253, + "grad_norm": 6.5517497062683105, + "learning_rate": 9.298240810459307e-05, + "loss": 1.0435, + "step": 5877 + }, + { + "epoch": 0.3982654651399146, + "grad_norm": 7.095836639404297, + "learning_rate": 9.298103908549525e-05, + "loss": 0.7551, + "step": 5878 + }, + { + "epoch": 0.3983332204078867, + "grad_norm": 6.932202339172363, + "learning_rate": 9.297967006639743e-05, + "loss": 0.9075, + "step": 5879 + }, + { + "epoch": 0.3984009756758588, + "grad_norm": 8.045002937316895, + "learning_rate": 9.297830104729961e-05, + "loss": 0.9787, + "step": 5880 + }, + { + "epoch": 0.3984687309438309, + "grad_norm": 7.1430511474609375, + "learning_rate": 9.297693202820181e-05, + "loss": 0.8549, + "step": 5881 + }, + { + "epoch": 0.398536486211803, + "grad_norm": 5.857006549835205, + "learning_rate": 9.297556300910399e-05, + "loss": 0.7573, + "step": 5882 + }, + { + "epoch": 0.3986042414797751, + "grad_norm": 6.143594741821289, + "learning_rate": 9.297419399000617e-05, + "loss": 0.747, + "step": 5883 + }, + { + "epoch": 0.3986719967477471, + "grad_norm": 7.4967169761657715, + "learning_rate": 9.297282497090835e-05, + "loss": 1.0366, + "step": 5884 + }, + { + "epoch": 0.3987397520157192, + "grad_norm": 6.6401166915893555, + "learning_rate": 9.297145595181054e-05, + "loss": 0.9889, + "step": 5885 + }, + { + "epoch": 0.3988075072836913, + "grad_norm": 5.577928066253662, + "learning_rate": 9.297008693271272e-05, + "loss": 0.6917, + "step": 5886 + }, + { + "epoch": 0.3988752625516634, + "grad_norm": 5.9933061599731445, + "learning_rate": 9.29687179136149e-05, + "loss": 0.9545, + "step": 5887 + }, + { + "epoch": 0.3989430178196355, + "grad_norm": 6.353165149688721, + "learning_rate": 9.296734889451708e-05, + "loss": 0.7512, + "step": 5888 + }, + { + "epoch": 0.3990107730876076, + "grad_norm": 6.181026458740234, + "learning_rate": 9.296597987541926e-05, + "loss": 0.767, + "step": 5889 + }, + { + "epoch": 0.39907852835557966, + "grad_norm": 6.579110622406006, + "learning_rate": 9.296461085632146e-05, + "loss": 1.0604, + "step": 5890 + }, + { + "epoch": 0.39914628362355176, + "grad_norm": 6.501206398010254, + "learning_rate": 9.296324183722364e-05, + "loss": 0.9821, + "step": 5891 + }, + { + "epoch": 0.3992140388915238, + "grad_norm": 8.253642082214355, + "learning_rate": 9.296187281812582e-05, + "loss": 0.8827, + "step": 5892 + }, + { + "epoch": 0.3992817941594959, + "grad_norm": 7.590344429016113, + "learning_rate": 9.2960503799028e-05, + "loss": 0.9614, + "step": 5893 + }, + { + "epoch": 0.399349549427468, + "grad_norm": 6.755953311920166, + "learning_rate": 9.295913477993018e-05, + "loss": 0.8298, + "step": 5894 + }, + { + "epoch": 0.39941730469544007, + "grad_norm": 6.040559768676758, + "learning_rate": 9.295776576083237e-05, + "loss": 0.7182, + "step": 5895 + }, + { + "epoch": 0.39948505996341216, + "grad_norm": 6.301966667175293, + "learning_rate": 9.295639674173455e-05, + "loss": 0.8545, + "step": 5896 + }, + { + "epoch": 0.39955281523138425, + "grad_norm": 5.77929162979126, + "learning_rate": 9.295502772263673e-05, + "loss": 0.9398, + "step": 5897 + }, + { + "epoch": 0.39962057049935634, + "grad_norm": 6.633763313293457, + "learning_rate": 9.295365870353891e-05, + "loss": 0.8914, + "step": 5898 + }, + { + "epoch": 0.39968832576732843, + "grad_norm": 7.260994911193848, + "learning_rate": 9.295228968444111e-05, + "loss": 0.7725, + "step": 5899 + }, + { + "epoch": 0.39975608103530047, + "grad_norm": 8.820511817932129, + "learning_rate": 9.295092066534329e-05, + "loss": 0.7847, + "step": 5900 + }, + { + "epoch": 0.39982383630327256, + "grad_norm": 7.335788726806641, + "learning_rate": 9.294955164624547e-05, + "loss": 0.683, + "step": 5901 + }, + { + "epoch": 0.39989159157124465, + "grad_norm": 7.959702491760254, + "learning_rate": 9.294818262714765e-05, + "loss": 0.7598, + "step": 5902 + }, + { + "epoch": 0.39995934683921675, + "grad_norm": 9.680485725402832, + "learning_rate": 9.294681360804983e-05, + "loss": 1.0359, + "step": 5903 + }, + { + "epoch": 0.40002710210718884, + "grad_norm": 6.917464733123779, + "learning_rate": 9.294544458895202e-05, + "loss": 1.0155, + "step": 5904 + }, + { + "epoch": 0.40009485737516093, + "grad_norm": 6.221781253814697, + "learning_rate": 9.29440755698542e-05, + "loss": 0.9645, + "step": 5905 + }, + { + "epoch": 0.400162612643133, + "grad_norm": 6.778574466705322, + "learning_rate": 9.294270655075638e-05, + "loss": 0.794, + "step": 5906 + }, + { + "epoch": 0.4002303679111051, + "grad_norm": 7.8015522956848145, + "learning_rate": 9.294133753165856e-05, + "loss": 0.9472, + "step": 5907 + }, + { + "epoch": 0.40029812317907715, + "grad_norm": 5.018773555755615, + "learning_rate": 9.293996851256076e-05, + "loss": 0.8961, + "step": 5908 + }, + { + "epoch": 0.40036587844704924, + "grad_norm": 6.184563636779785, + "learning_rate": 9.293859949346294e-05, + "loss": 0.7988, + "step": 5909 + }, + { + "epoch": 0.40043363371502133, + "grad_norm": 6.593270778656006, + "learning_rate": 9.293723047436512e-05, + "loss": 0.8248, + "step": 5910 + }, + { + "epoch": 0.4005013889829934, + "grad_norm": 7.14009952545166, + "learning_rate": 9.29358614552673e-05, + "loss": 0.8589, + "step": 5911 + }, + { + "epoch": 0.4005691442509655, + "grad_norm": 8.044157981872559, + "learning_rate": 9.293449243616948e-05, + "loss": 0.9397, + "step": 5912 + }, + { + "epoch": 0.4006368995189376, + "grad_norm": 7.3374247550964355, + "learning_rate": 9.293312341707167e-05, + "loss": 0.8615, + "step": 5913 + }, + { + "epoch": 0.4007046547869097, + "grad_norm": 6.510500907897949, + "learning_rate": 9.293175439797385e-05, + "loss": 0.7151, + "step": 5914 + }, + { + "epoch": 0.4007724100548818, + "grad_norm": 15.020646095275879, + "learning_rate": 9.293038537887603e-05, + "loss": 0.9349, + "step": 5915 + }, + { + "epoch": 0.4008401653228538, + "grad_norm": 6.779942989349365, + "learning_rate": 9.292901635977821e-05, + "loss": 0.8018, + "step": 5916 + }, + { + "epoch": 0.4009079205908259, + "grad_norm": 6.3740339279174805, + "learning_rate": 9.29276473406804e-05, + "loss": 0.7691, + "step": 5917 + }, + { + "epoch": 0.400975675858798, + "grad_norm": 6.902404308319092, + "learning_rate": 9.292627832158259e-05, + "loss": 0.7431, + "step": 5918 + }, + { + "epoch": 0.4010434311267701, + "grad_norm": 7.324024200439453, + "learning_rate": 9.292490930248477e-05, + "loss": 0.9314, + "step": 5919 + }, + { + "epoch": 0.4011111863947422, + "grad_norm": 8.297179222106934, + "learning_rate": 9.292354028338695e-05, + "loss": 1.0072, + "step": 5920 + }, + { + "epoch": 0.4011789416627143, + "grad_norm": 6.1418914794921875, + "learning_rate": 9.292217126428914e-05, + "loss": 0.948, + "step": 5921 + }, + { + "epoch": 0.4012466969306864, + "grad_norm": 8.698518753051758, + "learning_rate": 9.292080224519132e-05, + "loss": 0.8509, + "step": 5922 + }, + { + "epoch": 0.40131445219865847, + "grad_norm": 6.628043174743652, + "learning_rate": 9.29194332260935e-05, + "loss": 0.9388, + "step": 5923 + }, + { + "epoch": 0.4013822074666305, + "grad_norm": 8.38621711730957, + "learning_rate": 9.29180642069957e-05, + "loss": 1.0795, + "step": 5924 + }, + { + "epoch": 0.4014499627346026, + "grad_norm": 7.546327114105225, + "learning_rate": 9.291669518789788e-05, + "loss": 0.865, + "step": 5925 + }, + { + "epoch": 0.4015177180025747, + "grad_norm": 7.399687767028809, + "learning_rate": 9.291532616880006e-05, + "loss": 0.8276, + "step": 5926 + }, + { + "epoch": 0.4015854732705468, + "grad_norm": 6.921968460083008, + "learning_rate": 9.291395714970225e-05, + "loss": 0.9275, + "step": 5927 + }, + { + "epoch": 0.40165322853851887, + "grad_norm": 7.856932640075684, + "learning_rate": 9.291258813060443e-05, + "loss": 1.1376, + "step": 5928 + }, + { + "epoch": 0.40172098380649096, + "grad_norm": 7.86414098739624, + "learning_rate": 9.291121911150661e-05, + "loss": 1.0225, + "step": 5929 + }, + { + "epoch": 0.40178873907446305, + "grad_norm": 7.028629302978516, + "learning_rate": 9.29098500924088e-05, + "loss": 1.0157, + "step": 5930 + }, + { + "epoch": 0.40185649434243514, + "grad_norm": 5.826450347900391, + "learning_rate": 9.290848107331099e-05, + "loss": 0.8605, + "step": 5931 + }, + { + "epoch": 0.40192424961040724, + "grad_norm": 6.072175025939941, + "learning_rate": 9.290711205421317e-05, + "loss": 0.6261, + "step": 5932 + }, + { + "epoch": 0.40199200487837927, + "grad_norm": 6.829746246337891, + "learning_rate": 9.290574303511535e-05, + "loss": 0.849, + "step": 5933 + }, + { + "epoch": 0.40205976014635136, + "grad_norm": 6.809370994567871, + "learning_rate": 9.290437401601753e-05, + "loss": 0.8659, + "step": 5934 + }, + { + "epoch": 0.40212751541432346, + "grad_norm": 7.195353984832764, + "learning_rate": 9.290300499691971e-05, + "loss": 0.9559, + "step": 5935 + }, + { + "epoch": 0.40219527068229555, + "grad_norm": 8.443836212158203, + "learning_rate": 9.29016359778219e-05, + "loss": 0.7687, + "step": 5936 + }, + { + "epoch": 0.40226302595026764, + "grad_norm": 5.932136058807373, + "learning_rate": 9.290026695872408e-05, + "loss": 0.7125, + "step": 5937 + }, + { + "epoch": 0.40233078121823973, + "grad_norm": 6.654352188110352, + "learning_rate": 9.289889793962626e-05, + "loss": 0.9933, + "step": 5938 + }, + { + "epoch": 0.4023985364862118, + "grad_norm": 7.100243091583252, + "learning_rate": 9.289752892052844e-05, + "loss": 0.9271, + "step": 5939 + }, + { + "epoch": 0.4024662917541839, + "grad_norm": 7.78497838973999, + "learning_rate": 9.289615990143062e-05, + "loss": 0.9358, + "step": 5940 + }, + { + "epoch": 0.40253404702215595, + "grad_norm": 6.294493198394775, + "learning_rate": 9.289479088233282e-05, + "loss": 0.8308, + "step": 5941 + }, + { + "epoch": 0.40260180229012804, + "grad_norm": 6.8807244300842285, + "learning_rate": 9.2893421863235e-05, + "loss": 0.7728, + "step": 5942 + }, + { + "epoch": 0.40266955755810013, + "grad_norm": 7.030758857727051, + "learning_rate": 9.289205284413718e-05, + "loss": 0.8725, + "step": 5943 + }, + { + "epoch": 0.4027373128260722, + "grad_norm": 6.787613868713379, + "learning_rate": 9.289068382503936e-05, + "loss": 0.8725, + "step": 5944 + }, + { + "epoch": 0.4028050680940443, + "grad_norm": 6.4748148918151855, + "learning_rate": 9.288931480594155e-05, + "loss": 0.9305, + "step": 5945 + }, + { + "epoch": 0.4028728233620164, + "grad_norm": 5.930908203125, + "learning_rate": 9.288794578684373e-05, + "loss": 0.814, + "step": 5946 + }, + { + "epoch": 0.4029405786299885, + "grad_norm": 6.687366008758545, + "learning_rate": 9.288657676774591e-05, + "loss": 0.7893, + "step": 5947 + }, + { + "epoch": 0.4030083338979606, + "grad_norm": 9.194374084472656, + "learning_rate": 9.28852077486481e-05, + "loss": 1.1462, + "step": 5948 + }, + { + "epoch": 0.4030760891659326, + "grad_norm": 8.393781661987305, + "learning_rate": 9.288383872955027e-05, + "loss": 0.9375, + "step": 5949 + }, + { + "epoch": 0.4031438444339047, + "grad_norm": 7.802282810211182, + "learning_rate": 9.288246971045247e-05, + "loss": 1.1499, + "step": 5950 + }, + { + "epoch": 0.4032115997018768, + "grad_norm": 7.085485458374023, + "learning_rate": 9.288110069135465e-05, + "loss": 0.9834, + "step": 5951 + }, + { + "epoch": 0.4032793549698489, + "grad_norm": 7.905081272125244, + "learning_rate": 9.287973167225683e-05, + "loss": 1.11, + "step": 5952 + }, + { + "epoch": 0.403347110237821, + "grad_norm": 5.531884670257568, + "learning_rate": 9.287836265315901e-05, + "loss": 0.8386, + "step": 5953 + }, + { + "epoch": 0.4034148655057931, + "grad_norm": 5.959394931793213, + "learning_rate": 9.28769936340612e-05, + "loss": 0.858, + "step": 5954 + }, + { + "epoch": 0.4034826207737652, + "grad_norm": 7.020748138427734, + "learning_rate": 9.287562461496338e-05, + "loss": 0.8708, + "step": 5955 + }, + { + "epoch": 0.40355037604173727, + "grad_norm": 9.01052474975586, + "learning_rate": 9.287425559586556e-05, + "loss": 1.0213, + "step": 5956 + }, + { + "epoch": 0.4036181313097093, + "grad_norm": 7.091532230377197, + "learning_rate": 9.287288657676774e-05, + "loss": 1.0133, + "step": 5957 + }, + { + "epoch": 0.4036858865776814, + "grad_norm": 5.7289276123046875, + "learning_rate": 9.287151755766992e-05, + "loss": 0.6517, + "step": 5958 + }, + { + "epoch": 0.4037536418456535, + "grad_norm": 5.692935466766357, + "learning_rate": 9.287014853857212e-05, + "loss": 0.865, + "step": 5959 + }, + { + "epoch": 0.4038213971136256, + "grad_norm": 7.469212532043457, + "learning_rate": 9.28687795194743e-05, + "loss": 0.8707, + "step": 5960 + }, + { + "epoch": 0.40388915238159767, + "grad_norm": 6.172707557678223, + "learning_rate": 9.286741050037648e-05, + "loss": 0.8217, + "step": 5961 + }, + { + "epoch": 0.40395690764956976, + "grad_norm": 8.741066932678223, + "learning_rate": 9.286604148127866e-05, + "loss": 0.8087, + "step": 5962 + }, + { + "epoch": 0.40402466291754185, + "grad_norm": 6.240641117095947, + "learning_rate": 9.286467246218085e-05, + "loss": 1.1012, + "step": 5963 + }, + { + "epoch": 0.40409241818551395, + "grad_norm": 6.801406383514404, + "learning_rate": 9.286330344308303e-05, + "loss": 0.9335, + "step": 5964 + }, + { + "epoch": 0.404160173453486, + "grad_norm": 6.114485263824463, + "learning_rate": 9.286193442398521e-05, + "loss": 0.8987, + "step": 5965 + }, + { + "epoch": 0.4042279287214581, + "grad_norm": 9.715784072875977, + "learning_rate": 9.28605654048874e-05, + "loss": 1.0337, + "step": 5966 + }, + { + "epoch": 0.40429568398943017, + "grad_norm": 10.362801551818848, + "learning_rate": 9.285919638578959e-05, + "loss": 0.7685, + "step": 5967 + }, + { + "epoch": 0.40436343925740226, + "grad_norm": 7.567534446716309, + "learning_rate": 9.285782736669177e-05, + "loss": 0.8397, + "step": 5968 + }, + { + "epoch": 0.40443119452537435, + "grad_norm": 7.282614231109619, + "learning_rate": 9.285645834759395e-05, + "loss": 1.0515, + "step": 5969 + }, + { + "epoch": 0.40449894979334644, + "grad_norm": 6.683037757873535, + "learning_rate": 9.285508932849614e-05, + "loss": 1.1006, + "step": 5970 + }, + { + "epoch": 0.40456670506131853, + "grad_norm": 8.437498092651367, + "learning_rate": 9.285372030939832e-05, + "loss": 1.0656, + "step": 5971 + }, + { + "epoch": 0.4046344603292906, + "grad_norm": 6.153039932250977, + "learning_rate": 9.28523512903005e-05, + "loss": 0.8285, + "step": 5972 + }, + { + "epoch": 0.40470221559726266, + "grad_norm": 6.3334221839904785, + "learning_rate": 9.28509822712027e-05, + "loss": 0.8968, + "step": 5973 + }, + { + "epoch": 0.40476997086523475, + "grad_norm": 9.119121551513672, + "learning_rate": 9.284961325210488e-05, + "loss": 0.8814, + "step": 5974 + }, + { + "epoch": 0.40483772613320684, + "grad_norm": 7.621852397918701, + "learning_rate": 9.284824423300706e-05, + "loss": 0.9297, + "step": 5975 + }, + { + "epoch": 0.40490548140117893, + "grad_norm": 7.415964126586914, + "learning_rate": 9.284687521390924e-05, + "loss": 0.8597, + "step": 5976 + }, + { + "epoch": 0.404973236669151, + "grad_norm": 6.424054145812988, + "learning_rate": 9.284550619481143e-05, + "loss": 0.6792, + "step": 5977 + }, + { + "epoch": 0.4050409919371231, + "grad_norm": 7.875925540924072, + "learning_rate": 9.284413717571361e-05, + "loss": 0.7931, + "step": 5978 + }, + { + "epoch": 0.4051087472050952, + "grad_norm": 5.702389240264893, + "learning_rate": 9.284276815661579e-05, + "loss": 0.7124, + "step": 5979 + }, + { + "epoch": 0.4051765024730673, + "grad_norm": 6.58071231842041, + "learning_rate": 9.284139913751797e-05, + "loss": 0.8965, + "step": 5980 + }, + { + "epoch": 0.4052442577410394, + "grad_norm": 8.180785179138184, + "learning_rate": 9.284003011842015e-05, + "loss": 1.0545, + "step": 5981 + }, + { + "epoch": 0.40531201300901143, + "grad_norm": 7.393392562866211, + "learning_rate": 9.283866109932235e-05, + "loss": 0.8044, + "step": 5982 + }, + { + "epoch": 0.4053797682769835, + "grad_norm": 6.009011745452881, + "learning_rate": 9.283729208022453e-05, + "loss": 0.7688, + "step": 5983 + }, + { + "epoch": 0.4054475235449556, + "grad_norm": 6.8165388107299805, + "learning_rate": 9.283592306112671e-05, + "loss": 0.7743, + "step": 5984 + }, + { + "epoch": 0.4055152788129277, + "grad_norm": 7.851406574249268, + "learning_rate": 9.283455404202889e-05, + "loss": 0.9081, + "step": 5985 + }, + { + "epoch": 0.4055830340808998, + "grad_norm": 7.138257026672363, + "learning_rate": 9.283318502293108e-05, + "loss": 0.9843, + "step": 5986 + }, + { + "epoch": 0.4056507893488719, + "grad_norm": 7.70706033706665, + "learning_rate": 9.283181600383326e-05, + "loss": 0.9158, + "step": 5987 + }, + { + "epoch": 0.405718544616844, + "grad_norm": 7.948725700378418, + "learning_rate": 9.283044698473544e-05, + "loss": 0.8694, + "step": 5988 + }, + { + "epoch": 0.40578629988481607, + "grad_norm": 6.4174628257751465, + "learning_rate": 9.282907796563762e-05, + "loss": 0.8547, + "step": 5989 + }, + { + "epoch": 0.4058540551527881, + "grad_norm": 7.62558126449585, + "learning_rate": 9.28277089465398e-05, + "loss": 0.7978, + "step": 5990 + }, + { + "epoch": 0.4059218104207602, + "grad_norm": 7.624577522277832, + "learning_rate": 9.2826339927442e-05, + "loss": 1.0346, + "step": 5991 + }, + { + "epoch": 0.4059895656887323, + "grad_norm": 7.392852783203125, + "learning_rate": 9.282497090834418e-05, + "loss": 0.8833, + "step": 5992 + }, + { + "epoch": 0.4060573209567044, + "grad_norm": 7.652538776397705, + "learning_rate": 9.282360188924636e-05, + "loss": 1.0438, + "step": 5993 + }, + { + "epoch": 0.4061250762246765, + "grad_norm": 7.050436973571777, + "learning_rate": 9.282223287014854e-05, + "loss": 0.8291, + "step": 5994 + }, + { + "epoch": 0.40619283149264857, + "grad_norm": 7.119441032409668, + "learning_rate": 9.282086385105072e-05, + "loss": 0.6427, + "step": 5995 + }, + { + "epoch": 0.40626058676062066, + "grad_norm": 6.1112565994262695, + "learning_rate": 9.281949483195291e-05, + "loss": 0.7737, + "step": 5996 + }, + { + "epoch": 0.40632834202859275, + "grad_norm": 6.173165321350098, + "learning_rate": 9.28181258128551e-05, + "loss": 0.8099, + "step": 5997 + }, + { + "epoch": 0.4063960972965648, + "grad_norm": 6.372697830200195, + "learning_rate": 9.281675679375727e-05, + "loss": 0.7902, + "step": 5998 + }, + { + "epoch": 0.4064638525645369, + "grad_norm": 7.213540077209473, + "learning_rate": 9.281538777465945e-05, + "loss": 0.7273, + "step": 5999 + }, + { + "epoch": 0.40653160783250897, + "grad_norm": 8.106986999511719, + "learning_rate": 9.281401875556165e-05, + "loss": 0.7516, + "step": 6000 + }, + { + "epoch": 0.40659936310048106, + "grad_norm": 8.372703552246094, + "learning_rate": 9.281264973646383e-05, + "loss": 1.1737, + "step": 6001 + }, + { + "epoch": 0.40666711836845315, + "grad_norm": 9.540267944335938, + "learning_rate": 9.281128071736601e-05, + "loss": 1.068, + "step": 6002 + }, + { + "epoch": 0.40673487363642524, + "grad_norm": 6.878968715667725, + "learning_rate": 9.280991169826819e-05, + "loss": 0.9329, + "step": 6003 + }, + { + "epoch": 0.40680262890439733, + "grad_norm": 6.826279163360596, + "learning_rate": 9.280854267917037e-05, + "loss": 0.8267, + "step": 6004 + }, + { + "epoch": 0.4068703841723694, + "grad_norm": 7.155866622924805, + "learning_rate": 9.280717366007256e-05, + "loss": 0.9133, + "step": 6005 + }, + { + "epoch": 0.40693813944034146, + "grad_norm": 6.883568286895752, + "learning_rate": 9.280580464097474e-05, + "loss": 0.709, + "step": 6006 + }, + { + "epoch": 0.40700589470831355, + "grad_norm": 6.944139003753662, + "learning_rate": 9.280443562187692e-05, + "loss": 0.8137, + "step": 6007 + }, + { + "epoch": 0.40707364997628565, + "grad_norm": 5.899077892303467, + "learning_rate": 9.28030666027791e-05, + "loss": 0.733, + "step": 6008 + }, + { + "epoch": 0.40714140524425774, + "grad_norm": 7.213099956512451, + "learning_rate": 9.28016975836813e-05, + "loss": 1.1728, + "step": 6009 + }, + { + "epoch": 0.40720916051222983, + "grad_norm": 7.830915927886963, + "learning_rate": 9.280032856458348e-05, + "loss": 0.7465, + "step": 6010 + }, + { + "epoch": 0.4072769157802019, + "grad_norm": 7.609717845916748, + "learning_rate": 9.279895954548566e-05, + "loss": 1.2036, + "step": 6011 + }, + { + "epoch": 0.407344671048174, + "grad_norm": 8.978927612304688, + "learning_rate": 9.279759052638784e-05, + "loss": 0.8903, + "step": 6012 + }, + { + "epoch": 0.4074124263161461, + "grad_norm": 8.331847190856934, + "learning_rate": 9.279622150729003e-05, + "loss": 1.0853, + "step": 6013 + }, + { + "epoch": 0.40748018158411814, + "grad_norm": 6.639584541320801, + "learning_rate": 9.279485248819221e-05, + "loss": 0.9113, + "step": 6014 + }, + { + "epoch": 0.40754793685209023, + "grad_norm": 6.87017822265625, + "learning_rate": 9.27934834690944e-05, + "loss": 0.748, + "step": 6015 + }, + { + "epoch": 0.4076156921200623, + "grad_norm": 6.675489902496338, + "learning_rate": 9.279211444999659e-05, + "loss": 0.8002, + "step": 6016 + }, + { + "epoch": 0.4076834473880344, + "grad_norm": 7.666563034057617, + "learning_rate": 9.279074543089877e-05, + "loss": 0.9973, + "step": 6017 + }, + { + "epoch": 0.4077512026560065, + "grad_norm": 7.878670692443848, + "learning_rate": 9.278937641180095e-05, + "loss": 1.0647, + "step": 6018 + }, + { + "epoch": 0.4078189579239786, + "grad_norm": 6.263443946838379, + "learning_rate": 9.278800739270314e-05, + "loss": 0.8827, + "step": 6019 + }, + { + "epoch": 0.4078867131919507, + "grad_norm": 9.69717788696289, + "learning_rate": 9.278663837360532e-05, + "loss": 0.728, + "step": 6020 + }, + { + "epoch": 0.4079544684599228, + "grad_norm": 8.236658096313477, + "learning_rate": 9.27852693545075e-05, + "loss": 0.78, + "step": 6021 + }, + { + "epoch": 0.4080222237278948, + "grad_norm": 6.9468512535095215, + "learning_rate": 9.278390033540968e-05, + "loss": 0.7674, + "step": 6022 + }, + { + "epoch": 0.4080899789958669, + "grad_norm": 8.783222198486328, + "learning_rate": 9.278253131631188e-05, + "loss": 0.9062, + "step": 6023 + }, + { + "epoch": 0.408157734263839, + "grad_norm": 6.449438571929932, + "learning_rate": 9.278116229721406e-05, + "loss": 0.9683, + "step": 6024 + }, + { + "epoch": 0.4082254895318111, + "grad_norm": 6.185464382171631, + "learning_rate": 9.277979327811624e-05, + "loss": 0.8899, + "step": 6025 + }, + { + "epoch": 0.4082932447997832, + "grad_norm": 6.422550678253174, + "learning_rate": 9.277842425901842e-05, + "loss": 0.8765, + "step": 6026 + }, + { + "epoch": 0.4083610000677553, + "grad_norm": 6.8761372566223145, + "learning_rate": 9.27770552399206e-05, + "loss": 0.7326, + "step": 6027 + }, + { + "epoch": 0.40842875533572737, + "grad_norm": 5.4224162101745605, + "learning_rate": 9.277568622082279e-05, + "loss": 0.7882, + "step": 6028 + }, + { + "epoch": 0.40849651060369946, + "grad_norm": 6.6288275718688965, + "learning_rate": 9.277431720172497e-05, + "loss": 0.936, + "step": 6029 + }, + { + "epoch": 0.4085642658716715, + "grad_norm": 7.305326461791992, + "learning_rate": 9.277294818262715e-05, + "loss": 0.8254, + "step": 6030 + }, + { + "epoch": 0.4086320211396436, + "grad_norm": 6.697827339172363, + "learning_rate": 9.277157916352933e-05, + "loss": 0.9365, + "step": 6031 + }, + { + "epoch": 0.4086997764076157, + "grad_norm": 8.074564933776855, + "learning_rate": 9.277021014443153e-05, + "loss": 1.0454, + "step": 6032 + }, + { + "epoch": 0.40876753167558777, + "grad_norm": 7.539030075073242, + "learning_rate": 9.276884112533371e-05, + "loss": 0.9275, + "step": 6033 + }, + { + "epoch": 0.40883528694355986, + "grad_norm": 7.1338582038879395, + "learning_rate": 9.276747210623589e-05, + "loss": 0.8395, + "step": 6034 + }, + { + "epoch": 0.40890304221153195, + "grad_norm": 8.182660102844238, + "learning_rate": 9.276610308713807e-05, + "loss": 1.1907, + "step": 6035 + }, + { + "epoch": 0.40897079747950404, + "grad_norm": 6.1452813148498535, + "learning_rate": 9.276473406804025e-05, + "loss": 0.6271, + "step": 6036 + }, + { + "epoch": 0.40903855274747614, + "grad_norm": 7.247335433959961, + "learning_rate": 9.276336504894244e-05, + "loss": 0.8665, + "step": 6037 + }, + { + "epoch": 0.40910630801544823, + "grad_norm": 8.27696418762207, + "learning_rate": 9.276199602984462e-05, + "loss": 0.968, + "step": 6038 + }, + { + "epoch": 0.40917406328342026, + "grad_norm": 6.888766288757324, + "learning_rate": 9.27606270107468e-05, + "loss": 0.8212, + "step": 6039 + }, + { + "epoch": 0.40924181855139236, + "grad_norm": 6.885414123535156, + "learning_rate": 9.275925799164898e-05, + "loss": 0.8978, + "step": 6040 + }, + { + "epoch": 0.40930957381936445, + "grad_norm": 7.307199478149414, + "learning_rate": 9.275788897255118e-05, + "loss": 0.9602, + "step": 6041 + }, + { + "epoch": 0.40937732908733654, + "grad_norm": 6.921801567077637, + "learning_rate": 9.275651995345336e-05, + "loss": 0.863, + "step": 6042 + }, + { + "epoch": 0.40944508435530863, + "grad_norm": 6.910282611846924, + "learning_rate": 9.275515093435554e-05, + "loss": 0.8481, + "step": 6043 + }, + { + "epoch": 0.4095128396232807, + "grad_norm": 7.98552131652832, + "learning_rate": 9.275378191525772e-05, + "loss": 0.9096, + "step": 6044 + }, + { + "epoch": 0.4095805948912528, + "grad_norm": 9.74826431274414, + "learning_rate": 9.27524128961599e-05, + "loss": 0.867, + "step": 6045 + }, + { + "epoch": 0.4096483501592249, + "grad_norm": 10.083016395568848, + "learning_rate": 9.275104387706209e-05, + "loss": 0.9664, + "step": 6046 + }, + { + "epoch": 0.40971610542719694, + "grad_norm": 8.351798057556152, + "learning_rate": 9.274967485796427e-05, + "loss": 0.8164, + "step": 6047 + }, + { + "epoch": 0.40978386069516903, + "grad_norm": 7.7515106201171875, + "learning_rate": 9.274830583886645e-05, + "loss": 0.9784, + "step": 6048 + }, + { + "epoch": 0.4098516159631411, + "grad_norm": 6.339774131774902, + "learning_rate": 9.274693681976863e-05, + "loss": 0.7614, + "step": 6049 + }, + { + "epoch": 0.4099193712311132, + "grad_norm": 8.455992698669434, + "learning_rate": 9.274556780067081e-05, + "loss": 0.989, + "step": 6050 + }, + { + "epoch": 0.4099871264990853, + "grad_norm": 9.367591857910156, + "learning_rate": 9.274419878157301e-05, + "loss": 1.0155, + "step": 6051 + }, + { + "epoch": 0.4100548817670574, + "grad_norm": 8.611092567443848, + "learning_rate": 9.274282976247519e-05, + "loss": 0.9541, + "step": 6052 + }, + { + "epoch": 0.4101226370350295, + "grad_norm": 8.239481925964355, + "learning_rate": 9.274146074337737e-05, + "loss": 0.9511, + "step": 6053 + }, + { + "epoch": 0.4101903923030016, + "grad_norm": 7.31620979309082, + "learning_rate": 9.274009172427955e-05, + "loss": 1.0126, + "step": 6054 + }, + { + "epoch": 0.4102581475709736, + "grad_norm": 6.814750671386719, + "learning_rate": 9.273872270518174e-05, + "loss": 0.9463, + "step": 6055 + }, + { + "epoch": 0.4103259028389457, + "grad_norm": 7.000329494476318, + "learning_rate": 9.273735368608392e-05, + "loss": 1.0664, + "step": 6056 + }, + { + "epoch": 0.4103936581069178, + "grad_norm": 6.309933662414551, + "learning_rate": 9.27359846669861e-05, + "loss": 0.8605, + "step": 6057 + }, + { + "epoch": 0.4104614133748899, + "grad_norm": 8.13158893585205, + "learning_rate": 9.273461564788828e-05, + "loss": 0.9158, + "step": 6058 + }, + { + "epoch": 0.410529168642862, + "grad_norm": 8.334741592407227, + "learning_rate": 9.273324662879048e-05, + "loss": 0.8699, + "step": 6059 + }, + { + "epoch": 0.4105969239108341, + "grad_norm": 8.019463539123535, + "learning_rate": 9.273187760969266e-05, + "loss": 1.0366, + "step": 6060 + }, + { + "epoch": 0.41066467917880617, + "grad_norm": 6.104535102844238, + "learning_rate": 9.273050859059484e-05, + "loss": 0.8273, + "step": 6061 + }, + { + "epoch": 0.41073243444677826, + "grad_norm": 8.345130920410156, + "learning_rate": 9.272913957149703e-05, + "loss": 1.3653, + "step": 6062 + }, + { + "epoch": 0.4108001897147503, + "grad_norm": 8.028311729431152, + "learning_rate": 9.272777055239921e-05, + "loss": 1.096, + "step": 6063 + }, + { + "epoch": 0.4108679449827224, + "grad_norm": 7.079861164093018, + "learning_rate": 9.272640153330139e-05, + "loss": 1.0157, + "step": 6064 + }, + { + "epoch": 0.4109357002506945, + "grad_norm": 8.497052192687988, + "learning_rate": 9.272503251420359e-05, + "loss": 0.9972, + "step": 6065 + }, + { + "epoch": 0.41100345551866657, + "grad_norm": 8.042581558227539, + "learning_rate": 9.272366349510577e-05, + "loss": 0.908, + "step": 6066 + }, + { + "epoch": 0.41107121078663866, + "grad_norm": 6.567187786102295, + "learning_rate": 9.272229447600795e-05, + "loss": 0.7992, + "step": 6067 + }, + { + "epoch": 0.41113896605461075, + "grad_norm": 5.78397274017334, + "learning_rate": 9.272092545691013e-05, + "loss": 0.773, + "step": 6068 + }, + { + "epoch": 0.41120672132258285, + "grad_norm": 8.116515159606934, + "learning_rate": 9.271955643781232e-05, + "loss": 0.8101, + "step": 6069 + }, + { + "epoch": 0.41127447659055494, + "grad_norm": 7.628951549530029, + "learning_rate": 9.27181874187145e-05, + "loss": 0.9282, + "step": 6070 + }, + { + "epoch": 0.411342231858527, + "grad_norm": 7.830180644989014, + "learning_rate": 9.271681839961668e-05, + "loss": 0.8681, + "step": 6071 + }, + { + "epoch": 0.41140998712649907, + "grad_norm": 7.690285682678223, + "learning_rate": 9.271544938051886e-05, + "loss": 0.9064, + "step": 6072 + }, + { + "epoch": 0.41147774239447116, + "grad_norm": 7.585133075714111, + "learning_rate": 9.271408036142106e-05, + "loss": 0.8305, + "step": 6073 + }, + { + "epoch": 0.41154549766244325, + "grad_norm": 6.18320894241333, + "learning_rate": 9.271271134232324e-05, + "loss": 1.0395, + "step": 6074 + }, + { + "epoch": 0.41161325293041534, + "grad_norm": 8.301353454589844, + "learning_rate": 9.271134232322542e-05, + "loss": 1.1528, + "step": 6075 + }, + { + "epoch": 0.41168100819838743, + "grad_norm": 6.261874198913574, + "learning_rate": 9.27099733041276e-05, + "loss": 0.7802, + "step": 6076 + }, + { + "epoch": 0.4117487634663595, + "grad_norm": 7.161525726318359, + "learning_rate": 9.270860428502978e-05, + "loss": 1.1292, + "step": 6077 + }, + { + "epoch": 0.4118165187343316, + "grad_norm": 6.522838115692139, + "learning_rate": 9.270723526593197e-05, + "loss": 0.8013, + "step": 6078 + }, + { + "epoch": 0.41188427400230365, + "grad_norm": 6.919328212738037, + "learning_rate": 9.270586624683415e-05, + "loss": 0.8172, + "step": 6079 + }, + { + "epoch": 0.41195202927027574, + "grad_norm": 6.236283302307129, + "learning_rate": 9.270449722773633e-05, + "loss": 0.9677, + "step": 6080 + }, + { + "epoch": 0.41201978453824784, + "grad_norm": 5.593216896057129, + "learning_rate": 9.270312820863851e-05, + "loss": 0.922, + "step": 6081 + }, + { + "epoch": 0.4120875398062199, + "grad_norm": 8.651509284973145, + "learning_rate": 9.27017591895407e-05, + "loss": 0.9209, + "step": 6082 + }, + { + "epoch": 0.412155295074192, + "grad_norm": 6.787774085998535, + "learning_rate": 9.270039017044289e-05, + "loss": 1.0991, + "step": 6083 + }, + { + "epoch": 0.4122230503421641, + "grad_norm": 6.774304389953613, + "learning_rate": 9.269902115134507e-05, + "loss": 0.7265, + "step": 6084 + }, + { + "epoch": 0.4122908056101362, + "grad_norm": 7.097721099853516, + "learning_rate": 9.269765213224725e-05, + "loss": 1.0353, + "step": 6085 + }, + { + "epoch": 0.4123585608781083, + "grad_norm": 8.65166187286377, + "learning_rate": 9.269628311314943e-05, + "loss": 1.0157, + "step": 6086 + }, + { + "epoch": 0.4124263161460804, + "grad_norm": 5.778721809387207, + "learning_rate": 9.269491409405162e-05, + "loss": 0.8381, + "step": 6087 + }, + { + "epoch": 0.4124940714140524, + "grad_norm": 6.297547340393066, + "learning_rate": 9.26935450749538e-05, + "loss": 0.9421, + "step": 6088 + }, + { + "epoch": 0.4125618266820245, + "grad_norm": 7.8641462326049805, + "learning_rate": 9.269217605585598e-05, + "loss": 0.9987, + "step": 6089 + }, + { + "epoch": 0.4126295819499966, + "grad_norm": 7.222883701324463, + "learning_rate": 9.269080703675816e-05, + "loss": 0.7314, + "step": 6090 + }, + { + "epoch": 0.4126973372179687, + "grad_norm": 7.291220188140869, + "learning_rate": 9.268943801766034e-05, + "loss": 0.9199, + "step": 6091 + }, + { + "epoch": 0.4127650924859408, + "grad_norm": 7.00157356262207, + "learning_rate": 9.268806899856254e-05, + "loss": 0.8965, + "step": 6092 + }, + { + "epoch": 0.4128328477539129, + "grad_norm": 8.229329109191895, + "learning_rate": 9.268669997946472e-05, + "loss": 1.0392, + "step": 6093 + }, + { + "epoch": 0.41290060302188497, + "grad_norm": 6.9192633628845215, + "learning_rate": 9.26853309603669e-05, + "loss": 0.8987, + "step": 6094 + }, + { + "epoch": 0.41296835828985706, + "grad_norm": 6.3700761795043945, + "learning_rate": 9.268396194126908e-05, + "loss": 0.7713, + "step": 6095 + }, + { + "epoch": 0.4130361135578291, + "grad_norm": 5.654745101928711, + "learning_rate": 9.268259292217127e-05, + "loss": 0.6733, + "step": 6096 + }, + { + "epoch": 0.4131038688258012, + "grad_norm": 6.898359298706055, + "learning_rate": 9.268122390307345e-05, + "loss": 0.9611, + "step": 6097 + }, + { + "epoch": 0.4131716240937733, + "grad_norm": 6.53093147277832, + "learning_rate": 9.267985488397563e-05, + "loss": 0.795, + "step": 6098 + }, + { + "epoch": 0.4132393793617454, + "grad_norm": 8.565315246582031, + "learning_rate": 9.267848586487781e-05, + "loss": 0.7333, + "step": 6099 + }, + { + "epoch": 0.41330713462971747, + "grad_norm": 7.8006744384765625, + "learning_rate": 9.267711684578e-05, + "loss": 0.9579, + "step": 6100 + }, + { + "epoch": 0.41337488989768956, + "grad_norm": 7.270709037780762, + "learning_rate": 9.267574782668219e-05, + "loss": 0.9105, + "step": 6101 + }, + { + "epoch": 0.41344264516566165, + "grad_norm": 7.98935079574585, + "learning_rate": 9.267437880758437e-05, + "loss": 0.9697, + "step": 6102 + }, + { + "epoch": 0.41351040043363374, + "grad_norm": 6.606309413909912, + "learning_rate": 9.267300978848655e-05, + "loss": 0.8649, + "step": 6103 + }, + { + "epoch": 0.4135781557016058, + "grad_norm": 7.456545352935791, + "learning_rate": 9.267164076938873e-05, + "loss": 1.0884, + "step": 6104 + }, + { + "epoch": 0.41364591096957787, + "grad_norm": 6.820968151092529, + "learning_rate": 9.267027175029092e-05, + "loss": 0.6651, + "step": 6105 + }, + { + "epoch": 0.41371366623754996, + "grad_norm": 7.569576740264893, + "learning_rate": 9.26689027311931e-05, + "loss": 1.024, + "step": 6106 + }, + { + "epoch": 0.41378142150552205, + "grad_norm": 7.982132911682129, + "learning_rate": 9.266753371209528e-05, + "loss": 0.9111, + "step": 6107 + }, + { + "epoch": 0.41384917677349414, + "grad_norm": 7.610587120056152, + "learning_rate": 9.266616469299748e-05, + "loss": 0.9005, + "step": 6108 + }, + { + "epoch": 0.41391693204146623, + "grad_norm": 6.773017883300781, + "learning_rate": 9.266479567389966e-05, + "loss": 0.6696, + "step": 6109 + }, + { + "epoch": 0.4139846873094383, + "grad_norm": 7.702723979949951, + "learning_rate": 9.266342665480184e-05, + "loss": 0.7951, + "step": 6110 + }, + { + "epoch": 0.4140524425774104, + "grad_norm": 7.299111843109131, + "learning_rate": 9.266205763570403e-05, + "loss": 1.0422, + "step": 6111 + }, + { + "epoch": 0.41412019784538245, + "grad_norm": 5.148745536804199, + "learning_rate": 9.266068861660621e-05, + "loss": 0.6821, + "step": 6112 + }, + { + "epoch": 0.41418795311335455, + "grad_norm": 6.768344402313232, + "learning_rate": 9.265931959750839e-05, + "loss": 0.8119, + "step": 6113 + }, + { + "epoch": 0.41425570838132664, + "grad_norm": 6.882130146026611, + "learning_rate": 9.265795057841057e-05, + "loss": 0.9931, + "step": 6114 + }, + { + "epoch": 0.41432346364929873, + "grad_norm": 7.292698383331299, + "learning_rate": 9.265658155931277e-05, + "loss": 1.0415, + "step": 6115 + }, + { + "epoch": 0.4143912189172708, + "grad_norm": 7.241159915924072, + "learning_rate": 9.265521254021495e-05, + "loss": 0.9122, + "step": 6116 + }, + { + "epoch": 0.4144589741852429, + "grad_norm": 6.7898736000061035, + "learning_rate": 9.265384352111713e-05, + "loss": 0.9399, + "step": 6117 + }, + { + "epoch": 0.414526729453215, + "grad_norm": 6.505312919616699, + "learning_rate": 9.265247450201931e-05, + "loss": 0.9463, + "step": 6118 + }, + { + "epoch": 0.4145944847211871, + "grad_norm": 5.761348724365234, + "learning_rate": 9.26511054829215e-05, + "loss": 0.8365, + "step": 6119 + }, + { + "epoch": 0.41466223998915913, + "grad_norm": 6.377706527709961, + "learning_rate": 9.264973646382368e-05, + "loss": 0.7451, + "step": 6120 + }, + { + "epoch": 0.4147299952571312, + "grad_norm": 8.465597152709961, + "learning_rate": 9.264836744472586e-05, + "loss": 0.8582, + "step": 6121 + }, + { + "epoch": 0.4147977505251033, + "grad_norm": 6.3482770919799805, + "learning_rate": 9.264699842562804e-05, + "loss": 0.8291, + "step": 6122 + }, + { + "epoch": 0.4148655057930754, + "grad_norm": 7.984723091125488, + "learning_rate": 9.264562940653022e-05, + "loss": 0.9641, + "step": 6123 + }, + { + "epoch": 0.4149332610610475, + "grad_norm": 6.695097923278809, + "learning_rate": 9.264426038743242e-05, + "loss": 0.7661, + "step": 6124 + }, + { + "epoch": 0.4150010163290196, + "grad_norm": 6.403726100921631, + "learning_rate": 9.26428913683346e-05, + "loss": 0.6342, + "step": 6125 + }, + { + "epoch": 0.4150687715969917, + "grad_norm": 5.906423091888428, + "learning_rate": 9.264152234923678e-05, + "loss": 0.9588, + "step": 6126 + }, + { + "epoch": 0.4151365268649638, + "grad_norm": 6.482539653778076, + "learning_rate": 9.264015333013896e-05, + "loss": 1.0244, + "step": 6127 + }, + { + "epoch": 0.4152042821329358, + "grad_norm": 7.783926486968994, + "learning_rate": 9.263878431104114e-05, + "loss": 0.8611, + "step": 6128 + }, + { + "epoch": 0.4152720374009079, + "grad_norm": 8.377721786499023, + "learning_rate": 9.263741529194333e-05, + "loss": 0.9757, + "step": 6129 + }, + { + "epoch": 0.41533979266888, + "grad_norm": 8.719101905822754, + "learning_rate": 9.263604627284551e-05, + "loss": 1.0766, + "step": 6130 + }, + { + "epoch": 0.4154075479368521, + "grad_norm": 6.0139641761779785, + "learning_rate": 9.263467725374769e-05, + "loss": 0.8036, + "step": 6131 + }, + { + "epoch": 0.4154753032048242, + "grad_norm": 8.683677673339844, + "learning_rate": 9.263330823464987e-05, + "loss": 0.9476, + "step": 6132 + }, + { + "epoch": 0.41554305847279627, + "grad_norm": 7.566380977630615, + "learning_rate": 9.263193921555207e-05, + "loss": 1.0983, + "step": 6133 + }, + { + "epoch": 0.41561081374076836, + "grad_norm": 6.716690540313721, + "learning_rate": 9.263057019645425e-05, + "loss": 0.8803, + "step": 6134 + }, + { + "epoch": 0.41567856900874045, + "grad_norm": 6.123441696166992, + "learning_rate": 9.262920117735643e-05, + "loss": 0.7158, + "step": 6135 + }, + { + "epoch": 0.4157463242767125, + "grad_norm": 8.015522956848145, + "learning_rate": 9.262783215825861e-05, + "loss": 1.0269, + "step": 6136 + }, + { + "epoch": 0.4158140795446846, + "grad_norm": 5.564431667327881, + "learning_rate": 9.262646313916079e-05, + "loss": 0.8309, + "step": 6137 + }, + { + "epoch": 0.41588183481265667, + "grad_norm": 7.946048736572266, + "learning_rate": 9.262509412006298e-05, + "loss": 0.9334, + "step": 6138 + }, + { + "epoch": 0.41594959008062876, + "grad_norm": 6.741854667663574, + "learning_rate": 9.262372510096516e-05, + "loss": 0.6461, + "step": 6139 + }, + { + "epoch": 0.41601734534860085, + "grad_norm": 8.229652404785156, + "learning_rate": 9.262235608186734e-05, + "loss": 0.9046, + "step": 6140 + }, + { + "epoch": 0.41608510061657294, + "grad_norm": 5.590304851531982, + "learning_rate": 9.262098706276952e-05, + "loss": 0.6979, + "step": 6141 + }, + { + "epoch": 0.41615285588454504, + "grad_norm": 6.79884672164917, + "learning_rate": 9.261961804367172e-05, + "loss": 0.853, + "step": 6142 + }, + { + "epoch": 0.41622061115251713, + "grad_norm": 7.175544738769531, + "learning_rate": 9.26182490245739e-05, + "loss": 0.9265, + "step": 6143 + }, + { + "epoch": 0.4162883664204892, + "grad_norm": 8.839093208312988, + "learning_rate": 9.261688000547608e-05, + "loss": 0.9541, + "step": 6144 + }, + { + "epoch": 0.41635612168846126, + "grad_norm": 6.818619728088379, + "learning_rate": 9.261551098637826e-05, + "loss": 0.6284, + "step": 6145 + }, + { + "epoch": 0.41642387695643335, + "grad_norm": 7.31305456161499, + "learning_rate": 9.261414196728044e-05, + "loss": 0.797, + "step": 6146 + }, + { + "epoch": 0.41649163222440544, + "grad_norm": 8.566871643066406, + "learning_rate": 9.261277294818263e-05, + "loss": 1.0893, + "step": 6147 + }, + { + "epoch": 0.41655938749237753, + "grad_norm": 6.028139114379883, + "learning_rate": 9.261140392908481e-05, + "loss": 0.781, + "step": 6148 + }, + { + "epoch": 0.4166271427603496, + "grad_norm": 7.383317947387695, + "learning_rate": 9.261003490998699e-05, + "loss": 0.5846, + "step": 6149 + }, + { + "epoch": 0.4166948980283217, + "grad_norm": 6.392228126525879, + "learning_rate": 9.260866589088917e-05, + "loss": 0.9403, + "step": 6150 + }, + { + "epoch": 0.4167626532962938, + "grad_norm": 6.1830644607543945, + "learning_rate": 9.260729687179137e-05, + "loss": 0.9502, + "step": 6151 + }, + { + "epoch": 0.4168304085642659, + "grad_norm": 6.851447582244873, + "learning_rate": 9.260592785269355e-05, + "loss": 0.8042, + "step": 6152 + }, + { + "epoch": 0.41689816383223793, + "grad_norm": 5.995123386383057, + "learning_rate": 9.260455883359573e-05, + "loss": 0.8875, + "step": 6153 + }, + { + "epoch": 0.41696591910021, + "grad_norm": 6.834797382354736, + "learning_rate": 9.260318981449791e-05, + "loss": 0.8652, + "step": 6154 + }, + { + "epoch": 0.4170336743681821, + "grad_norm": 6.0200371742248535, + "learning_rate": 9.26018207954001e-05, + "loss": 0.9198, + "step": 6155 + }, + { + "epoch": 0.4171014296361542, + "grad_norm": 7.138192176818848, + "learning_rate": 9.260045177630228e-05, + "loss": 0.8284, + "step": 6156 + }, + { + "epoch": 0.4171691849041263, + "grad_norm": 7.5531229972839355, + "learning_rate": 9.259908275720446e-05, + "loss": 0.7188, + "step": 6157 + }, + { + "epoch": 0.4172369401720984, + "grad_norm": 8.347415924072266, + "learning_rate": 9.259771373810666e-05, + "loss": 1.1853, + "step": 6158 + }, + { + "epoch": 0.4173046954400705, + "grad_norm": 8.188237190246582, + "learning_rate": 9.259634471900884e-05, + "loss": 0.7712, + "step": 6159 + }, + { + "epoch": 0.4173724507080426, + "grad_norm": 7.238736629486084, + "learning_rate": 9.259497569991102e-05, + "loss": 0.7731, + "step": 6160 + }, + { + "epoch": 0.4174402059760146, + "grad_norm": 8.175471305847168, + "learning_rate": 9.259360668081321e-05, + "loss": 1.0769, + "step": 6161 + }, + { + "epoch": 0.4175079612439867, + "grad_norm": 6.97186803817749, + "learning_rate": 9.259223766171539e-05, + "loss": 0.7469, + "step": 6162 + }, + { + "epoch": 0.4175757165119588, + "grad_norm": 5.761664390563965, + "learning_rate": 9.259086864261757e-05, + "loss": 0.8716, + "step": 6163 + }, + { + "epoch": 0.4176434717799309, + "grad_norm": 8.567249298095703, + "learning_rate": 9.258949962351975e-05, + "loss": 0.8376, + "step": 6164 + }, + { + "epoch": 0.417711227047903, + "grad_norm": 5.827561378479004, + "learning_rate": 9.258813060442195e-05, + "loss": 0.8468, + "step": 6165 + }, + { + "epoch": 0.41777898231587507, + "grad_norm": 7.336645603179932, + "learning_rate": 9.258676158532413e-05, + "loss": 0.8529, + "step": 6166 + }, + { + "epoch": 0.41784673758384716, + "grad_norm": 7.711108207702637, + "learning_rate": 9.258539256622631e-05, + "loss": 0.8802, + "step": 6167 + }, + { + "epoch": 0.41791449285181925, + "grad_norm": 8.625036239624023, + "learning_rate": 9.258402354712849e-05, + "loss": 1.0263, + "step": 6168 + }, + { + "epoch": 0.4179822481197913, + "grad_norm": 7.005527019500732, + "learning_rate": 9.258265452803067e-05, + "loss": 0.9728, + "step": 6169 + }, + { + "epoch": 0.4180500033877634, + "grad_norm": 6.906123161315918, + "learning_rate": 9.258128550893286e-05, + "loss": 0.8958, + "step": 6170 + }, + { + "epoch": 0.41811775865573547, + "grad_norm": 7.147536754608154, + "learning_rate": 9.257991648983504e-05, + "loss": 1.1466, + "step": 6171 + }, + { + "epoch": 0.41818551392370756, + "grad_norm": 5.8256001472473145, + "learning_rate": 9.257854747073722e-05, + "loss": 0.7189, + "step": 6172 + }, + { + "epoch": 0.41825326919167966, + "grad_norm": 5.144516944885254, + "learning_rate": 9.25771784516394e-05, + "loss": 0.6368, + "step": 6173 + }, + { + "epoch": 0.41832102445965175, + "grad_norm": 7.161872863769531, + "learning_rate": 9.25758094325416e-05, + "loss": 1.0074, + "step": 6174 + }, + { + "epoch": 0.41838877972762384, + "grad_norm": 9.098782539367676, + "learning_rate": 9.257444041344378e-05, + "loss": 0.9261, + "step": 6175 + }, + { + "epoch": 0.41845653499559593, + "grad_norm": 7.566091060638428, + "learning_rate": 9.257307139434596e-05, + "loss": 0.8845, + "step": 6176 + }, + { + "epoch": 0.41852429026356797, + "grad_norm": 8.618456840515137, + "learning_rate": 9.257170237524814e-05, + "loss": 0.8643, + "step": 6177 + }, + { + "epoch": 0.41859204553154006, + "grad_norm": 8.870187759399414, + "learning_rate": 9.257033335615032e-05, + "loss": 1.1915, + "step": 6178 + }, + { + "epoch": 0.41865980079951215, + "grad_norm": 6.494687557220459, + "learning_rate": 9.256896433705251e-05, + "loss": 0.6523, + "step": 6179 + }, + { + "epoch": 0.41872755606748424, + "grad_norm": 7.417816638946533, + "learning_rate": 9.256759531795469e-05, + "loss": 0.9454, + "step": 6180 + }, + { + "epoch": 0.41879531133545633, + "grad_norm": 11.310504913330078, + "learning_rate": 9.256622629885687e-05, + "loss": 1.2984, + "step": 6181 + }, + { + "epoch": 0.4188630666034284, + "grad_norm": 6.3585615158081055, + "learning_rate": 9.256485727975905e-05, + "loss": 0.8379, + "step": 6182 + }, + { + "epoch": 0.4189308218714005, + "grad_norm": 8.134123802185059, + "learning_rate": 9.256348826066123e-05, + "loss": 0.9962, + "step": 6183 + }, + { + "epoch": 0.4189985771393726, + "grad_norm": 5.962007999420166, + "learning_rate": 9.256211924156343e-05, + "loss": 0.8923, + "step": 6184 + }, + { + "epoch": 0.41906633240734464, + "grad_norm": 6.423482894897461, + "learning_rate": 9.256075022246561e-05, + "loss": 0.7991, + "step": 6185 + }, + { + "epoch": 0.41913408767531674, + "grad_norm": 6.406213283538818, + "learning_rate": 9.255938120336779e-05, + "loss": 0.9454, + "step": 6186 + }, + { + "epoch": 0.4192018429432888, + "grad_norm": 4.926299571990967, + "learning_rate": 9.255801218426997e-05, + "loss": 0.6442, + "step": 6187 + }, + { + "epoch": 0.4192695982112609, + "grad_norm": 6.160318374633789, + "learning_rate": 9.255664316517216e-05, + "loss": 0.8575, + "step": 6188 + }, + { + "epoch": 0.419337353479233, + "grad_norm": 9.17479133605957, + "learning_rate": 9.255527414607434e-05, + "loss": 1.2344, + "step": 6189 + }, + { + "epoch": 0.4194051087472051, + "grad_norm": 8.95871639251709, + "learning_rate": 9.255390512697652e-05, + "loss": 0.9193, + "step": 6190 + }, + { + "epoch": 0.4194728640151772, + "grad_norm": 5.636984825134277, + "learning_rate": 9.25525361078787e-05, + "loss": 0.725, + "step": 6191 + }, + { + "epoch": 0.4195406192831493, + "grad_norm": 6.044233798980713, + "learning_rate": 9.255116708878088e-05, + "loss": 0.7526, + "step": 6192 + }, + { + "epoch": 0.4196083745511214, + "grad_norm": 6.230378150939941, + "learning_rate": 9.254979806968308e-05, + "loss": 0.7923, + "step": 6193 + }, + { + "epoch": 0.4196761298190934, + "grad_norm": 8.391528129577637, + "learning_rate": 9.254842905058526e-05, + "loss": 1.0368, + "step": 6194 + }, + { + "epoch": 0.4197438850870655, + "grad_norm": 7.150004863739014, + "learning_rate": 9.254706003148744e-05, + "loss": 0.9087, + "step": 6195 + }, + { + "epoch": 0.4198116403550376, + "grad_norm": 8.44096565246582, + "learning_rate": 9.254569101238962e-05, + "loss": 0.9511, + "step": 6196 + }, + { + "epoch": 0.4198793956230097, + "grad_norm": 7.8519768714904785, + "learning_rate": 9.254432199329181e-05, + "loss": 1.0565, + "step": 6197 + }, + { + "epoch": 0.4199471508909818, + "grad_norm": 6.873769760131836, + "learning_rate": 9.254295297419399e-05, + "loss": 0.9016, + "step": 6198 + }, + { + "epoch": 0.42001490615895387, + "grad_norm": 6.1226396560668945, + "learning_rate": 9.254158395509617e-05, + "loss": 0.8439, + "step": 6199 + }, + { + "epoch": 0.42008266142692596, + "grad_norm": 6.68569803237915, + "learning_rate": 9.254021493599835e-05, + "loss": 0.9836, + "step": 6200 + }, + { + "epoch": 0.42015041669489805, + "grad_norm": 5.907567501068115, + "learning_rate": 9.253884591690055e-05, + "loss": 0.8047, + "step": 6201 + }, + { + "epoch": 0.4202181719628701, + "grad_norm": 5.849989891052246, + "learning_rate": 9.253747689780273e-05, + "loss": 0.8791, + "step": 6202 + }, + { + "epoch": 0.4202859272308422, + "grad_norm": 5.872654914855957, + "learning_rate": 9.253610787870491e-05, + "loss": 0.8368, + "step": 6203 + }, + { + "epoch": 0.4203536824988143, + "grad_norm": 5.799046039581299, + "learning_rate": 9.25347388596071e-05, + "loss": 0.8215, + "step": 6204 + }, + { + "epoch": 0.42042143776678637, + "grad_norm": 5.331169605255127, + "learning_rate": 9.253336984050928e-05, + "loss": 0.9533, + "step": 6205 + }, + { + "epoch": 0.42048919303475846, + "grad_norm": 6.436175346374512, + "learning_rate": 9.253200082141146e-05, + "loss": 0.7356, + "step": 6206 + }, + { + "epoch": 0.42055694830273055, + "grad_norm": 6.290867328643799, + "learning_rate": 9.253063180231366e-05, + "loss": 0.7771, + "step": 6207 + }, + { + "epoch": 0.42062470357070264, + "grad_norm": 8.476110458374023, + "learning_rate": 9.252926278321584e-05, + "loss": 1.0357, + "step": 6208 + }, + { + "epoch": 0.42069245883867473, + "grad_norm": 8.561487197875977, + "learning_rate": 9.252789376411802e-05, + "loss": 0.974, + "step": 6209 + }, + { + "epoch": 0.42076021410664677, + "grad_norm": 7.24920654296875, + "learning_rate": 9.25265247450202e-05, + "loss": 1.2393, + "step": 6210 + }, + { + "epoch": 0.42082796937461886, + "grad_norm": 7.135931015014648, + "learning_rate": 9.252515572592239e-05, + "loss": 1.0491, + "step": 6211 + }, + { + "epoch": 0.42089572464259095, + "grad_norm": 8.867389678955078, + "learning_rate": 9.252378670682457e-05, + "loss": 0.836, + "step": 6212 + }, + { + "epoch": 0.42096347991056304, + "grad_norm": 6.738379955291748, + "learning_rate": 9.252241768772675e-05, + "loss": 0.7829, + "step": 6213 + }, + { + "epoch": 0.42103123517853513, + "grad_norm": 7.226996898651123, + "learning_rate": 9.252104866862893e-05, + "loss": 0.8716, + "step": 6214 + }, + { + "epoch": 0.4210989904465072, + "grad_norm": 6.236578941345215, + "learning_rate": 9.251967964953111e-05, + "loss": 0.62, + "step": 6215 + }, + { + "epoch": 0.4211667457144793, + "grad_norm": 7.801733016967773, + "learning_rate": 9.25183106304333e-05, + "loss": 1.1107, + "step": 6216 + }, + { + "epoch": 0.4212345009824514, + "grad_norm": 6.062146186828613, + "learning_rate": 9.251694161133549e-05, + "loss": 0.8655, + "step": 6217 + }, + { + "epoch": 0.42130225625042345, + "grad_norm": 5.408603668212891, + "learning_rate": 9.251557259223767e-05, + "loss": 0.7054, + "step": 6218 + }, + { + "epoch": 0.42137001151839554, + "grad_norm": 9.531839370727539, + "learning_rate": 9.251420357313985e-05, + "loss": 0.9609, + "step": 6219 + }, + { + "epoch": 0.42143776678636763, + "grad_norm": 6.054145336151123, + "learning_rate": 9.251283455404204e-05, + "loss": 0.9311, + "step": 6220 + }, + { + "epoch": 0.4215055220543397, + "grad_norm": 6.372133731842041, + "learning_rate": 9.251146553494422e-05, + "loss": 0.8797, + "step": 6221 + }, + { + "epoch": 0.4215732773223118, + "grad_norm": 6.394374847412109, + "learning_rate": 9.25100965158464e-05, + "loss": 0.7603, + "step": 6222 + }, + { + "epoch": 0.4216410325902839, + "grad_norm": 7.188971519470215, + "learning_rate": 9.250872749674858e-05, + "loss": 0.5915, + "step": 6223 + }, + { + "epoch": 0.421708787858256, + "grad_norm": 7.44260311126709, + "learning_rate": 9.250735847765076e-05, + "loss": 0.8215, + "step": 6224 + }, + { + "epoch": 0.4217765431262281, + "grad_norm": 7.43394660949707, + "learning_rate": 9.250598945855296e-05, + "loss": 0.891, + "step": 6225 + }, + { + "epoch": 0.4218442983942001, + "grad_norm": 6.9161200523376465, + "learning_rate": 9.250462043945514e-05, + "loss": 0.8795, + "step": 6226 + }, + { + "epoch": 0.4219120536621722, + "grad_norm": 6.955477714538574, + "learning_rate": 9.250325142035732e-05, + "loss": 0.9781, + "step": 6227 + }, + { + "epoch": 0.4219798089301443, + "grad_norm": 7.632521629333496, + "learning_rate": 9.25018824012595e-05, + "loss": 0.7374, + "step": 6228 + }, + { + "epoch": 0.4220475641981164, + "grad_norm": 7.344869613647461, + "learning_rate": 9.250051338216169e-05, + "loss": 0.9506, + "step": 6229 + }, + { + "epoch": 0.4221153194660885, + "grad_norm": 7.199647426605225, + "learning_rate": 9.249914436306387e-05, + "loss": 0.9225, + "step": 6230 + }, + { + "epoch": 0.4221830747340606, + "grad_norm": 5.884180068969727, + "learning_rate": 9.249777534396605e-05, + "loss": 0.729, + "step": 6231 + }, + { + "epoch": 0.4222508300020327, + "grad_norm": 6.82388973236084, + "learning_rate": 9.249640632486823e-05, + "loss": 1.0978, + "step": 6232 + }, + { + "epoch": 0.42231858527000476, + "grad_norm": 8.883796691894531, + "learning_rate": 9.249503730577041e-05, + "loss": 1.1025, + "step": 6233 + }, + { + "epoch": 0.4223863405379768, + "grad_norm": 6.1692938804626465, + "learning_rate": 9.24936682866726e-05, + "loss": 0.6734, + "step": 6234 + }, + { + "epoch": 0.4224540958059489, + "grad_norm": 8.827327728271484, + "learning_rate": 9.249229926757479e-05, + "loss": 1.1097, + "step": 6235 + }, + { + "epoch": 0.422521851073921, + "grad_norm": 6.649320602416992, + "learning_rate": 9.249093024847697e-05, + "loss": 0.9978, + "step": 6236 + }, + { + "epoch": 0.4225896063418931, + "grad_norm": 8.04487419128418, + "learning_rate": 9.248956122937915e-05, + "loss": 0.9293, + "step": 6237 + }, + { + "epoch": 0.42265736160986517, + "grad_norm": 7.031024932861328, + "learning_rate": 9.248819221028133e-05, + "loss": 0.7474, + "step": 6238 + }, + { + "epoch": 0.42272511687783726, + "grad_norm": 4.858736038208008, + "learning_rate": 9.248682319118352e-05, + "loss": 0.6429, + "step": 6239 + }, + { + "epoch": 0.42279287214580935, + "grad_norm": 6.075960159301758, + "learning_rate": 9.24854541720857e-05, + "loss": 0.7595, + "step": 6240 + }, + { + "epoch": 0.42286062741378144, + "grad_norm": 11.653654098510742, + "learning_rate": 9.248408515298788e-05, + "loss": 0.7962, + "step": 6241 + }, + { + "epoch": 0.4229283826817535, + "grad_norm": 6.255251884460449, + "learning_rate": 9.248271613389006e-05, + "loss": 0.9027, + "step": 6242 + }, + { + "epoch": 0.42299613794972557, + "grad_norm": 6.908040523529053, + "learning_rate": 9.248134711479226e-05, + "loss": 0.936, + "step": 6243 + }, + { + "epoch": 0.42306389321769766, + "grad_norm": 7.853124141693115, + "learning_rate": 9.247997809569444e-05, + "loss": 1.0966, + "step": 6244 + }, + { + "epoch": 0.42313164848566975, + "grad_norm": 7.661264896392822, + "learning_rate": 9.247860907659662e-05, + "loss": 0.8407, + "step": 6245 + }, + { + "epoch": 0.42319940375364185, + "grad_norm": 6.52161169052124, + "learning_rate": 9.24772400574988e-05, + "loss": 0.862, + "step": 6246 + }, + { + "epoch": 0.42326715902161394, + "grad_norm": 6.456472873687744, + "learning_rate": 9.247587103840099e-05, + "loss": 0.713, + "step": 6247 + }, + { + "epoch": 0.42333491428958603, + "grad_norm": 6.655475616455078, + "learning_rate": 9.247450201930317e-05, + "loss": 0.7422, + "step": 6248 + }, + { + "epoch": 0.4234026695575581, + "grad_norm": 6.448397636413574, + "learning_rate": 9.247313300020535e-05, + "loss": 0.8645, + "step": 6249 + }, + { + "epoch": 0.4234704248255302, + "grad_norm": 5.964814186096191, + "learning_rate": 9.247176398110755e-05, + "loss": 0.6173, + "step": 6250 + }, + { + "epoch": 0.42353818009350225, + "grad_norm": 6.125659465789795, + "learning_rate": 9.247039496200973e-05, + "loss": 0.6951, + "step": 6251 + }, + { + "epoch": 0.42360593536147434, + "grad_norm": 6.881291389465332, + "learning_rate": 9.246902594291191e-05, + "loss": 0.7525, + "step": 6252 + }, + { + "epoch": 0.42367369062944643, + "grad_norm": 6.647464752197266, + "learning_rate": 9.24676569238141e-05, + "loss": 0.6757, + "step": 6253 + }, + { + "epoch": 0.4237414458974185, + "grad_norm": 6.530306816101074, + "learning_rate": 9.246628790471628e-05, + "loss": 1.1478, + "step": 6254 + }, + { + "epoch": 0.4238092011653906, + "grad_norm": 7.5514984130859375, + "learning_rate": 9.246491888561846e-05, + "loss": 0.868, + "step": 6255 + }, + { + "epoch": 0.4238769564333627, + "grad_norm": 10.405878067016602, + "learning_rate": 9.246354986652064e-05, + "loss": 1.0607, + "step": 6256 + }, + { + "epoch": 0.4239447117013348, + "grad_norm": 6.7608418464660645, + "learning_rate": 9.246218084742284e-05, + "loss": 0.9173, + "step": 6257 + }, + { + "epoch": 0.4240124669693069, + "grad_norm": 7.583088397979736, + "learning_rate": 9.246081182832502e-05, + "loss": 0.9091, + "step": 6258 + }, + { + "epoch": 0.4240802222372789, + "grad_norm": 7.009425163269043, + "learning_rate": 9.24594428092272e-05, + "loss": 0.9509, + "step": 6259 + }, + { + "epoch": 0.424147977505251, + "grad_norm": 10.783225059509277, + "learning_rate": 9.245807379012938e-05, + "loss": 0.9033, + "step": 6260 + }, + { + "epoch": 0.4242157327732231, + "grad_norm": 6.1244611740112305, + "learning_rate": 9.245670477103156e-05, + "loss": 0.7285, + "step": 6261 + }, + { + "epoch": 0.4242834880411952, + "grad_norm": 6.983814716339111, + "learning_rate": 9.245533575193375e-05, + "loss": 0.9463, + "step": 6262 + }, + { + "epoch": 0.4243512433091673, + "grad_norm": 6.019556522369385, + "learning_rate": 9.245396673283593e-05, + "loss": 0.8072, + "step": 6263 + }, + { + "epoch": 0.4244189985771394, + "grad_norm": 7.2982001304626465, + "learning_rate": 9.245259771373811e-05, + "loss": 0.8905, + "step": 6264 + }, + { + "epoch": 0.4244867538451115, + "grad_norm": 7.231256008148193, + "learning_rate": 9.245122869464029e-05, + "loss": 0.9114, + "step": 6265 + }, + { + "epoch": 0.42455450911308357, + "grad_norm": 6.597518444061279, + "learning_rate": 9.244985967554249e-05, + "loss": 1.0431, + "step": 6266 + }, + { + "epoch": 0.4246222643810556, + "grad_norm": 8.651244163513184, + "learning_rate": 9.244849065644467e-05, + "loss": 1.1346, + "step": 6267 + }, + { + "epoch": 0.4246900196490277, + "grad_norm": 9.052881240844727, + "learning_rate": 9.244712163734685e-05, + "loss": 0.7779, + "step": 6268 + }, + { + "epoch": 0.4247577749169998, + "grad_norm": 6.886295318603516, + "learning_rate": 9.244575261824903e-05, + "loss": 0.9643, + "step": 6269 + }, + { + "epoch": 0.4248255301849719, + "grad_norm": 6.241147518157959, + "learning_rate": 9.244438359915121e-05, + "loss": 0.7539, + "step": 6270 + }, + { + "epoch": 0.42489328545294397, + "grad_norm": 7.910638332366943, + "learning_rate": 9.24430145800534e-05, + "loss": 0.9967, + "step": 6271 + }, + { + "epoch": 0.42496104072091606, + "grad_norm": 7.068695545196533, + "learning_rate": 9.244164556095558e-05, + "loss": 0.9531, + "step": 6272 + }, + { + "epoch": 0.42502879598888815, + "grad_norm": 7.7406907081604, + "learning_rate": 9.244027654185776e-05, + "loss": 0.8593, + "step": 6273 + }, + { + "epoch": 0.42509655125686024, + "grad_norm": 6.548360824584961, + "learning_rate": 9.243890752275994e-05, + "loss": 0.7806, + "step": 6274 + }, + { + "epoch": 0.4251643065248323, + "grad_norm": 6.20359468460083, + "learning_rate": 9.243753850366214e-05, + "loss": 0.846, + "step": 6275 + }, + { + "epoch": 0.42523206179280437, + "grad_norm": 6.6235270500183105, + "learning_rate": 9.243616948456432e-05, + "loss": 0.806, + "step": 6276 + }, + { + "epoch": 0.42529981706077646, + "grad_norm": 7.216398239135742, + "learning_rate": 9.24348004654665e-05, + "loss": 1.0497, + "step": 6277 + }, + { + "epoch": 0.42536757232874856, + "grad_norm": 6.946768283843994, + "learning_rate": 9.243343144636868e-05, + "loss": 0.7862, + "step": 6278 + }, + { + "epoch": 0.42543532759672065, + "grad_norm": 6.441595554351807, + "learning_rate": 9.243206242727086e-05, + "loss": 0.8663, + "step": 6279 + }, + { + "epoch": 0.42550308286469274, + "grad_norm": 8.521356582641602, + "learning_rate": 9.243069340817305e-05, + "loss": 1.3689, + "step": 6280 + }, + { + "epoch": 0.42557083813266483, + "grad_norm": 7.311733722686768, + "learning_rate": 9.242932438907523e-05, + "loss": 0.8788, + "step": 6281 + }, + { + "epoch": 0.4256385934006369, + "grad_norm": 7.157277584075928, + "learning_rate": 9.242795536997741e-05, + "loss": 0.8245, + "step": 6282 + }, + { + "epoch": 0.42570634866860896, + "grad_norm": 5.332032680511475, + "learning_rate": 9.242658635087959e-05, + "loss": 0.7333, + "step": 6283 + }, + { + "epoch": 0.42577410393658105, + "grad_norm": 6.090252876281738, + "learning_rate": 9.242521733178179e-05, + "loss": 0.7084, + "step": 6284 + }, + { + "epoch": 0.42584185920455314, + "grad_norm": 6.971512317657471, + "learning_rate": 9.242384831268397e-05, + "loss": 0.7226, + "step": 6285 + }, + { + "epoch": 0.42590961447252523, + "grad_norm": 5.37277364730835, + "learning_rate": 9.242247929358615e-05, + "loss": 0.701, + "step": 6286 + }, + { + "epoch": 0.4259773697404973, + "grad_norm": 6.7667622566223145, + "learning_rate": 9.242111027448833e-05, + "loss": 0.9389, + "step": 6287 + }, + { + "epoch": 0.4260451250084694, + "grad_norm": 6.8329596519470215, + "learning_rate": 9.241974125539051e-05, + "loss": 0.8574, + "step": 6288 + }, + { + "epoch": 0.4261128802764415, + "grad_norm": 6.124345779418945, + "learning_rate": 9.24183722362927e-05, + "loss": 0.9497, + "step": 6289 + }, + { + "epoch": 0.4261806355444136, + "grad_norm": 6.190674304962158, + "learning_rate": 9.241700321719488e-05, + "loss": 0.8933, + "step": 6290 + }, + { + "epoch": 0.42624839081238564, + "grad_norm": 6.881906986236572, + "learning_rate": 9.241563419809706e-05, + "loss": 1.0343, + "step": 6291 + }, + { + "epoch": 0.4263161460803577, + "grad_norm": 6.8810648918151855, + "learning_rate": 9.241426517899924e-05, + "loss": 1.1141, + "step": 6292 + }, + { + "epoch": 0.4263839013483298, + "grad_norm": 7.524305820465088, + "learning_rate": 9.241289615990144e-05, + "loss": 0.9136, + "step": 6293 + }, + { + "epoch": 0.4264516566163019, + "grad_norm": 5.732180595397949, + "learning_rate": 9.241152714080362e-05, + "loss": 0.8808, + "step": 6294 + }, + { + "epoch": 0.426519411884274, + "grad_norm": 5.485534191131592, + "learning_rate": 9.24101581217058e-05, + "loss": 0.8755, + "step": 6295 + }, + { + "epoch": 0.4265871671522461, + "grad_norm": 7.75483512878418, + "learning_rate": 9.240878910260799e-05, + "loss": 0.6602, + "step": 6296 + }, + { + "epoch": 0.4266549224202182, + "grad_norm": 5.6334075927734375, + "learning_rate": 9.240742008351017e-05, + "loss": 0.6075, + "step": 6297 + }, + { + "epoch": 0.4267226776881903, + "grad_norm": 6.316882133483887, + "learning_rate": 9.240605106441235e-05, + "loss": 0.8607, + "step": 6298 + }, + { + "epoch": 0.42679043295616237, + "grad_norm": 7.0021257400512695, + "learning_rate": 9.240468204531455e-05, + "loss": 0.8648, + "step": 6299 + }, + { + "epoch": 0.4268581882241344, + "grad_norm": 5.8421125411987305, + "learning_rate": 9.240331302621673e-05, + "loss": 0.7887, + "step": 6300 + }, + { + "epoch": 0.4269259434921065, + "grad_norm": 8.877615928649902, + "learning_rate": 9.24019440071189e-05, + "loss": 1.3777, + "step": 6301 + }, + { + "epoch": 0.4269936987600786, + "grad_norm": 7.440095901489258, + "learning_rate": 9.240057498802109e-05, + "loss": 1.0015, + "step": 6302 + }, + { + "epoch": 0.4270614540280507, + "grad_norm": 6.460360527038574, + "learning_rate": 9.239920596892328e-05, + "loss": 0.6861, + "step": 6303 + }, + { + "epoch": 0.42712920929602277, + "grad_norm": 9.731608390808105, + "learning_rate": 9.239783694982546e-05, + "loss": 0.8601, + "step": 6304 + }, + { + "epoch": 0.42719696456399486, + "grad_norm": 7.891725540161133, + "learning_rate": 9.239646793072764e-05, + "loss": 0.7478, + "step": 6305 + }, + { + "epoch": 0.42726471983196695, + "grad_norm": 6.742074966430664, + "learning_rate": 9.239509891162982e-05, + "loss": 0.7774, + "step": 6306 + }, + { + "epoch": 0.42733247509993905, + "grad_norm": 7.515460014343262, + "learning_rate": 9.239372989253202e-05, + "loss": 0.9524, + "step": 6307 + }, + { + "epoch": 0.4274002303679111, + "grad_norm": 6.278934478759766, + "learning_rate": 9.23923608734342e-05, + "loss": 0.9216, + "step": 6308 + }, + { + "epoch": 0.4274679856358832, + "grad_norm": 6.528146266937256, + "learning_rate": 9.239099185433638e-05, + "loss": 0.9959, + "step": 6309 + }, + { + "epoch": 0.42753574090385527, + "grad_norm": 5.953476905822754, + "learning_rate": 9.238962283523856e-05, + "loss": 1.0062, + "step": 6310 + }, + { + "epoch": 0.42760349617182736, + "grad_norm": 7.488530158996582, + "learning_rate": 9.238825381614074e-05, + "loss": 0.8633, + "step": 6311 + }, + { + "epoch": 0.42767125143979945, + "grad_norm": 6.974276542663574, + "learning_rate": 9.238688479704293e-05, + "loss": 1.0849, + "step": 6312 + }, + { + "epoch": 0.42773900670777154, + "grad_norm": 7.560183048248291, + "learning_rate": 9.238551577794511e-05, + "loss": 0.9238, + "step": 6313 + }, + { + "epoch": 0.42780676197574363, + "grad_norm": 6.859951019287109, + "learning_rate": 9.238414675884729e-05, + "loss": 0.7021, + "step": 6314 + }, + { + "epoch": 0.4278745172437157, + "grad_norm": 8.095294952392578, + "learning_rate": 9.238277773974947e-05, + "loss": 0.9374, + "step": 6315 + }, + { + "epoch": 0.42794227251168776, + "grad_norm": 6.5458221435546875, + "learning_rate": 9.238140872065165e-05, + "loss": 0.7089, + "step": 6316 + }, + { + "epoch": 0.42801002777965985, + "grad_norm": 6.9012274742126465, + "learning_rate": 9.238003970155385e-05, + "loss": 1.0967, + "step": 6317 + }, + { + "epoch": 0.42807778304763194, + "grad_norm": 7.991293430328369, + "learning_rate": 9.237867068245603e-05, + "loss": 0.8149, + "step": 6318 + }, + { + "epoch": 0.42814553831560404, + "grad_norm": 6.636972904205322, + "learning_rate": 9.23773016633582e-05, + "loss": 0.9311, + "step": 6319 + }, + { + "epoch": 0.4282132935835761, + "grad_norm": 7.938980579376221, + "learning_rate": 9.237593264426039e-05, + "loss": 1.021, + "step": 6320 + }, + { + "epoch": 0.4282810488515482, + "grad_norm": 6.600100994110107, + "learning_rate": 9.237456362516258e-05, + "loss": 0.9375, + "step": 6321 + }, + { + "epoch": 0.4283488041195203, + "grad_norm": 8.848981857299805, + "learning_rate": 9.237319460606476e-05, + "loss": 0.7755, + "step": 6322 + }, + { + "epoch": 0.4284165593874924, + "grad_norm": 8.488492012023926, + "learning_rate": 9.237182558696694e-05, + "loss": 0.8267, + "step": 6323 + }, + { + "epoch": 0.42848431465546444, + "grad_norm": 5.880466938018799, + "learning_rate": 9.237045656786912e-05, + "loss": 0.7597, + "step": 6324 + }, + { + "epoch": 0.42855206992343653, + "grad_norm": 6.318797588348389, + "learning_rate": 9.23690875487713e-05, + "loss": 1.0001, + "step": 6325 + }, + { + "epoch": 0.4286198251914086, + "grad_norm": 6.730221271514893, + "learning_rate": 9.23677185296735e-05, + "loss": 0.9811, + "step": 6326 + }, + { + "epoch": 0.4286875804593807, + "grad_norm": 6.913969993591309, + "learning_rate": 9.236634951057568e-05, + "loss": 0.8782, + "step": 6327 + }, + { + "epoch": 0.4287553357273528, + "grad_norm": 7.533535480499268, + "learning_rate": 9.236498049147786e-05, + "loss": 0.652, + "step": 6328 + }, + { + "epoch": 0.4288230909953249, + "grad_norm": 7.687053680419922, + "learning_rate": 9.236361147238004e-05, + "loss": 0.9665, + "step": 6329 + }, + { + "epoch": 0.428890846263297, + "grad_norm": 7.747707843780518, + "learning_rate": 9.236224245328223e-05, + "loss": 1.1481, + "step": 6330 + }, + { + "epoch": 0.4289586015312691, + "grad_norm": 7.709572792053223, + "learning_rate": 9.236087343418441e-05, + "loss": 0.9191, + "step": 6331 + }, + { + "epoch": 0.4290263567992411, + "grad_norm": 8.473289489746094, + "learning_rate": 9.235950441508659e-05, + "loss": 1.0646, + "step": 6332 + }, + { + "epoch": 0.4290941120672132, + "grad_norm": 6.672016620635986, + "learning_rate": 9.235813539598877e-05, + "loss": 0.7865, + "step": 6333 + }, + { + "epoch": 0.4291618673351853, + "grad_norm": 6.022176265716553, + "learning_rate": 9.235676637689095e-05, + "loss": 0.8593, + "step": 6334 + }, + { + "epoch": 0.4292296226031574, + "grad_norm": 6.375059604644775, + "learning_rate": 9.235539735779315e-05, + "loss": 0.8754, + "step": 6335 + }, + { + "epoch": 0.4292973778711295, + "grad_norm": 6.617783546447754, + "learning_rate": 9.235402833869533e-05, + "loss": 1.022, + "step": 6336 + }, + { + "epoch": 0.4293651331391016, + "grad_norm": 5.790512561798096, + "learning_rate": 9.235265931959751e-05, + "loss": 0.8903, + "step": 6337 + }, + { + "epoch": 0.42943288840707367, + "grad_norm": 6.528816223144531, + "learning_rate": 9.235129030049969e-05, + "loss": 0.8999, + "step": 6338 + }, + { + "epoch": 0.42950064367504576, + "grad_norm": 7.407886505126953, + "learning_rate": 9.234992128140188e-05, + "loss": 0.6748, + "step": 6339 + }, + { + "epoch": 0.4295683989430178, + "grad_norm": 6.933940887451172, + "learning_rate": 9.234855226230406e-05, + "loss": 1.0334, + "step": 6340 + }, + { + "epoch": 0.4296361542109899, + "grad_norm": 6.4698686599731445, + "learning_rate": 9.234718324320624e-05, + "loss": 0.8778, + "step": 6341 + }, + { + "epoch": 0.429703909478962, + "grad_norm": 7.047656059265137, + "learning_rate": 9.234581422410844e-05, + "loss": 0.9406, + "step": 6342 + }, + { + "epoch": 0.42977166474693407, + "grad_norm": 6.107243061065674, + "learning_rate": 9.234444520501062e-05, + "loss": 0.716, + "step": 6343 + }, + { + "epoch": 0.42983942001490616, + "grad_norm": 7.644023895263672, + "learning_rate": 9.23430761859128e-05, + "loss": 0.9632, + "step": 6344 + }, + { + "epoch": 0.42990717528287825, + "grad_norm": 8.450486183166504, + "learning_rate": 9.234170716681499e-05, + "loss": 0.9608, + "step": 6345 + }, + { + "epoch": 0.42997493055085034, + "grad_norm": 6.914335250854492, + "learning_rate": 9.234033814771717e-05, + "loss": 0.9473, + "step": 6346 + }, + { + "epoch": 0.43004268581882243, + "grad_norm": 7.677610874176025, + "learning_rate": 9.233896912861935e-05, + "loss": 0.7403, + "step": 6347 + }, + { + "epoch": 0.4301104410867945, + "grad_norm": 8.03879165649414, + "learning_rate": 9.233760010952153e-05, + "loss": 1.1334, + "step": 6348 + }, + { + "epoch": 0.43017819635476656, + "grad_norm": 8.243664741516113, + "learning_rate": 9.233623109042373e-05, + "loss": 0.8745, + "step": 6349 + }, + { + "epoch": 0.43024595162273865, + "grad_norm": 5.417998313903809, + "learning_rate": 9.23348620713259e-05, + "loss": 0.8243, + "step": 6350 + }, + { + "epoch": 0.43031370689071075, + "grad_norm": 9.148211479187012, + "learning_rate": 9.233349305222809e-05, + "loss": 1.0485, + "step": 6351 + }, + { + "epoch": 0.43038146215868284, + "grad_norm": 5.762056827545166, + "learning_rate": 9.233212403313027e-05, + "loss": 0.8528, + "step": 6352 + }, + { + "epoch": 0.43044921742665493, + "grad_norm": 7.960264682769775, + "learning_rate": 9.233075501403246e-05, + "loss": 1.1308, + "step": 6353 + }, + { + "epoch": 0.430516972694627, + "grad_norm": 5.9596710205078125, + "learning_rate": 9.232938599493464e-05, + "loss": 0.8385, + "step": 6354 + }, + { + "epoch": 0.4305847279625991, + "grad_norm": 7.483585357666016, + "learning_rate": 9.232801697583682e-05, + "loss": 0.8117, + "step": 6355 + }, + { + "epoch": 0.4306524832305712, + "grad_norm": 11.998135566711426, + "learning_rate": 9.2326647956739e-05, + "loss": 0.8077, + "step": 6356 + }, + { + "epoch": 0.43072023849854324, + "grad_norm": 5.418194770812988, + "learning_rate": 9.232527893764118e-05, + "loss": 0.6575, + "step": 6357 + }, + { + "epoch": 0.43078799376651533, + "grad_norm": 7.568809509277344, + "learning_rate": 9.232390991854338e-05, + "loss": 0.9036, + "step": 6358 + }, + { + "epoch": 0.4308557490344874, + "grad_norm": 6.613016128540039, + "learning_rate": 9.232254089944556e-05, + "loss": 0.9918, + "step": 6359 + }, + { + "epoch": 0.4309235043024595, + "grad_norm": 7.7656683921813965, + "learning_rate": 9.232117188034774e-05, + "loss": 1.1026, + "step": 6360 + }, + { + "epoch": 0.4309912595704316, + "grad_norm": 6.714265823364258, + "learning_rate": 9.231980286124992e-05, + "loss": 0.8636, + "step": 6361 + }, + { + "epoch": 0.4310590148384037, + "grad_norm": 6.468122482299805, + "learning_rate": 9.231843384215211e-05, + "loss": 0.7608, + "step": 6362 + }, + { + "epoch": 0.4311267701063758, + "grad_norm": 7.882842540740967, + "learning_rate": 9.231706482305429e-05, + "loss": 0.8926, + "step": 6363 + }, + { + "epoch": 0.4311945253743479, + "grad_norm": 6.002429485321045, + "learning_rate": 9.231569580395647e-05, + "loss": 0.8137, + "step": 6364 + }, + { + "epoch": 0.4312622806423199, + "grad_norm": 6.818765163421631, + "learning_rate": 9.231432678485865e-05, + "loss": 0.8249, + "step": 6365 + }, + { + "epoch": 0.431330035910292, + "grad_norm": 8.535319328308105, + "learning_rate": 9.231295776576083e-05, + "loss": 0.888, + "step": 6366 + }, + { + "epoch": 0.4313977911782641, + "grad_norm": 7.823378086090088, + "learning_rate": 9.231158874666303e-05, + "loss": 0.8847, + "step": 6367 + }, + { + "epoch": 0.4314655464462362, + "grad_norm": 7.392731189727783, + "learning_rate": 9.23102197275652e-05, + "loss": 0.7595, + "step": 6368 + }, + { + "epoch": 0.4315333017142083, + "grad_norm": 5.637443542480469, + "learning_rate": 9.230885070846739e-05, + "loss": 0.6636, + "step": 6369 + }, + { + "epoch": 0.4316010569821804, + "grad_norm": 8.26915454864502, + "learning_rate": 9.230748168936957e-05, + "loss": 0.7704, + "step": 6370 + }, + { + "epoch": 0.43166881225015247, + "grad_norm": 8.045785903930664, + "learning_rate": 9.230611267027175e-05, + "loss": 0.8427, + "step": 6371 + }, + { + "epoch": 0.43173656751812456, + "grad_norm": 8.981038093566895, + "learning_rate": 9.230474365117394e-05, + "loss": 0.9997, + "step": 6372 + }, + { + "epoch": 0.4318043227860966, + "grad_norm": 7.137066841125488, + "learning_rate": 9.230337463207612e-05, + "loss": 0.7654, + "step": 6373 + }, + { + "epoch": 0.4318720780540687, + "grad_norm": 6.7576165199279785, + "learning_rate": 9.23020056129783e-05, + "loss": 0.8766, + "step": 6374 + }, + { + "epoch": 0.4319398333220408, + "grad_norm": 6.381602764129639, + "learning_rate": 9.230063659388048e-05, + "loss": 0.9203, + "step": 6375 + }, + { + "epoch": 0.43200758859001287, + "grad_norm": 6.969717502593994, + "learning_rate": 9.229926757478268e-05, + "loss": 0.9507, + "step": 6376 + }, + { + "epoch": 0.43207534385798496, + "grad_norm": 5.76108455657959, + "learning_rate": 9.229789855568486e-05, + "loss": 0.8533, + "step": 6377 + }, + { + "epoch": 0.43214309912595705, + "grad_norm": 6.446774005889893, + "learning_rate": 9.229652953658704e-05, + "loss": 0.9094, + "step": 6378 + }, + { + "epoch": 0.43221085439392914, + "grad_norm": 9.543696403503418, + "learning_rate": 9.229516051748922e-05, + "loss": 0.8955, + "step": 6379 + }, + { + "epoch": 0.43227860966190124, + "grad_norm": 8.095965385437012, + "learning_rate": 9.22937914983914e-05, + "loss": 0.633, + "step": 6380 + }, + { + "epoch": 0.4323463649298733, + "grad_norm": 9.561396598815918, + "learning_rate": 9.229242247929359e-05, + "loss": 0.9367, + "step": 6381 + }, + { + "epoch": 0.43241412019784536, + "grad_norm": 7.348874092102051, + "learning_rate": 9.229105346019577e-05, + "loss": 0.7585, + "step": 6382 + }, + { + "epoch": 0.43248187546581746, + "grad_norm": 5.866179466247559, + "learning_rate": 9.228968444109795e-05, + "loss": 0.8046, + "step": 6383 + }, + { + "epoch": 0.43254963073378955, + "grad_norm": 7.945270538330078, + "learning_rate": 9.228831542200013e-05, + "loss": 0.8599, + "step": 6384 + }, + { + "epoch": 0.43261738600176164, + "grad_norm": 6.365564346313477, + "learning_rate": 9.228694640290233e-05, + "loss": 0.8213, + "step": 6385 + }, + { + "epoch": 0.43268514126973373, + "grad_norm": 7.802513599395752, + "learning_rate": 9.22855773838045e-05, + "loss": 0.7779, + "step": 6386 + }, + { + "epoch": 0.4327528965377058, + "grad_norm": 7.740975379943848, + "learning_rate": 9.228420836470669e-05, + "loss": 0.9564, + "step": 6387 + }, + { + "epoch": 0.4328206518056779, + "grad_norm": 8.962937355041504, + "learning_rate": 9.228283934560887e-05, + "loss": 0.7704, + "step": 6388 + }, + { + "epoch": 0.43288840707364995, + "grad_norm": 9.266582489013672, + "learning_rate": 9.228147032651106e-05, + "loss": 1.055, + "step": 6389 + }, + { + "epoch": 0.43295616234162204, + "grad_norm": 8.01395320892334, + "learning_rate": 9.228010130741324e-05, + "loss": 1.0197, + "step": 6390 + }, + { + "epoch": 0.43302391760959413, + "grad_norm": 6.83071231842041, + "learning_rate": 9.227873228831543e-05, + "loss": 0.8489, + "step": 6391 + }, + { + "epoch": 0.4330916728775662, + "grad_norm": 7.923128604888916, + "learning_rate": 9.227736326921762e-05, + "loss": 0.8601, + "step": 6392 + }, + { + "epoch": 0.4331594281455383, + "grad_norm": 6.395363807678223, + "learning_rate": 9.22759942501198e-05, + "loss": 0.8167, + "step": 6393 + }, + { + "epoch": 0.4332271834135104, + "grad_norm": 5.6835408210754395, + "learning_rate": 9.227462523102198e-05, + "loss": 0.895, + "step": 6394 + }, + { + "epoch": 0.4332949386814825, + "grad_norm": 7.308006286621094, + "learning_rate": 9.227325621192417e-05, + "loss": 0.985, + "step": 6395 + }, + { + "epoch": 0.4333626939494546, + "grad_norm": 6.754067897796631, + "learning_rate": 9.227188719282635e-05, + "loss": 0.9144, + "step": 6396 + }, + { + "epoch": 0.4334304492174266, + "grad_norm": 7.917202949523926, + "learning_rate": 9.227051817372853e-05, + "loss": 0.8683, + "step": 6397 + }, + { + "epoch": 0.4334982044853987, + "grad_norm": 7.502418041229248, + "learning_rate": 9.226914915463071e-05, + "loss": 1.0849, + "step": 6398 + }, + { + "epoch": 0.4335659597533708, + "grad_norm": 6.86057186126709, + "learning_rate": 9.22677801355329e-05, + "loss": 0.9286, + "step": 6399 + }, + { + "epoch": 0.4336337150213429, + "grad_norm": 8.595507621765137, + "learning_rate": 9.226641111643509e-05, + "loss": 0.9766, + "step": 6400 + }, + { + "epoch": 0.433701470289315, + "grad_norm": 6.231963157653809, + "learning_rate": 9.226504209733727e-05, + "loss": 0.8714, + "step": 6401 + }, + { + "epoch": 0.4337692255572871, + "grad_norm": 7.1258625984191895, + "learning_rate": 9.226367307823945e-05, + "loss": 1.1432, + "step": 6402 + }, + { + "epoch": 0.4338369808252592, + "grad_norm": 6.715908050537109, + "learning_rate": 9.226230405914163e-05, + "loss": 0.8271, + "step": 6403 + }, + { + "epoch": 0.43390473609323127, + "grad_norm": 7.551729679107666, + "learning_rate": 9.226093504004382e-05, + "loss": 0.8919, + "step": 6404 + }, + { + "epoch": 0.43397249136120336, + "grad_norm": 7.136280059814453, + "learning_rate": 9.2259566020946e-05, + "loss": 1.0586, + "step": 6405 + }, + { + "epoch": 0.4340402466291754, + "grad_norm": 5.612763404846191, + "learning_rate": 9.225819700184818e-05, + "loss": 0.7163, + "step": 6406 + }, + { + "epoch": 0.4341080018971475, + "grad_norm": 7.445765972137451, + "learning_rate": 9.225682798275036e-05, + "loss": 1.0488, + "step": 6407 + }, + { + "epoch": 0.4341757571651196, + "grad_norm": 7.239571571350098, + "learning_rate": 9.225545896365255e-05, + "loss": 0.9174, + "step": 6408 + }, + { + "epoch": 0.43424351243309167, + "grad_norm": 7.86649751663208, + "learning_rate": 9.225408994455474e-05, + "loss": 0.9312, + "step": 6409 + }, + { + "epoch": 0.43431126770106376, + "grad_norm": 7.049384117126465, + "learning_rate": 9.225272092545692e-05, + "loss": 0.8739, + "step": 6410 + }, + { + "epoch": 0.43437902296903586, + "grad_norm": 6.824424743652344, + "learning_rate": 9.22513519063591e-05, + "loss": 0.9335, + "step": 6411 + }, + { + "epoch": 0.43444677823700795, + "grad_norm": 6.616923809051514, + "learning_rate": 9.224998288726128e-05, + "loss": 0.7854, + "step": 6412 + }, + { + "epoch": 0.43451453350498004, + "grad_norm": 6.989858150482178, + "learning_rate": 9.224861386816347e-05, + "loss": 0.915, + "step": 6413 + }, + { + "epoch": 0.4345822887729521, + "grad_norm": 7.5784406661987305, + "learning_rate": 9.224724484906565e-05, + "loss": 0.9316, + "step": 6414 + }, + { + "epoch": 0.43465004404092417, + "grad_norm": 5.958365440368652, + "learning_rate": 9.224587582996783e-05, + "loss": 0.9103, + "step": 6415 + }, + { + "epoch": 0.43471779930889626, + "grad_norm": 6.811526775360107, + "learning_rate": 9.224450681087001e-05, + "loss": 0.8778, + "step": 6416 + }, + { + "epoch": 0.43478555457686835, + "grad_norm": 6.9535722732543945, + "learning_rate": 9.22431377917722e-05, + "loss": 0.9407, + "step": 6417 + }, + { + "epoch": 0.43485330984484044, + "grad_norm": 7.412068843841553, + "learning_rate": 9.224176877267439e-05, + "loss": 0.7526, + "step": 6418 + }, + { + "epoch": 0.43492106511281253, + "grad_norm": 6.990227699279785, + "learning_rate": 9.224039975357657e-05, + "loss": 0.8432, + "step": 6419 + }, + { + "epoch": 0.4349888203807846, + "grad_norm": 7.477065563201904, + "learning_rate": 9.223903073447875e-05, + "loss": 0.9579, + "step": 6420 + }, + { + "epoch": 0.4350565756487567, + "grad_norm": 6.9343581199646, + "learning_rate": 9.223766171538093e-05, + "loss": 0.7967, + "step": 6421 + }, + { + "epoch": 0.43512433091672875, + "grad_norm": 7.360040664672852, + "learning_rate": 9.223629269628312e-05, + "loss": 0.807, + "step": 6422 + }, + { + "epoch": 0.43519208618470084, + "grad_norm": 6.708899974822998, + "learning_rate": 9.22349236771853e-05, + "loss": 0.723, + "step": 6423 + }, + { + "epoch": 0.43525984145267294, + "grad_norm": 7.195553302764893, + "learning_rate": 9.223355465808748e-05, + "loss": 1.0543, + "step": 6424 + }, + { + "epoch": 0.435327596720645, + "grad_norm": 6.833982467651367, + "learning_rate": 9.223218563898966e-05, + "loss": 0.7874, + "step": 6425 + }, + { + "epoch": 0.4353953519886171, + "grad_norm": 7.278003692626953, + "learning_rate": 9.223081661989184e-05, + "loss": 1.0377, + "step": 6426 + }, + { + "epoch": 0.4354631072565892, + "grad_norm": 6.5219407081604, + "learning_rate": 9.222944760079404e-05, + "loss": 0.8975, + "step": 6427 + }, + { + "epoch": 0.4355308625245613, + "grad_norm": 15.948746681213379, + "learning_rate": 9.222807858169622e-05, + "loss": 1.1554, + "step": 6428 + }, + { + "epoch": 0.4355986177925334, + "grad_norm": 8.12380599975586, + "learning_rate": 9.22267095625984e-05, + "loss": 0.8078, + "step": 6429 + }, + { + "epoch": 0.43566637306050543, + "grad_norm": 6.6606340408325195, + "learning_rate": 9.222534054350058e-05, + "loss": 1.065, + "step": 6430 + }, + { + "epoch": 0.4357341283284775, + "grad_norm": 8.079832077026367, + "learning_rate": 9.222397152440277e-05, + "loss": 0.9253, + "step": 6431 + }, + { + "epoch": 0.4358018835964496, + "grad_norm": 6.005470275878906, + "learning_rate": 9.222260250530495e-05, + "loss": 0.8853, + "step": 6432 + }, + { + "epoch": 0.4358696388644217, + "grad_norm": 8.552530288696289, + "learning_rate": 9.222123348620713e-05, + "loss": 0.9477, + "step": 6433 + }, + { + "epoch": 0.4359373941323938, + "grad_norm": 6.014112949371338, + "learning_rate": 9.221986446710931e-05, + "loss": 0.9249, + "step": 6434 + }, + { + "epoch": 0.4360051494003659, + "grad_norm": 8.349777221679688, + "learning_rate": 9.22184954480115e-05, + "loss": 0.9278, + "step": 6435 + }, + { + "epoch": 0.436072904668338, + "grad_norm": 7.474494934082031, + "learning_rate": 9.221712642891369e-05, + "loss": 1.1279, + "step": 6436 + }, + { + "epoch": 0.43614065993631007, + "grad_norm": 7.27971076965332, + "learning_rate": 9.221575740981587e-05, + "loss": 0.8476, + "step": 6437 + }, + { + "epoch": 0.4362084152042821, + "grad_norm": 7.819595813751221, + "learning_rate": 9.221438839071806e-05, + "loss": 0.828, + "step": 6438 + }, + { + "epoch": 0.4362761704722542, + "grad_norm": 8.474798202514648, + "learning_rate": 9.221301937162024e-05, + "loss": 1.0775, + "step": 6439 + }, + { + "epoch": 0.4363439257402263, + "grad_norm": 7.175133228302002, + "learning_rate": 9.221165035252242e-05, + "loss": 0.8889, + "step": 6440 + }, + { + "epoch": 0.4364116810081984, + "grad_norm": 5.881869316101074, + "learning_rate": 9.221028133342461e-05, + "loss": 0.7435, + "step": 6441 + }, + { + "epoch": 0.4364794362761705, + "grad_norm": 8.366180419921875, + "learning_rate": 9.22089123143268e-05, + "loss": 1.1461, + "step": 6442 + }, + { + "epoch": 0.43654719154414257, + "grad_norm": 7.311695098876953, + "learning_rate": 9.220754329522898e-05, + "loss": 0.7601, + "step": 6443 + }, + { + "epoch": 0.43661494681211466, + "grad_norm": 6.886138439178467, + "learning_rate": 9.220617427613116e-05, + "loss": 0.9432, + "step": 6444 + }, + { + "epoch": 0.43668270208008675, + "grad_norm": 10.014945983886719, + "learning_rate": 9.220480525703335e-05, + "loss": 0.8914, + "step": 6445 + }, + { + "epoch": 0.4367504573480588, + "grad_norm": 6.025598526000977, + "learning_rate": 9.220343623793553e-05, + "loss": 0.8601, + "step": 6446 + }, + { + "epoch": 0.4368182126160309, + "grad_norm": 6.072788715362549, + "learning_rate": 9.220206721883771e-05, + "loss": 0.7345, + "step": 6447 + }, + { + "epoch": 0.43688596788400297, + "grad_norm": 8.272831916809082, + "learning_rate": 9.220069819973989e-05, + "loss": 0.7427, + "step": 6448 + }, + { + "epoch": 0.43695372315197506, + "grad_norm": 6.721330642700195, + "learning_rate": 9.219932918064207e-05, + "loss": 0.8294, + "step": 6449 + }, + { + "epoch": 0.43702147841994715, + "grad_norm": 7.108659267425537, + "learning_rate": 9.219796016154426e-05, + "loss": 0.7635, + "step": 6450 + }, + { + "epoch": 0.43708923368791924, + "grad_norm": 11.399370193481445, + "learning_rate": 9.219659114244645e-05, + "loss": 1.0645, + "step": 6451 + }, + { + "epoch": 0.43715698895589133, + "grad_norm": 7.976221561431885, + "learning_rate": 9.219522212334863e-05, + "loss": 0.9437, + "step": 6452 + }, + { + "epoch": 0.4372247442238634, + "grad_norm": 8.320938110351562, + "learning_rate": 9.21938531042508e-05, + "loss": 0.8029, + "step": 6453 + }, + { + "epoch": 0.4372924994918355, + "grad_norm": 6.146340370178223, + "learning_rate": 9.2192484085153e-05, + "loss": 0.7872, + "step": 6454 + }, + { + "epoch": 0.43736025475980755, + "grad_norm": 6.408365726470947, + "learning_rate": 9.219111506605518e-05, + "loss": 0.5922, + "step": 6455 + }, + { + "epoch": 0.43742801002777965, + "grad_norm": 7.011279106140137, + "learning_rate": 9.218974604695736e-05, + "loss": 0.9152, + "step": 6456 + }, + { + "epoch": 0.43749576529575174, + "grad_norm": 6.819314002990723, + "learning_rate": 9.218837702785954e-05, + "loss": 1.002, + "step": 6457 + }, + { + "epoch": 0.43756352056372383, + "grad_norm": 10.63784122467041, + "learning_rate": 9.218700800876172e-05, + "loss": 1.0873, + "step": 6458 + }, + { + "epoch": 0.4376312758316959, + "grad_norm": 6.964840888977051, + "learning_rate": 9.218563898966391e-05, + "loss": 0.6972, + "step": 6459 + }, + { + "epoch": 0.437699031099668, + "grad_norm": 8.421072959899902, + "learning_rate": 9.21842699705661e-05, + "loss": 1.0543, + "step": 6460 + }, + { + "epoch": 0.4377667863676401, + "grad_norm": 5.883727073669434, + "learning_rate": 9.218290095146828e-05, + "loss": 0.8138, + "step": 6461 + }, + { + "epoch": 0.4378345416356122, + "grad_norm": 8.054207801818848, + "learning_rate": 9.218153193237046e-05, + "loss": 0.8885, + "step": 6462 + }, + { + "epoch": 0.43790229690358423, + "grad_norm": 7.546504974365234, + "learning_rate": 9.218016291327265e-05, + "loss": 1.2157, + "step": 6463 + }, + { + "epoch": 0.4379700521715563, + "grad_norm": 10.255965232849121, + "learning_rate": 9.217879389417483e-05, + "loss": 1.0082, + "step": 6464 + }, + { + "epoch": 0.4380378074395284, + "grad_norm": 6.057038307189941, + "learning_rate": 9.217742487507701e-05, + "loss": 0.8386, + "step": 6465 + }, + { + "epoch": 0.4381055627075005, + "grad_norm": 8.343341827392578, + "learning_rate": 9.217605585597919e-05, + "loss": 0.7051, + "step": 6466 + }, + { + "epoch": 0.4381733179754726, + "grad_norm": 8.163117408752441, + "learning_rate": 9.217468683688137e-05, + "loss": 1.1442, + "step": 6467 + }, + { + "epoch": 0.4382410732434447, + "grad_norm": 6.132575035095215, + "learning_rate": 9.217331781778357e-05, + "loss": 0.7034, + "step": 6468 + }, + { + "epoch": 0.4383088285114168, + "grad_norm": 6.2581658363342285, + "learning_rate": 9.217194879868575e-05, + "loss": 0.919, + "step": 6469 + }, + { + "epoch": 0.4383765837793889, + "grad_norm": 6.228326797485352, + "learning_rate": 9.217057977958793e-05, + "loss": 0.7785, + "step": 6470 + }, + { + "epoch": 0.4384443390473609, + "grad_norm": 5.782427787780762, + "learning_rate": 9.21692107604901e-05, + "loss": 0.9185, + "step": 6471 + }, + { + "epoch": 0.438512094315333, + "grad_norm": 5.726394176483154, + "learning_rate": 9.21678417413923e-05, + "loss": 0.593, + "step": 6472 + }, + { + "epoch": 0.4385798495833051, + "grad_norm": 7.62056303024292, + "learning_rate": 9.216647272229448e-05, + "loss": 0.8947, + "step": 6473 + }, + { + "epoch": 0.4386476048512772, + "grad_norm": 7.467504024505615, + "learning_rate": 9.216510370319666e-05, + "loss": 0.8738, + "step": 6474 + }, + { + "epoch": 0.4387153601192493, + "grad_norm": 6.0644989013671875, + "learning_rate": 9.216373468409884e-05, + "loss": 0.7715, + "step": 6475 + }, + { + "epoch": 0.43878311538722137, + "grad_norm": 7.380848407745361, + "learning_rate": 9.216236566500102e-05, + "loss": 0.8055, + "step": 6476 + }, + { + "epoch": 0.43885087065519346, + "grad_norm": 7.162757873535156, + "learning_rate": 9.216099664590322e-05, + "loss": 0.8826, + "step": 6477 + }, + { + "epoch": 0.43891862592316555, + "grad_norm": 8.315613746643066, + "learning_rate": 9.21596276268054e-05, + "loss": 1.1239, + "step": 6478 + }, + { + "epoch": 0.4389863811911376, + "grad_norm": 6.754839897155762, + "learning_rate": 9.215825860770758e-05, + "loss": 0.8331, + "step": 6479 + }, + { + "epoch": 0.4390541364591097, + "grad_norm": 5.4843831062316895, + "learning_rate": 9.215688958860976e-05, + "loss": 0.8134, + "step": 6480 + }, + { + "epoch": 0.43912189172708177, + "grad_norm": 7.7780656814575195, + "learning_rate": 9.215552056951195e-05, + "loss": 1.0216, + "step": 6481 + }, + { + "epoch": 0.43918964699505386, + "grad_norm": 7.779257297515869, + "learning_rate": 9.215415155041413e-05, + "loss": 0.91, + "step": 6482 + }, + { + "epoch": 0.43925740226302595, + "grad_norm": 7.926174640655518, + "learning_rate": 9.215278253131631e-05, + "loss": 0.9635, + "step": 6483 + }, + { + "epoch": 0.43932515753099805, + "grad_norm": 5.595751762390137, + "learning_rate": 9.21514135122185e-05, + "loss": 0.9092, + "step": 6484 + }, + { + "epoch": 0.43939291279897014, + "grad_norm": 8.04053020477295, + "learning_rate": 9.215004449312069e-05, + "loss": 0.6872, + "step": 6485 + }, + { + "epoch": 0.43946066806694223, + "grad_norm": 12.462115287780762, + "learning_rate": 9.214867547402287e-05, + "loss": 1.2741, + "step": 6486 + }, + { + "epoch": 0.43952842333491426, + "grad_norm": 6.239933490753174, + "learning_rate": 9.214730645492506e-05, + "loss": 0.7711, + "step": 6487 + }, + { + "epoch": 0.43959617860288636, + "grad_norm": 7.584579944610596, + "learning_rate": 9.214593743582724e-05, + "loss": 0.8666, + "step": 6488 + }, + { + "epoch": 0.43966393387085845, + "grad_norm": 7.129024505615234, + "learning_rate": 9.214456841672942e-05, + "loss": 0.8664, + "step": 6489 + }, + { + "epoch": 0.43973168913883054, + "grad_norm": 7.35465145111084, + "learning_rate": 9.21431993976316e-05, + "loss": 1.2072, + "step": 6490 + }, + { + "epoch": 0.43979944440680263, + "grad_norm": 7.911463737487793, + "learning_rate": 9.21418303785338e-05, + "loss": 0.8019, + "step": 6491 + }, + { + "epoch": 0.4398671996747747, + "grad_norm": 7.9478349685668945, + "learning_rate": 9.214046135943597e-05, + "loss": 0.8945, + "step": 6492 + }, + { + "epoch": 0.4399349549427468, + "grad_norm": 7.73642635345459, + "learning_rate": 9.213909234033815e-05, + "loss": 1.2203, + "step": 6493 + }, + { + "epoch": 0.4400027102107189, + "grad_norm": 8.306556701660156, + "learning_rate": 9.213772332124034e-05, + "loss": 0.8548, + "step": 6494 + }, + { + "epoch": 0.44007046547869094, + "grad_norm": 7.496469497680664, + "learning_rate": 9.213635430214253e-05, + "loss": 1.1485, + "step": 6495 + }, + { + "epoch": 0.44013822074666303, + "grad_norm": 6.734534740447998, + "learning_rate": 9.213498528304471e-05, + "loss": 0.8758, + "step": 6496 + }, + { + "epoch": 0.4402059760146351, + "grad_norm": 6.782371997833252, + "learning_rate": 9.213361626394689e-05, + "loss": 0.5879, + "step": 6497 + }, + { + "epoch": 0.4402737312826072, + "grad_norm": 6.044846057891846, + "learning_rate": 9.213224724484907e-05, + "loss": 1.0055, + "step": 6498 + }, + { + "epoch": 0.4403414865505793, + "grad_norm": 7.979533672332764, + "learning_rate": 9.213087822575125e-05, + "loss": 0.8265, + "step": 6499 + }, + { + "epoch": 0.4404092418185514, + "grad_norm": 8.245573997497559, + "learning_rate": 9.212950920665344e-05, + "loss": 0.5549, + "step": 6500 + }, + { + "epoch": 0.4404769970865235, + "grad_norm": 6.346557140350342, + "learning_rate": 9.212814018755562e-05, + "loss": 0.857, + "step": 6501 + }, + { + "epoch": 0.4405447523544956, + "grad_norm": 6.079600811004639, + "learning_rate": 9.21267711684578e-05, + "loss": 0.9127, + "step": 6502 + }, + { + "epoch": 0.4406125076224676, + "grad_norm": 9.402070045471191, + "learning_rate": 9.212540214935999e-05, + "loss": 0.8104, + "step": 6503 + }, + { + "epoch": 0.4406802628904397, + "grad_norm": 8.062244415283203, + "learning_rate": 9.212403313026217e-05, + "loss": 1.195, + "step": 6504 + }, + { + "epoch": 0.4407480181584118, + "grad_norm": 8.54050064086914, + "learning_rate": 9.212266411116436e-05, + "loss": 0.854, + "step": 6505 + }, + { + "epoch": 0.4408157734263839, + "grad_norm": 6.040203094482422, + "learning_rate": 9.212129509206654e-05, + "loss": 0.7028, + "step": 6506 + }, + { + "epoch": 0.440883528694356, + "grad_norm": 7.689701080322266, + "learning_rate": 9.211992607296872e-05, + "loss": 0.9999, + "step": 6507 + }, + { + "epoch": 0.4409512839623281, + "grad_norm": 7.860703468322754, + "learning_rate": 9.21185570538709e-05, + "loss": 1.0413, + "step": 6508 + }, + { + "epoch": 0.44101903923030017, + "grad_norm": 7.390681266784668, + "learning_rate": 9.21171880347731e-05, + "loss": 1.0337, + "step": 6509 + }, + { + "epoch": 0.44108679449827226, + "grad_norm": 6.800081253051758, + "learning_rate": 9.211581901567527e-05, + "loss": 0.9613, + "step": 6510 + }, + { + "epoch": 0.44115454976624435, + "grad_norm": 5.59462833404541, + "learning_rate": 9.211444999657746e-05, + "loss": 0.7001, + "step": 6511 + }, + { + "epoch": 0.4412223050342164, + "grad_norm": 8.300586700439453, + "learning_rate": 9.211308097747964e-05, + "loss": 1.0087, + "step": 6512 + }, + { + "epoch": 0.4412900603021885, + "grad_norm": 6.590997695922852, + "learning_rate": 9.211171195838182e-05, + "loss": 0.8233, + "step": 6513 + }, + { + "epoch": 0.44135781557016057, + "grad_norm": 8.995779991149902, + "learning_rate": 9.211034293928401e-05, + "loss": 0.9385, + "step": 6514 + }, + { + "epoch": 0.44142557083813266, + "grad_norm": 6.689282417297363, + "learning_rate": 9.210897392018619e-05, + "loss": 0.9455, + "step": 6515 + }, + { + "epoch": 0.44149332610610476, + "grad_norm": 6.494836330413818, + "learning_rate": 9.210760490108837e-05, + "loss": 0.9009, + "step": 6516 + }, + { + "epoch": 0.44156108137407685, + "grad_norm": 7.502823352813721, + "learning_rate": 9.210623588199055e-05, + "loss": 0.7972, + "step": 6517 + }, + { + "epoch": 0.44162883664204894, + "grad_norm": 6.561639785766602, + "learning_rate": 9.210486686289274e-05, + "loss": 0.7988, + "step": 6518 + }, + { + "epoch": 0.44169659191002103, + "grad_norm": 6.1523847579956055, + "learning_rate": 9.210349784379493e-05, + "loss": 0.9561, + "step": 6519 + }, + { + "epoch": 0.44176434717799307, + "grad_norm": 6.792056560516357, + "learning_rate": 9.21021288246971e-05, + "loss": 0.7623, + "step": 6520 + }, + { + "epoch": 0.44183210244596516, + "grad_norm": 6.477465629577637, + "learning_rate": 9.210075980559929e-05, + "loss": 0.6875, + "step": 6521 + }, + { + "epoch": 0.44189985771393725, + "grad_norm": 7.468865394592285, + "learning_rate": 9.209939078650147e-05, + "loss": 0.8727, + "step": 6522 + }, + { + "epoch": 0.44196761298190934, + "grad_norm": 5.351073265075684, + "learning_rate": 9.209802176740366e-05, + "loss": 0.8114, + "step": 6523 + }, + { + "epoch": 0.44203536824988143, + "grad_norm": 7.701816558837891, + "learning_rate": 9.209665274830584e-05, + "loss": 1.1438, + "step": 6524 + }, + { + "epoch": 0.4421031235178535, + "grad_norm": 6.072995662689209, + "learning_rate": 9.209528372920802e-05, + "loss": 0.9237, + "step": 6525 + }, + { + "epoch": 0.4421708787858256, + "grad_norm": 7.1209716796875, + "learning_rate": 9.20939147101102e-05, + "loss": 0.8445, + "step": 6526 + }, + { + "epoch": 0.4422386340537977, + "grad_norm": 7.518238067626953, + "learning_rate": 9.20925456910124e-05, + "loss": 1.0969, + "step": 6527 + }, + { + "epoch": 0.44230638932176974, + "grad_norm": 5.822054862976074, + "learning_rate": 9.209117667191458e-05, + "loss": 0.843, + "step": 6528 + }, + { + "epoch": 0.44237414458974184, + "grad_norm": 6.8769612312316895, + "learning_rate": 9.208980765281676e-05, + "loss": 0.9661, + "step": 6529 + }, + { + "epoch": 0.4424418998577139, + "grad_norm": 6.266312122344971, + "learning_rate": 9.208843863371895e-05, + "loss": 0.7924, + "step": 6530 + }, + { + "epoch": 0.442509655125686, + "grad_norm": 6.57719612121582, + "learning_rate": 9.208706961462113e-05, + "loss": 0.7929, + "step": 6531 + }, + { + "epoch": 0.4425774103936581, + "grad_norm": 6.038028240203857, + "learning_rate": 9.208570059552331e-05, + "loss": 0.7683, + "step": 6532 + }, + { + "epoch": 0.4426451656616302, + "grad_norm": 7.042256832122803, + "learning_rate": 9.20843315764255e-05, + "loss": 0.6646, + "step": 6533 + }, + { + "epoch": 0.4427129209296023, + "grad_norm": 7.218042850494385, + "learning_rate": 9.208296255732768e-05, + "loss": 0.9532, + "step": 6534 + }, + { + "epoch": 0.4427806761975744, + "grad_norm": 8.851286888122559, + "learning_rate": 9.208159353822986e-05, + "loss": 0.819, + "step": 6535 + }, + { + "epoch": 0.4428484314655464, + "grad_norm": 5.37313985824585, + "learning_rate": 9.208022451913205e-05, + "loss": 0.6068, + "step": 6536 + }, + { + "epoch": 0.4429161867335185, + "grad_norm": 6.643946647644043, + "learning_rate": 9.207885550003424e-05, + "loss": 1.1088, + "step": 6537 + }, + { + "epoch": 0.4429839420014906, + "grad_norm": 7.155415058135986, + "learning_rate": 9.207748648093642e-05, + "loss": 0.9693, + "step": 6538 + }, + { + "epoch": 0.4430516972694627, + "grad_norm": 7.293460369110107, + "learning_rate": 9.20761174618386e-05, + "loss": 1.0701, + "step": 6539 + }, + { + "epoch": 0.4431194525374348, + "grad_norm": 6.664429664611816, + "learning_rate": 9.207474844274078e-05, + "loss": 0.6425, + "step": 6540 + }, + { + "epoch": 0.4431872078054069, + "grad_norm": 5.979015350341797, + "learning_rate": 9.207337942364297e-05, + "loss": 0.6666, + "step": 6541 + }, + { + "epoch": 0.44325496307337897, + "grad_norm": 5.959255695343018, + "learning_rate": 9.207201040454515e-05, + "loss": 0.6914, + "step": 6542 + }, + { + "epoch": 0.44332271834135106, + "grad_norm": 7.694621562957764, + "learning_rate": 9.207064138544733e-05, + "loss": 0.8844, + "step": 6543 + }, + { + "epoch": 0.4433904736093231, + "grad_norm": 7.128264904022217, + "learning_rate": 9.206927236634951e-05, + "loss": 0.6571, + "step": 6544 + }, + { + "epoch": 0.4434582288772952, + "grad_norm": 7.676843166351318, + "learning_rate": 9.20679033472517e-05, + "loss": 0.8944, + "step": 6545 + }, + { + "epoch": 0.4435259841452673, + "grad_norm": 8.858941078186035, + "learning_rate": 9.206653432815389e-05, + "loss": 0.8057, + "step": 6546 + }, + { + "epoch": 0.4435937394132394, + "grad_norm": 7.0214033126831055, + "learning_rate": 9.206516530905607e-05, + "loss": 0.7815, + "step": 6547 + }, + { + "epoch": 0.44366149468121147, + "grad_norm": 5.209787845611572, + "learning_rate": 9.206379628995825e-05, + "loss": 0.8764, + "step": 6548 + }, + { + "epoch": 0.44372924994918356, + "grad_norm": 7.076624870300293, + "learning_rate": 9.206242727086043e-05, + "loss": 0.8881, + "step": 6549 + }, + { + "epoch": 0.44379700521715565, + "grad_norm": 7.659173965454102, + "learning_rate": 9.206105825176262e-05, + "loss": 0.8945, + "step": 6550 + }, + { + "epoch": 0.44386476048512774, + "grad_norm": 6.2118611335754395, + "learning_rate": 9.20596892326648e-05, + "loss": 0.8022, + "step": 6551 + }, + { + "epoch": 0.4439325157530998, + "grad_norm": 6.649642467498779, + "learning_rate": 9.205832021356698e-05, + "loss": 0.9765, + "step": 6552 + }, + { + "epoch": 0.44400027102107187, + "grad_norm": 6.0131683349609375, + "learning_rate": 9.205695119446917e-05, + "loss": 0.6928, + "step": 6553 + }, + { + "epoch": 0.44406802628904396, + "grad_norm": 6.295266151428223, + "learning_rate": 9.205558217537135e-05, + "loss": 0.9785, + "step": 6554 + }, + { + "epoch": 0.44413578155701605, + "grad_norm": 6.524127006530762, + "learning_rate": 9.205421315627354e-05, + "loss": 0.8065, + "step": 6555 + }, + { + "epoch": 0.44420353682498814, + "grad_norm": 7.478303909301758, + "learning_rate": 9.205284413717572e-05, + "loss": 0.9533, + "step": 6556 + }, + { + "epoch": 0.44427129209296023, + "grad_norm": 7.92426061630249, + "learning_rate": 9.20514751180779e-05, + "loss": 1.1578, + "step": 6557 + }, + { + "epoch": 0.4443390473609323, + "grad_norm": 7.532670497894287, + "learning_rate": 9.205010609898008e-05, + "loss": 0.9318, + "step": 6558 + }, + { + "epoch": 0.4444068026289044, + "grad_norm": 8.458464622497559, + "learning_rate": 9.204873707988226e-05, + "loss": 1.087, + "step": 6559 + }, + { + "epoch": 0.4444745578968765, + "grad_norm": 5.044344425201416, + "learning_rate": 9.204736806078445e-05, + "loss": 0.7836, + "step": 6560 + }, + { + "epoch": 0.44454231316484855, + "grad_norm": 6.682579517364502, + "learning_rate": 9.204599904168663e-05, + "loss": 0.8407, + "step": 6561 + }, + { + "epoch": 0.44461006843282064, + "grad_norm": 6.948788166046143, + "learning_rate": 9.204463002258882e-05, + "loss": 0.72, + "step": 6562 + }, + { + "epoch": 0.44467782370079273, + "grad_norm": 7.333683967590332, + "learning_rate": 9.2043261003491e-05, + "loss": 0.934, + "step": 6563 + }, + { + "epoch": 0.4447455789687648, + "grad_norm": 7.648477077484131, + "learning_rate": 9.204189198439319e-05, + "loss": 0.8992, + "step": 6564 + }, + { + "epoch": 0.4448133342367369, + "grad_norm": 6.203789710998535, + "learning_rate": 9.204052296529537e-05, + "loss": 1.011, + "step": 6565 + }, + { + "epoch": 0.444881089504709, + "grad_norm": 7.739291667938232, + "learning_rate": 9.203915394619755e-05, + "loss": 0.8695, + "step": 6566 + }, + { + "epoch": 0.4449488447726811, + "grad_norm": 6.642831325531006, + "learning_rate": 9.203778492709973e-05, + "loss": 0.666, + "step": 6567 + }, + { + "epoch": 0.4450166000406532, + "grad_norm": 7.760629653930664, + "learning_rate": 9.203641590800191e-05, + "loss": 1.1022, + "step": 6568 + }, + { + "epoch": 0.4450843553086252, + "grad_norm": 6.180607318878174, + "learning_rate": 9.20350468889041e-05, + "loss": 0.8599, + "step": 6569 + }, + { + "epoch": 0.4451521105765973, + "grad_norm": 6.284255504608154, + "learning_rate": 9.203367786980629e-05, + "loss": 0.7226, + "step": 6570 + }, + { + "epoch": 0.4452198658445694, + "grad_norm": 5.078493595123291, + "learning_rate": 9.203230885070847e-05, + "loss": 0.8089, + "step": 6571 + }, + { + "epoch": 0.4452876211125415, + "grad_norm": 6.533141136169434, + "learning_rate": 9.203093983161065e-05, + "loss": 0.8426, + "step": 6572 + }, + { + "epoch": 0.4453553763805136, + "grad_norm": 6.589559555053711, + "learning_rate": 9.202957081251284e-05, + "loss": 1.0924, + "step": 6573 + }, + { + "epoch": 0.4454231316484857, + "grad_norm": 5.858087539672852, + "learning_rate": 9.202820179341502e-05, + "loss": 0.8007, + "step": 6574 + }, + { + "epoch": 0.4454908869164578, + "grad_norm": 6.8898491859436035, + "learning_rate": 9.20268327743172e-05, + "loss": 0.7622, + "step": 6575 + }, + { + "epoch": 0.44555864218442987, + "grad_norm": 5.419859409332275, + "learning_rate": 9.20254637552194e-05, + "loss": 0.711, + "step": 6576 + }, + { + "epoch": 0.4456263974524019, + "grad_norm": 7.31076192855835, + "learning_rate": 9.202409473612157e-05, + "loss": 1.0181, + "step": 6577 + }, + { + "epoch": 0.445694152720374, + "grad_norm": 8.525489807128906, + "learning_rate": 9.202272571702375e-05, + "loss": 1.0324, + "step": 6578 + }, + { + "epoch": 0.4457619079883461, + "grad_norm": 6.309749603271484, + "learning_rate": 9.202135669792595e-05, + "loss": 1.1596, + "step": 6579 + }, + { + "epoch": 0.4458296632563182, + "grad_norm": 5.667306423187256, + "learning_rate": 9.201998767882813e-05, + "loss": 0.8091, + "step": 6580 + }, + { + "epoch": 0.44589741852429027, + "grad_norm": 6.930799961090088, + "learning_rate": 9.201861865973031e-05, + "loss": 0.915, + "step": 6581 + }, + { + "epoch": 0.44596517379226236, + "grad_norm": 8.236886024475098, + "learning_rate": 9.201724964063249e-05, + "loss": 0.8973, + "step": 6582 + }, + { + "epoch": 0.44603292906023445, + "grad_norm": 8.804180145263672, + "learning_rate": 9.201588062153468e-05, + "loss": 0.9985, + "step": 6583 + }, + { + "epoch": 0.44610068432820654, + "grad_norm": 6.966750621795654, + "learning_rate": 9.201451160243686e-05, + "loss": 0.9065, + "step": 6584 + }, + { + "epoch": 0.4461684395961786, + "grad_norm": 6.3808698654174805, + "learning_rate": 9.201314258333904e-05, + "loss": 0.8519, + "step": 6585 + }, + { + "epoch": 0.44623619486415067, + "grad_norm": 6.380456447601318, + "learning_rate": 9.201177356424122e-05, + "loss": 0.6365, + "step": 6586 + }, + { + "epoch": 0.44630395013212276, + "grad_norm": 6.893950939178467, + "learning_rate": 9.201040454514342e-05, + "loss": 0.8017, + "step": 6587 + }, + { + "epoch": 0.44637170540009485, + "grad_norm": 6.246983051300049, + "learning_rate": 9.20090355260456e-05, + "loss": 0.8914, + "step": 6588 + }, + { + "epoch": 0.44643946066806695, + "grad_norm": 5.012611389160156, + "learning_rate": 9.200766650694778e-05, + "loss": 0.7421, + "step": 6589 + }, + { + "epoch": 0.44650721593603904, + "grad_norm": 6.981064319610596, + "learning_rate": 9.200629748784996e-05, + "loss": 0.9045, + "step": 6590 + }, + { + "epoch": 0.44657497120401113, + "grad_norm": 6.434818744659424, + "learning_rate": 9.200492846875214e-05, + "loss": 1.0952, + "step": 6591 + }, + { + "epoch": 0.4466427264719832, + "grad_norm": 7.2661027908325195, + "learning_rate": 9.200355944965433e-05, + "loss": 0.8943, + "step": 6592 + }, + { + "epoch": 0.44671048173995526, + "grad_norm": 6.347513675689697, + "learning_rate": 9.200219043055651e-05, + "loss": 0.7958, + "step": 6593 + }, + { + "epoch": 0.44677823700792735, + "grad_norm": 6.98016881942749, + "learning_rate": 9.20008214114587e-05, + "loss": 0.7197, + "step": 6594 + }, + { + "epoch": 0.44684599227589944, + "grad_norm": 5.671807765960693, + "learning_rate": 9.199945239236087e-05, + "loss": 0.7577, + "step": 6595 + }, + { + "epoch": 0.44691374754387153, + "grad_norm": 8.79588794708252, + "learning_rate": 9.199808337326307e-05, + "loss": 0.9273, + "step": 6596 + }, + { + "epoch": 0.4469815028118436, + "grad_norm": 7.276648998260498, + "learning_rate": 9.199671435416525e-05, + "loss": 0.8705, + "step": 6597 + }, + { + "epoch": 0.4470492580798157, + "grad_norm": 8.435094833374023, + "learning_rate": 9.199534533506743e-05, + "loss": 0.9946, + "step": 6598 + }, + { + "epoch": 0.4471170133477878, + "grad_norm": 6.148091793060303, + "learning_rate": 9.199397631596961e-05, + "loss": 0.9541, + "step": 6599 + }, + { + "epoch": 0.4471847686157599, + "grad_norm": 8.867514610290527, + "learning_rate": 9.199260729687179e-05, + "loss": 1.029, + "step": 6600 + }, + { + "epoch": 0.44725252388373193, + "grad_norm": 7.480152606964111, + "learning_rate": 9.199123827777398e-05, + "loss": 0.8819, + "step": 6601 + }, + { + "epoch": 0.447320279151704, + "grad_norm": 7.422079086303711, + "learning_rate": 9.198986925867616e-05, + "loss": 0.9998, + "step": 6602 + }, + { + "epoch": 0.4473880344196761, + "grad_norm": 6.239773750305176, + "learning_rate": 9.198850023957834e-05, + "loss": 0.9129, + "step": 6603 + }, + { + "epoch": 0.4474557896876482, + "grad_norm": 6.59074592590332, + "learning_rate": 9.198713122048053e-05, + "loss": 1.1373, + "step": 6604 + }, + { + "epoch": 0.4475235449556203, + "grad_norm": 6.096022129058838, + "learning_rate": 9.198576220138272e-05, + "loss": 0.8916, + "step": 6605 + }, + { + "epoch": 0.4475913002235924, + "grad_norm": 5.775374889373779, + "learning_rate": 9.19843931822849e-05, + "loss": 0.8679, + "step": 6606 + }, + { + "epoch": 0.4476590554915645, + "grad_norm": 6.836724758148193, + "learning_rate": 9.198302416318708e-05, + "loss": 0.8205, + "step": 6607 + }, + { + "epoch": 0.4477268107595366, + "grad_norm": 6.876745700836182, + "learning_rate": 9.198165514408926e-05, + "loss": 0.9564, + "step": 6608 + }, + { + "epoch": 0.4477945660275086, + "grad_norm": 6.725889205932617, + "learning_rate": 9.198028612499144e-05, + "loss": 0.7783, + "step": 6609 + }, + { + "epoch": 0.4478623212954807, + "grad_norm": 6.312922477722168, + "learning_rate": 9.197891710589363e-05, + "loss": 0.8358, + "step": 6610 + }, + { + "epoch": 0.4479300765634528, + "grad_norm": 5.7454071044921875, + "learning_rate": 9.197754808679581e-05, + "loss": 0.7174, + "step": 6611 + }, + { + "epoch": 0.4479978318314249, + "grad_norm": 9.245071411132812, + "learning_rate": 9.1976179067698e-05, + "loss": 0.8224, + "step": 6612 + }, + { + "epoch": 0.448065587099397, + "grad_norm": 7.2646307945251465, + "learning_rate": 9.197481004860018e-05, + "loss": 0.7895, + "step": 6613 + }, + { + "epoch": 0.44813334236736907, + "grad_norm": 6.799703598022461, + "learning_rate": 9.197344102950236e-05, + "loss": 0.9685, + "step": 6614 + }, + { + "epoch": 0.44820109763534116, + "grad_norm": 6.408104419708252, + "learning_rate": 9.197207201040455e-05, + "loss": 0.9136, + "step": 6615 + }, + { + "epoch": 0.44826885290331325, + "grad_norm": 7.4527363777160645, + "learning_rate": 9.197070299130673e-05, + "loss": 0.9364, + "step": 6616 + }, + { + "epoch": 0.44833660817128534, + "grad_norm": 6.287598609924316, + "learning_rate": 9.196933397220891e-05, + "loss": 0.8987, + "step": 6617 + }, + { + "epoch": 0.4484043634392574, + "grad_norm": 7.270476341247559, + "learning_rate": 9.196796495311109e-05, + "loss": 1.0113, + "step": 6618 + }, + { + "epoch": 0.4484721187072295, + "grad_norm": 8.098075866699219, + "learning_rate": 9.196659593401328e-05, + "loss": 0.7957, + "step": 6619 + }, + { + "epoch": 0.44853987397520156, + "grad_norm": 6.268966197967529, + "learning_rate": 9.196522691491546e-05, + "loss": 0.7363, + "step": 6620 + }, + { + "epoch": 0.44860762924317366, + "grad_norm": 8.81617259979248, + "learning_rate": 9.196385789581765e-05, + "loss": 1.1112, + "step": 6621 + }, + { + "epoch": 0.44867538451114575, + "grad_norm": 7.593179225921631, + "learning_rate": 9.196248887671984e-05, + "loss": 0.7988, + "step": 6622 + }, + { + "epoch": 0.44874313977911784, + "grad_norm": 7.044666290283203, + "learning_rate": 9.196111985762202e-05, + "loss": 0.832, + "step": 6623 + }, + { + "epoch": 0.44881089504708993, + "grad_norm": 5.546746730804443, + "learning_rate": 9.19597508385242e-05, + "loss": 0.7273, + "step": 6624 + }, + { + "epoch": 0.448878650315062, + "grad_norm": 7.123478412628174, + "learning_rate": 9.19583818194264e-05, + "loss": 0.9006, + "step": 6625 + }, + { + "epoch": 0.44894640558303406, + "grad_norm": 9.073535919189453, + "learning_rate": 9.195701280032857e-05, + "loss": 0.8702, + "step": 6626 + }, + { + "epoch": 0.44901416085100615, + "grad_norm": 7.320106506347656, + "learning_rate": 9.195564378123075e-05, + "loss": 0.7704, + "step": 6627 + }, + { + "epoch": 0.44908191611897824, + "grad_norm": 7.6163177490234375, + "learning_rate": 9.195427476213295e-05, + "loss": 1.1432, + "step": 6628 + }, + { + "epoch": 0.44914967138695033, + "grad_norm": 6.271851062774658, + "learning_rate": 9.195290574303513e-05, + "loss": 0.9378, + "step": 6629 + }, + { + "epoch": 0.4492174266549224, + "grad_norm": 6.142657279968262, + "learning_rate": 9.195153672393731e-05, + "loss": 0.9356, + "step": 6630 + }, + { + "epoch": 0.4492851819228945, + "grad_norm": 6.669857025146484, + "learning_rate": 9.195016770483949e-05, + "loss": 0.5939, + "step": 6631 + }, + { + "epoch": 0.4493529371908666, + "grad_norm": 7.524440288543701, + "learning_rate": 9.194879868574167e-05, + "loss": 0.7756, + "step": 6632 + }, + { + "epoch": 0.4494206924588387, + "grad_norm": 7.47619104385376, + "learning_rate": 9.194742966664386e-05, + "loss": 0.7881, + "step": 6633 + }, + { + "epoch": 0.44948844772681074, + "grad_norm": 5.466754913330078, + "learning_rate": 9.194606064754604e-05, + "loss": 0.7653, + "step": 6634 + }, + { + "epoch": 0.4495562029947828, + "grad_norm": 6.149755954742432, + "learning_rate": 9.194469162844822e-05, + "loss": 0.8208, + "step": 6635 + }, + { + "epoch": 0.4496239582627549, + "grad_norm": 6.091528415679932, + "learning_rate": 9.19433226093504e-05, + "loss": 1.0214, + "step": 6636 + }, + { + "epoch": 0.449691713530727, + "grad_norm": 5.599562168121338, + "learning_rate": 9.194195359025258e-05, + "loss": 1.0719, + "step": 6637 + }, + { + "epoch": 0.4497594687986991, + "grad_norm": 7.242834091186523, + "learning_rate": 9.194058457115478e-05, + "loss": 1.0185, + "step": 6638 + }, + { + "epoch": 0.4498272240666712, + "grad_norm": 6.523613452911377, + "learning_rate": 9.193921555205696e-05, + "loss": 0.9699, + "step": 6639 + }, + { + "epoch": 0.4498949793346433, + "grad_norm": 6.815830230712891, + "learning_rate": 9.193784653295914e-05, + "loss": 0.814, + "step": 6640 + }, + { + "epoch": 0.4499627346026154, + "grad_norm": 5.4793524742126465, + "learning_rate": 9.193647751386132e-05, + "loss": 0.724, + "step": 6641 + }, + { + "epoch": 0.4500304898705874, + "grad_norm": 7.303586959838867, + "learning_rate": 9.193510849476351e-05, + "loss": 0.9991, + "step": 6642 + }, + { + "epoch": 0.4500982451385595, + "grad_norm": 7.781108856201172, + "learning_rate": 9.19337394756657e-05, + "loss": 0.925, + "step": 6643 + }, + { + "epoch": 0.4501660004065316, + "grad_norm": 5.924014568328857, + "learning_rate": 9.193237045656787e-05, + "loss": 0.8656, + "step": 6644 + }, + { + "epoch": 0.4502337556745037, + "grad_norm": 8.77978801727295, + "learning_rate": 9.193100143747005e-05, + "loss": 0.6839, + "step": 6645 + }, + { + "epoch": 0.4503015109424758, + "grad_norm": 6.725009441375732, + "learning_rate": 9.192963241837223e-05, + "loss": 1.09, + "step": 6646 + }, + { + "epoch": 0.45036926621044787, + "grad_norm": 5.697434902191162, + "learning_rate": 9.192826339927443e-05, + "loss": 0.6516, + "step": 6647 + }, + { + "epoch": 0.45043702147841996, + "grad_norm": 7.783257007598877, + "learning_rate": 9.192689438017661e-05, + "loss": 1.1097, + "step": 6648 + }, + { + "epoch": 0.45050477674639205, + "grad_norm": 7.0213541984558105, + "learning_rate": 9.192552536107879e-05, + "loss": 1.0617, + "step": 6649 + }, + { + "epoch": 0.4505725320143641, + "grad_norm": 9.152633666992188, + "learning_rate": 9.192415634198097e-05, + "loss": 0.9626, + "step": 6650 + }, + { + "epoch": 0.4506402872823362, + "grad_norm": 8.690450668334961, + "learning_rate": 9.192278732288316e-05, + "loss": 0.7741, + "step": 6651 + }, + { + "epoch": 0.4507080425503083, + "grad_norm": 6.784976482391357, + "learning_rate": 9.192141830378534e-05, + "loss": 1.092, + "step": 6652 + }, + { + "epoch": 0.45077579781828037, + "grad_norm": 6.294474124908447, + "learning_rate": 9.192004928468752e-05, + "loss": 0.9322, + "step": 6653 + }, + { + "epoch": 0.45084355308625246, + "grad_norm": 5.054771423339844, + "learning_rate": 9.19186802655897e-05, + "loss": 0.7711, + "step": 6654 + }, + { + "epoch": 0.45091130835422455, + "grad_norm": 7.640350341796875, + "learning_rate": 9.191731124649189e-05, + "loss": 0.8006, + "step": 6655 + }, + { + "epoch": 0.45097906362219664, + "grad_norm": 6.9720001220703125, + "learning_rate": 9.191594222739408e-05, + "loss": 0.7912, + "step": 6656 + }, + { + "epoch": 0.45104681889016873, + "grad_norm": 7.61221981048584, + "learning_rate": 9.191457320829626e-05, + "loss": 0.8612, + "step": 6657 + }, + { + "epoch": 0.45111457415814077, + "grad_norm": 7.7177958488464355, + "learning_rate": 9.191320418919844e-05, + "loss": 1.0009, + "step": 6658 + }, + { + "epoch": 0.45118232942611286, + "grad_norm": 6.217735290527344, + "learning_rate": 9.191183517010062e-05, + "loss": 0.8091, + "step": 6659 + }, + { + "epoch": 0.45125008469408495, + "grad_norm": 8.2019681930542, + "learning_rate": 9.191046615100281e-05, + "loss": 1.055, + "step": 6660 + }, + { + "epoch": 0.45131783996205704, + "grad_norm": 5.853175640106201, + "learning_rate": 9.1909097131905e-05, + "loss": 1.1045, + "step": 6661 + }, + { + "epoch": 0.45138559523002914, + "grad_norm": 6.276338577270508, + "learning_rate": 9.190772811280717e-05, + "loss": 0.9436, + "step": 6662 + }, + { + "epoch": 0.4514533504980012, + "grad_norm": 11.320722579956055, + "learning_rate": 9.190635909370935e-05, + "loss": 1.1028, + "step": 6663 + }, + { + "epoch": 0.4515211057659733, + "grad_norm": 6.311827659606934, + "learning_rate": 9.190499007461154e-05, + "loss": 0.9232, + "step": 6664 + }, + { + "epoch": 0.4515888610339454, + "grad_norm": 8.186714172363281, + "learning_rate": 9.190362105551373e-05, + "loss": 0.7087, + "step": 6665 + }, + { + "epoch": 0.4516566163019175, + "grad_norm": 6.127712249755859, + "learning_rate": 9.190225203641591e-05, + "loss": 1.0268, + "step": 6666 + }, + { + "epoch": 0.45172437156988954, + "grad_norm": 6.840565204620361, + "learning_rate": 9.190088301731809e-05, + "loss": 0.9955, + "step": 6667 + }, + { + "epoch": 0.45179212683786163, + "grad_norm": 6.302945137023926, + "learning_rate": 9.189951399822027e-05, + "loss": 0.9892, + "step": 6668 + }, + { + "epoch": 0.4518598821058337, + "grad_norm": 9.08460521697998, + "learning_rate": 9.189814497912246e-05, + "loss": 0.7988, + "step": 6669 + }, + { + "epoch": 0.4519276373738058, + "grad_norm": 7.497137069702148, + "learning_rate": 9.189677596002464e-05, + "loss": 0.8358, + "step": 6670 + }, + { + "epoch": 0.4519953926417779, + "grad_norm": 6.515604496002197, + "learning_rate": 9.189540694092682e-05, + "loss": 0.7724, + "step": 6671 + }, + { + "epoch": 0.45206314790975, + "grad_norm": 6.092652320861816, + "learning_rate": 9.189403792182902e-05, + "loss": 0.7308, + "step": 6672 + }, + { + "epoch": 0.4521309031777221, + "grad_norm": 8.810023307800293, + "learning_rate": 9.18926689027312e-05, + "loss": 0.7249, + "step": 6673 + }, + { + "epoch": 0.4521986584456942, + "grad_norm": 8.328235626220703, + "learning_rate": 9.189129988363338e-05, + "loss": 0.8115, + "step": 6674 + }, + { + "epoch": 0.4522664137136662, + "grad_norm": 7.251335620880127, + "learning_rate": 9.188993086453557e-05, + "loss": 0.9843, + "step": 6675 + }, + { + "epoch": 0.4523341689816383, + "grad_norm": 8.03663444519043, + "learning_rate": 9.188856184543775e-05, + "loss": 0.9925, + "step": 6676 + }, + { + "epoch": 0.4524019242496104, + "grad_norm": 9.7393798828125, + "learning_rate": 9.188719282633993e-05, + "loss": 0.7575, + "step": 6677 + }, + { + "epoch": 0.4524696795175825, + "grad_norm": 6.343038558959961, + "learning_rate": 9.188582380724211e-05, + "loss": 1.0638, + "step": 6678 + }, + { + "epoch": 0.4525374347855546, + "grad_norm": 7.530363082885742, + "learning_rate": 9.188445478814431e-05, + "loss": 1.0212, + "step": 6679 + }, + { + "epoch": 0.4526051900535267, + "grad_norm": 8.741231918334961, + "learning_rate": 9.188308576904649e-05, + "loss": 1.1723, + "step": 6680 + }, + { + "epoch": 0.45267294532149877, + "grad_norm": 9.057670593261719, + "learning_rate": 9.188171674994867e-05, + "loss": 1.0396, + "step": 6681 + }, + { + "epoch": 0.45274070058947086, + "grad_norm": 6.638017177581787, + "learning_rate": 9.188034773085085e-05, + "loss": 0.9465, + "step": 6682 + }, + { + "epoch": 0.4528084558574429, + "grad_norm": 7.643139839172363, + "learning_rate": 9.187897871175304e-05, + "loss": 0.9346, + "step": 6683 + }, + { + "epoch": 0.452876211125415, + "grad_norm": 7.176743984222412, + "learning_rate": 9.187760969265522e-05, + "loss": 1.036, + "step": 6684 + }, + { + "epoch": 0.4529439663933871, + "grad_norm": 7.506284713745117, + "learning_rate": 9.18762406735574e-05, + "loss": 0.9487, + "step": 6685 + }, + { + "epoch": 0.45301172166135917, + "grad_norm": 6.865815162658691, + "learning_rate": 9.187487165445958e-05, + "loss": 0.8211, + "step": 6686 + }, + { + "epoch": 0.45307947692933126, + "grad_norm": 6.889796733856201, + "learning_rate": 9.187350263536176e-05, + "loss": 0.7901, + "step": 6687 + }, + { + "epoch": 0.45314723219730335, + "grad_norm": 7.016167163848877, + "learning_rate": 9.187213361626396e-05, + "loss": 0.7699, + "step": 6688 + }, + { + "epoch": 0.45321498746527544, + "grad_norm": 8.25091552734375, + "learning_rate": 9.187076459716614e-05, + "loss": 0.9778, + "step": 6689 + }, + { + "epoch": 0.45328274273324753, + "grad_norm": 7.092465877532959, + "learning_rate": 9.186939557806832e-05, + "loss": 1.2544, + "step": 6690 + }, + { + "epoch": 0.45335049800121957, + "grad_norm": 6.402629375457764, + "learning_rate": 9.18680265589705e-05, + "loss": 0.9225, + "step": 6691 + }, + { + "epoch": 0.45341825326919166, + "grad_norm": 7.404470920562744, + "learning_rate": 9.186665753987268e-05, + "loss": 0.7975, + "step": 6692 + }, + { + "epoch": 0.45348600853716375, + "grad_norm": 5.976271629333496, + "learning_rate": 9.186528852077487e-05, + "loss": 0.8868, + "step": 6693 + }, + { + "epoch": 0.45355376380513585, + "grad_norm": 7.843896389007568, + "learning_rate": 9.186391950167705e-05, + "loss": 0.8034, + "step": 6694 + }, + { + "epoch": 0.45362151907310794, + "grad_norm": 7.217418193817139, + "learning_rate": 9.186255048257923e-05, + "loss": 0.6763, + "step": 6695 + }, + { + "epoch": 0.45368927434108003, + "grad_norm": 6.091420650482178, + "learning_rate": 9.186118146348141e-05, + "loss": 0.6565, + "step": 6696 + }, + { + "epoch": 0.4537570296090521, + "grad_norm": 8.277983665466309, + "learning_rate": 9.185981244438361e-05, + "loss": 0.7589, + "step": 6697 + }, + { + "epoch": 0.4538247848770242, + "grad_norm": 9.001012802124023, + "learning_rate": 9.185844342528579e-05, + "loss": 0.8664, + "step": 6698 + }, + { + "epoch": 0.45389254014499625, + "grad_norm": 6.039417266845703, + "learning_rate": 9.185707440618797e-05, + "loss": 0.809, + "step": 6699 + }, + { + "epoch": 0.45396029541296834, + "grad_norm": 6.930225849151611, + "learning_rate": 9.185570538709015e-05, + "loss": 0.7567, + "step": 6700 + }, + { + "epoch": 0.45402805068094043, + "grad_norm": 7.135040760040283, + "learning_rate": 9.185433636799233e-05, + "loss": 0.7669, + "step": 6701 + }, + { + "epoch": 0.4540958059489125, + "grad_norm": 7.994284152984619, + "learning_rate": 9.185296734889452e-05, + "loss": 0.7671, + "step": 6702 + }, + { + "epoch": 0.4541635612168846, + "grad_norm": 6.146820068359375, + "learning_rate": 9.18515983297967e-05, + "loss": 0.8975, + "step": 6703 + }, + { + "epoch": 0.4542313164848567, + "grad_norm": 6.619115352630615, + "learning_rate": 9.185022931069888e-05, + "loss": 0.9972, + "step": 6704 + }, + { + "epoch": 0.4542990717528288, + "grad_norm": 5.762700080871582, + "learning_rate": 9.184886029160106e-05, + "loss": 0.6798, + "step": 6705 + }, + { + "epoch": 0.4543668270208009, + "grad_norm": 7.270383358001709, + "learning_rate": 9.184749127250326e-05, + "loss": 0.8676, + "step": 6706 + }, + { + "epoch": 0.4544345822887729, + "grad_norm": 6.667981147766113, + "learning_rate": 9.184612225340544e-05, + "loss": 0.7988, + "step": 6707 + }, + { + "epoch": 0.454502337556745, + "grad_norm": 5.513166904449463, + "learning_rate": 9.184475323430762e-05, + "loss": 0.868, + "step": 6708 + }, + { + "epoch": 0.4545700928247171, + "grad_norm": 6.376199245452881, + "learning_rate": 9.18433842152098e-05, + "loss": 0.7859, + "step": 6709 + }, + { + "epoch": 0.4546378480926892, + "grad_norm": 6.444526672363281, + "learning_rate": 9.184201519611198e-05, + "loss": 0.7793, + "step": 6710 + }, + { + "epoch": 0.4547056033606613, + "grad_norm": 8.83858871459961, + "learning_rate": 9.184064617701417e-05, + "loss": 0.9903, + "step": 6711 + }, + { + "epoch": 0.4547733586286334, + "grad_norm": 6.07218599319458, + "learning_rate": 9.183927715791635e-05, + "loss": 0.8349, + "step": 6712 + }, + { + "epoch": 0.4548411138966055, + "grad_norm": 6.652568340301514, + "learning_rate": 9.183790813881853e-05, + "loss": 0.8982, + "step": 6713 + }, + { + "epoch": 0.45490886916457757, + "grad_norm": 7.582399845123291, + "learning_rate": 9.183653911972071e-05, + "loss": 0.9833, + "step": 6714 + }, + { + "epoch": 0.4549766244325496, + "grad_norm": 8.411558151245117, + "learning_rate": 9.183517010062291e-05, + "loss": 0.9433, + "step": 6715 + }, + { + "epoch": 0.4550443797005217, + "grad_norm": 7.420217037200928, + "learning_rate": 9.183380108152509e-05, + "loss": 0.9508, + "step": 6716 + }, + { + "epoch": 0.4551121349684938, + "grad_norm": 7.064800262451172, + "learning_rate": 9.183243206242727e-05, + "loss": 0.9414, + "step": 6717 + }, + { + "epoch": 0.4551798902364659, + "grad_norm": 8.102936744689941, + "learning_rate": 9.183106304332946e-05, + "loss": 0.9061, + "step": 6718 + }, + { + "epoch": 0.45524764550443797, + "grad_norm": 8.261469841003418, + "learning_rate": 9.182969402423164e-05, + "loss": 1.2411, + "step": 6719 + }, + { + "epoch": 0.45531540077241006, + "grad_norm": 7.5003204345703125, + "learning_rate": 9.182832500513382e-05, + "loss": 0.9296, + "step": 6720 + }, + { + "epoch": 0.45538315604038215, + "grad_norm": 7.358273029327393, + "learning_rate": 9.182695598603602e-05, + "loss": 0.8109, + "step": 6721 + }, + { + "epoch": 0.45545091130835424, + "grad_norm": 7.0917768478393555, + "learning_rate": 9.18255869669382e-05, + "loss": 0.9517, + "step": 6722 + }, + { + "epoch": 0.45551866657632634, + "grad_norm": 6.559359550476074, + "learning_rate": 9.182421794784038e-05, + "loss": 0.8381, + "step": 6723 + }, + { + "epoch": 0.4555864218442984, + "grad_norm": 6.368768215179443, + "learning_rate": 9.182284892874256e-05, + "loss": 0.9104, + "step": 6724 + }, + { + "epoch": 0.45565417711227046, + "grad_norm": 6.576778411865234, + "learning_rate": 9.182147990964475e-05, + "loss": 0.9116, + "step": 6725 + }, + { + "epoch": 0.45572193238024256, + "grad_norm": 6.608447551727295, + "learning_rate": 9.182011089054693e-05, + "loss": 0.9371, + "step": 6726 + }, + { + "epoch": 0.45578968764821465, + "grad_norm": 6.38028621673584, + "learning_rate": 9.181874187144911e-05, + "loss": 0.9607, + "step": 6727 + }, + { + "epoch": 0.45585744291618674, + "grad_norm": 8.349663734436035, + "learning_rate": 9.18173728523513e-05, + "loss": 0.8739, + "step": 6728 + }, + { + "epoch": 0.45592519818415883, + "grad_norm": 7.229827880859375, + "learning_rate": 9.181600383325349e-05, + "loss": 0.6525, + "step": 6729 + }, + { + "epoch": 0.4559929534521309, + "grad_norm": 7.870299816131592, + "learning_rate": 9.181463481415567e-05, + "loss": 0.9938, + "step": 6730 + }, + { + "epoch": 0.456060708720103, + "grad_norm": 7.6586503982543945, + "learning_rate": 9.181326579505785e-05, + "loss": 0.9489, + "step": 6731 + }, + { + "epoch": 0.45612846398807505, + "grad_norm": 8.91909122467041, + "learning_rate": 9.181189677596003e-05, + "loss": 0.9642, + "step": 6732 + }, + { + "epoch": 0.45619621925604714, + "grad_norm": 8.341778755187988, + "learning_rate": 9.181052775686221e-05, + "loss": 1.0161, + "step": 6733 + }, + { + "epoch": 0.45626397452401923, + "grad_norm": 7.269313812255859, + "learning_rate": 9.18091587377644e-05, + "loss": 0.8535, + "step": 6734 + }, + { + "epoch": 0.4563317297919913, + "grad_norm": 6.635743141174316, + "learning_rate": 9.180778971866658e-05, + "loss": 0.6519, + "step": 6735 + }, + { + "epoch": 0.4563994850599634, + "grad_norm": 7.213603973388672, + "learning_rate": 9.180642069956876e-05, + "loss": 0.8314, + "step": 6736 + }, + { + "epoch": 0.4564672403279355, + "grad_norm": 6.491673469543457, + "learning_rate": 9.180505168047094e-05, + "loss": 0.8691, + "step": 6737 + }, + { + "epoch": 0.4565349955959076, + "grad_norm": 6.087094783782959, + "learning_rate": 9.180368266137314e-05, + "loss": 0.8732, + "step": 6738 + }, + { + "epoch": 0.4566027508638797, + "grad_norm": 6.0057172775268555, + "learning_rate": 9.180231364227532e-05, + "loss": 0.74, + "step": 6739 + }, + { + "epoch": 0.45667050613185173, + "grad_norm": 6.74835729598999, + "learning_rate": 9.18009446231775e-05, + "loss": 0.7628, + "step": 6740 + }, + { + "epoch": 0.4567382613998238, + "grad_norm": 5.433539867401123, + "learning_rate": 9.179957560407968e-05, + "loss": 0.6438, + "step": 6741 + }, + { + "epoch": 0.4568060166677959, + "grad_norm": 7.143089771270752, + "learning_rate": 9.179820658498186e-05, + "loss": 0.807, + "step": 6742 + }, + { + "epoch": 0.456873771935768, + "grad_norm": 6.345632076263428, + "learning_rate": 9.179683756588405e-05, + "loss": 0.8652, + "step": 6743 + }, + { + "epoch": 0.4569415272037401, + "grad_norm": 6.4180908203125, + "learning_rate": 9.179546854678623e-05, + "loss": 0.8225, + "step": 6744 + }, + { + "epoch": 0.4570092824717122, + "grad_norm": 7.375923156738281, + "learning_rate": 9.179409952768841e-05, + "loss": 0.7922, + "step": 6745 + }, + { + "epoch": 0.4570770377396843, + "grad_norm": 5.134999752044678, + "learning_rate": 9.17927305085906e-05, + "loss": 0.8943, + "step": 6746 + }, + { + "epoch": 0.45714479300765637, + "grad_norm": 8.040843963623047, + "learning_rate": 9.179136148949277e-05, + "loss": 0.9971, + "step": 6747 + }, + { + "epoch": 0.4572125482756284, + "grad_norm": 5.302629470825195, + "learning_rate": 9.178999247039497e-05, + "loss": 0.6811, + "step": 6748 + }, + { + "epoch": 0.4572803035436005, + "grad_norm": 6.630457878112793, + "learning_rate": 9.178862345129715e-05, + "loss": 0.7929, + "step": 6749 + }, + { + "epoch": 0.4573480588115726, + "grad_norm": 5.959926605224609, + "learning_rate": 9.178725443219933e-05, + "loss": 0.7941, + "step": 6750 + }, + { + "epoch": 0.4574158140795447, + "grad_norm": 7.855923175811768, + "learning_rate": 9.178588541310151e-05, + "loss": 0.911, + "step": 6751 + }, + { + "epoch": 0.45748356934751677, + "grad_norm": 7.775002479553223, + "learning_rate": 9.17845163940037e-05, + "loss": 0.8315, + "step": 6752 + }, + { + "epoch": 0.45755132461548886, + "grad_norm": 7.259592056274414, + "learning_rate": 9.178314737490588e-05, + "loss": 0.8383, + "step": 6753 + }, + { + "epoch": 0.45761907988346096, + "grad_norm": 8.843462944030762, + "learning_rate": 9.178177835580806e-05, + "loss": 1.1005, + "step": 6754 + }, + { + "epoch": 0.45768683515143305, + "grad_norm": 6.322036266326904, + "learning_rate": 9.178040933671024e-05, + "loss": 0.8553, + "step": 6755 + }, + { + "epoch": 0.4577545904194051, + "grad_norm": 6.644796848297119, + "learning_rate": 9.177904031761242e-05, + "loss": 1.0933, + "step": 6756 + }, + { + "epoch": 0.4578223456873772, + "grad_norm": 7.463156700134277, + "learning_rate": 9.177767129851462e-05, + "loss": 0.9584, + "step": 6757 + }, + { + "epoch": 0.45789010095534927, + "grad_norm": 5.70056676864624, + "learning_rate": 9.17763022794168e-05, + "loss": 0.6945, + "step": 6758 + }, + { + "epoch": 0.45795785622332136, + "grad_norm": 6.347555637359619, + "learning_rate": 9.177493326031898e-05, + "loss": 0.9143, + "step": 6759 + }, + { + "epoch": 0.45802561149129345, + "grad_norm": 6.00151252746582, + "learning_rate": 9.177356424122116e-05, + "loss": 0.8139, + "step": 6760 + }, + { + "epoch": 0.45809336675926554, + "grad_norm": 8.02507495880127, + "learning_rate": 9.177219522212335e-05, + "loss": 0.8941, + "step": 6761 + }, + { + "epoch": 0.45816112202723763, + "grad_norm": 7.0266876220703125, + "learning_rate": 9.177082620302553e-05, + "loss": 1.0009, + "step": 6762 + }, + { + "epoch": 0.4582288772952097, + "grad_norm": 5.483405113220215, + "learning_rate": 9.176945718392771e-05, + "loss": 0.8113, + "step": 6763 + }, + { + "epoch": 0.45829663256318176, + "grad_norm": 5.661553382873535, + "learning_rate": 9.176808816482991e-05, + "loss": 0.7524, + "step": 6764 + }, + { + "epoch": 0.45836438783115385, + "grad_norm": 7.485569953918457, + "learning_rate": 9.176671914573209e-05, + "loss": 0.8712, + "step": 6765 + }, + { + "epoch": 0.45843214309912594, + "grad_norm": 8.262595176696777, + "learning_rate": 9.176535012663427e-05, + "loss": 0.7777, + "step": 6766 + }, + { + "epoch": 0.45849989836709804, + "grad_norm": 6.955083847045898, + "learning_rate": 9.176398110753646e-05, + "loss": 0.8515, + "step": 6767 + }, + { + "epoch": 0.4585676536350701, + "grad_norm": 5.7011871337890625, + "learning_rate": 9.176261208843864e-05, + "loss": 0.7977, + "step": 6768 + }, + { + "epoch": 0.4586354089030422, + "grad_norm": 7.344122886657715, + "learning_rate": 9.176124306934082e-05, + "loss": 0.877, + "step": 6769 + }, + { + "epoch": 0.4587031641710143, + "grad_norm": 6.595543384552002, + "learning_rate": 9.1759874050243e-05, + "loss": 0.7739, + "step": 6770 + }, + { + "epoch": 0.4587709194389864, + "grad_norm": 6.087022304534912, + "learning_rate": 9.17585050311452e-05, + "loss": 0.9127, + "step": 6771 + }, + { + "epoch": 0.4588386747069585, + "grad_norm": 6.13311243057251, + "learning_rate": 9.175713601204738e-05, + "loss": 0.8803, + "step": 6772 + }, + { + "epoch": 0.45890642997493053, + "grad_norm": 5.866177558898926, + "learning_rate": 9.175576699294956e-05, + "loss": 0.9859, + "step": 6773 + }, + { + "epoch": 0.4589741852429026, + "grad_norm": 7.406826972961426, + "learning_rate": 9.175439797385174e-05, + "loss": 1.2433, + "step": 6774 + }, + { + "epoch": 0.4590419405108747, + "grad_norm": 5.886981964111328, + "learning_rate": 9.175302895475393e-05, + "loss": 0.9994, + "step": 6775 + }, + { + "epoch": 0.4591096957788468, + "grad_norm": 6.240331172943115, + "learning_rate": 9.175165993565611e-05, + "loss": 0.6988, + "step": 6776 + }, + { + "epoch": 0.4591774510468189, + "grad_norm": 6.331019878387451, + "learning_rate": 9.175029091655829e-05, + "loss": 0.9625, + "step": 6777 + }, + { + "epoch": 0.459245206314791, + "grad_norm": 6.082772731781006, + "learning_rate": 9.174892189746047e-05, + "loss": 0.6876, + "step": 6778 + }, + { + "epoch": 0.4593129615827631, + "grad_norm": 9.173615455627441, + "learning_rate": 9.174755287836265e-05, + "loss": 0.9572, + "step": 6779 + }, + { + "epoch": 0.45938071685073517, + "grad_norm": 7.658091068267822, + "learning_rate": 9.174618385926485e-05, + "loss": 1.0724, + "step": 6780 + }, + { + "epoch": 0.4594484721187072, + "grad_norm": 5.414113521575928, + "learning_rate": 9.174481484016703e-05, + "loss": 0.963, + "step": 6781 + }, + { + "epoch": 0.4595162273866793, + "grad_norm": 8.290900230407715, + "learning_rate": 9.174344582106921e-05, + "loss": 0.8012, + "step": 6782 + }, + { + "epoch": 0.4595839826546514, + "grad_norm": 5.778069019317627, + "learning_rate": 9.174207680197139e-05, + "loss": 0.9199, + "step": 6783 + }, + { + "epoch": 0.4596517379226235, + "grad_norm": 7.42002010345459, + "learning_rate": 9.174070778287358e-05, + "loss": 1.022, + "step": 6784 + }, + { + "epoch": 0.4597194931905956, + "grad_norm": 5.714671611785889, + "learning_rate": 9.173933876377576e-05, + "loss": 0.8194, + "step": 6785 + }, + { + "epoch": 0.45978724845856767, + "grad_norm": 6.420266628265381, + "learning_rate": 9.173796974467794e-05, + "loss": 0.8879, + "step": 6786 + }, + { + "epoch": 0.45985500372653976, + "grad_norm": 7.046072006225586, + "learning_rate": 9.173660072558012e-05, + "loss": 0.7672, + "step": 6787 + }, + { + "epoch": 0.45992275899451185, + "grad_norm": 8.797179222106934, + "learning_rate": 9.17352317064823e-05, + "loss": 0.7714, + "step": 6788 + }, + { + "epoch": 0.4599905142624839, + "grad_norm": 6.534037113189697, + "learning_rate": 9.17338626873845e-05, + "loss": 0.8629, + "step": 6789 + }, + { + "epoch": 0.460058269530456, + "grad_norm": 7.647995948791504, + "learning_rate": 9.173249366828668e-05, + "loss": 0.8143, + "step": 6790 + }, + { + "epoch": 0.46012602479842807, + "grad_norm": 5.4971418380737305, + "learning_rate": 9.173112464918886e-05, + "loss": 0.7946, + "step": 6791 + }, + { + "epoch": 0.46019378006640016, + "grad_norm": 6.304740905761719, + "learning_rate": 9.172975563009104e-05, + "loss": 0.9233, + "step": 6792 + }, + { + "epoch": 0.46026153533437225, + "grad_norm": 7.571120262145996, + "learning_rate": 9.172838661099323e-05, + "loss": 0.9919, + "step": 6793 + }, + { + "epoch": 0.46032929060234434, + "grad_norm": 6.336854457855225, + "learning_rate": 9.172701759189541e-05, + "loss": 0.7232, + "step": 6794 + }, + { + "epoch": 0.46039704587031643, + "grad_norm": 6.926676273345947, + "learning_rate": 9.17256485727976e-05, + "loss": 0.758, + "step": 6795 + }, + { + "epoch": 0.4604648011382885, + "grad_norm": 5.480643272399902, + "learning_rate": 9.172427955369977e-05, + "loss": 0.5871, + "step": 6796 + }, + { + "epoch": 0.46053255640626056, + "grad_norm": 8.286799430847168, + "learning_rate": 9.172291053460195e-05, + "loss": 0.8574, + "step": 6797 + }, + { + "epoch": 0.46060031167423265, + "grad_norm": 8.043119430541992, + "learning_rate": 9.172154151550415e-05, + "loss": 0.6363, + "step": 6798 + }, + { + "epoch": 0.46066806694220475, + "grad_norm": 8.308424949645996, + "learning_rate": 9.172017249640633e-05, + "loss": 0.9203, + "step": 6799 + }, + { + "epoch": 0.46073582221017684, + "grad_norm": 7.131639003753662, + "learning_rate": 9.171880347730851e-05, + "loss": 0.685, + "step": 6800 + }, + { + "epoch": 0.46080357747814893, + "grad_norm": 8.397472381591797, + "learning_rate": 9.171743445821069e-05, + "loss": 0.9637, + "step": 6801 + }, + { + "epoch": 0.460871332746121, + "grad_norm": 6.666365146636963, + "learning_rate": 9.171606543911287e-05, + "loss": 1.0735, + "step": 6802 + }, + { + "epoch": 0.4609390880140931, + "grad_norm": 11.102825164794922, + "learning_rate": 9.171469642001506e-05, + "loss": 0.8958, + "step": 6803 + }, + { + "epoch": 0.4610068432820652, + "grad_norm": 7.294391632080078, + "learning_rate": 9.171332740091724e-05, + "loss": 1.0354, + "step": 6804 + }, + { + "epoch": 0.46107459855003724, + "grad_norm": 5.9984517097473145, + "learning_rate": 9.171195838181942e-05, + "loss": 0.605, + "step": 6805 + }, + { + "epoch": 0.46114235381800933, + "grad_norm": 6.209224224090576, + "learning_rate": 9.17105893627216e-05, + "loss": 0.8715, + "step": 6806 + }, + { + "epoch": 0.4612101090859814, + "grad_norm": 5.404207229614258, + "learning_rate": 9.17092203436238e-05, + "loss": 0.6694, + "step": 6807 + }, + { + "epoch": 0.4612778643539535, + "grad_norm": 6.675217151641846, + "learning_rate": 9.170785132452598e-05, + "loss": 0.7288, + "step": 6808 + }, + { + "epoch": 0.4613456196219256, + "grad_norm": 8.444995880126953, + "learning_rate": 9.170648230542816e-05, + "loss": 1.0756, + "step": 6809 + }, + { + "epoch": 0.4614133748898977, + "grad_norm": 8.167703628540039, + "learning_rate": 9.170511328633035e-05, + "loss": 1.1605, + "step": 6810 + }, + { + "epoch": 0.4614811301578698, + "grad_norm": 7.55181360244751, + "learning_rate": 9.170374426723253e-05, + "loss": 0.7805, + "step": 6811 + }, + { + "epoch": 0.4615488854258419, + "grad_norm": 6.380015850067139, + "learning_rate": 9.170237524813471e-05, + "loss": 0.6584, + "step": 6812 + }, + { + "epoch": 0.4616166406938139, + "grad_norm": 6.2454376220703125, + "learning_rate": 9.170100622903691e-05, + "loss": 0.8212, + "step": 6813 + }, + { + "epoch": 0.461684395961786, + "grad_norm": 7.530882358551025, + "learning_rate": 9.169963720993909e-05, + "loss": 0.8926, + "step": 6814 + }, + { + "epoch": 0.4617521512297581, + "grad_norm": 6.3245158195495605, + "learning_rate": 9.169826819084127e-05, + "loss": 0.8672, + "step": 6815 + }, + { + "epoch": 0.4618199064977302, + "grad_norm": 5.822272300720215, + "learning_rate": 9.169689917174346e-05, + "loss": 0.7972, + "step": 6816 + }, + { + "epoch": 0.4618876617657023, + "grad_norm": 8.160684585571289, + "learning_rate": 9.169553015264564e-05, + "loss": 1.0076, + "step": 6817 + }, + { + "epoch": 0.4619554170336744, + "grad_norm": 6.745192050933838, + "learning_rate": 9.169416113354782e-05, + "loss": 0.8187, + "step": 6818 + }, + { + "epoch": 0.46202317230164647, + "grad_norm": 8.530631065368652, + "learning_rate": 9.169279211445e-05, + "loss": 0.7655, + "step": 6819 + }, + { + "epoch": 0.46209092756961856, + "grad_norm": 7.213817596435547, + "learning_rate": 9.169142309535218e-05, + "loss": 0.8737, + "step": 6820 + }, + { + "epoch": 0.4621586828375906, + "grad_norm": 6.5060577392578125, + "learning_rate": 9.169005407625438e-05, + "loss": 0.815, + "step": 6821 + }, + { + "epoch": 0.4622264381055627, + "grad_norm": 6.782070636749268, + "learning_rate": 9.168868505715656e-05, + "loss": 0.792, + "step": 6822 + }, + { + "epoch": 0.4622941933735348, + "grad_norm": 6.130987644195557, + "learning_rate": 9.168731603805874e-05, + "loss": 0.6743, + "step": 6823 + }, + { + "epoch": 0.46236194864150687, + "grad_norm": 8.178153991699219, + "learning_rate": 9.168594701896092e-05, + "loss": 1.0958, + "step": 6824 + }, + { + "epoch": 0.46242970390947896, + "grad_norm": 8.360578536987305, + "learning_rate": 9.16845779998631e-05, + "loss": 0.9709, + "step": 6825 + }, + { + "epoch": 0.46249745917745105, + "grad_norm": 7.507253170013428, + "learning_rate": 9.168320898076529e-05, + "loss": 0.9384, + "step": 6826 + }, + { + "epoch": 0.46256521444542315, + "grad_norm": 8.285658836364746, + "learning_rate": 9.168183996166747e-05, + "loss": 0.8913, + "step": 6827 + }, + { + "epoch": 0.46263296971339524, + "grad_norm": 7.765392780303955, + "learning_rate": 9.168047094256965e-05, + "loss": 0.8754, + "step": 6828 + }, + { + "epoch": 0.46270072498136733, + "grad_norm": 7.6219892501831055, + "learning_rate": 9.167910192347183e-05, + "loss": 0.7401, + "step": 6829 + }, + { + "epoch": 0.46276848024933936, + "grad_norm": 9.139801979064941, + "learning_rate": 9.167773290437403e-05, + "loss": 0.8068, + "step": 6830 + }, + { + "epoch": 0.46283623551731146, + "grad_norm": 7.655275821685791, + "learning_rate": 9.167636388527621e-05, + "loss": 0.8314, + "step": 6831 + }, + { + "epoch": 0.46290399078528355, + "grad_norm": 6.699294090270996, + "learning_rate": 9.167499486617839e-05, + "loss": 0.8665, + "step": 6832 + }, + { + "epoch": 0.46297174605325564, + "grad_norm": 8.436169624328613, + "learning_rate": 9.167362584708057e-05, + "loss": 1.3026, + "step": 6833 + }, + { + "epoch": 0.46303950132122773, + "grad_norm": 5.195062160491943, + "learning_rate": 9.167225682798275e-05, + "loss": 0.7468, + "step": 6834 + }, + { + "epoch": 0.4631072565891998, + "grad_norm": 5.665080547332764, + "learning_rate": 9.167088780888494e-05, + "loss": 0.7357, + "step": 6835 + }, + { + "epoch": 0.4631750118571719, + "grad_norm": 6.243655681610107, + "learning_rate": 9.166951878978712e-05, + "loss": 0.7705, + "step": 6836 + }, + { + "epoch": 0.463242767125144, + "grad_norm": 7.49260950088501, + "learning_rate": 9.16681497706893e-05, + "loss": 0.8061, + "step": 6837 + }, + { + "epoch": 0.46331052239311604, + "grad_norm": 7.076335430145264, + "learning_rate": 9.166678075159148e-05, + "loss": 0.8307, + "step": 6838 + }, + { + "epoch": 0.46337827766108813, + "grad_norm": 6.332518100738525, + "learning_rate": 9.166541173249368e-05, + "loss": 0.7431, + "step": 6839 + }, + { + "epoch": 0.4634460329290602, + "grad_norm": 6.261449813842773, + "learning_rate": 9.166404271339586e-05, + "loss": 0.8817, + "step": 6840 + }, + { + "epoch": 0.4635137881970323, + "grad_norm": 7.467769622802734, + "learning_rate": 9.166267369429804e-05, + "loss": 0.9473, + "step": 6841 + }, + { + "epoch": 0.4635815434650044, + "grad_norm": 5.55718469619751, + "learning_rate": 9.166130467520022e-05, + "loss": 0.7726, + "step": 6842 + }, + { + "epoch": 0.4636492987329765, + "grad_norm": 7.012959003448486, + "learning_rate": 9.16599356561024e-05, + "loss": 0.9274, + "step": 6843 + }, + { + "epoch": 0.4637170540009486, + "grad_norm": 6.344860076904297, + "learning_rate": 9.165856663700459e-05, + "loss": 1.0617, + "step": 6844 + }, + { + "epoch": 0.4637848092689207, + "grad_norm": 6.024655342102051, + "learning_rate": 9.165719761790677e-05, + "loss": 0.8224, + "step": 6845 + }, + { + "epoch": 0.4638525645368927, + "grad_norm": 6.97758674621582, + "learning_rate": 9.165582859880895e-05, + "loss": 0.8344, + "step": 6846 + }, + { + "epoch": 0.4639203198048648, + "grad_norm": 6.874304294586182, + "learning_rate": 9.165445957971113e-05, + "loss": 0.8698, + "step": 6847 + }, + { + "epoch": 0.4639880750728369, + "grad_norm": 8.15816593170166, + "learning_rate": 9.165309056061331e-05, + "loss": 1.0457, + "step": 6848 + }, + { + "epoch": 0.464055830340809, + "grad_norm": 7.304631233215332, + "learning_rate": 9.165172154151551e-05, + "loss": 0.9027, + "step": 6849 + }, + { + "epoch": 0.4641235856087811, + "grad_norm": 7.377762317657471, + "learning_rate": 9.165035252241769e-05, + "loss": 0.6635, + "step": 6850 + }, + { + "epoch": 0.4641913408767532, + "grad_norm": 6.051864147186279, + "learning_rate": 9.164898350331987e-05, + "loss": 0.925, + "step": 6851 + }, + { + "epoch": 0.46425909614472527, + "grad_norm": 6.560476779937744, + "learning_rate": 9.164761448422205e-05, + "loss": 0.7674, + "step": 6852 + }, + { + "epoch": 0.46432685141269736, + "grad_norm": 6.828582763671875, + "learning_rate": 9.164624546512424e-05, + "loss": 0.8159, + "step": 6853 + }, + { + "epoch": 0.4643946066806694, + "grad_norm": 6.463281631469727, + "learning_rate": 9.164487644602642e-05, + "loss": 0.6261, + "step": 6854 + }, + { + "epoch": 0.4644623619486415, + "grad_norm": 6.3371734619140625, + "learning_rate": 9.16435074269286e-05, + "loss": 0.6489, + "step": 6855 + }, + { + "epoch": 0.4645301172166136, + "grad_norm": 6.052369117736816, + "learning_rate": 9.16421384078308e-05, + "loss": 0.6899, + "step": 6856 + }, + { + "epoch": 0.46459787248458567, + "grad_norm": 6.464377403259277, + "learning_rate": 9.164076938873298e-05, + "loss": 0.7603, + "step": 6857 + }, + { + "epoch": 0.46466562775255776, + "grad_norm": 5.9912567138671875, + "learning_rate": 9.163940036963516e-05, + "loss": 0.7188, + "step": 6858 + }, + { + "epoch": 0.46473338302052986, + "grad_norm": 8.469727516174316, + "learning_rate": 9.163803135053735e-05, + "loss": 0.9991, + "step": 6859 + }, + { + "epoch": 0.46480113828850195, + "grad_norm": 6.261715888977051, + "learning_rate": 9.163666233143953e-05, + "loss": 0.7429, + "step": 6860 + }, + { + "epoch": 0.46486889355647404, + "grad_norm": 7.389404296875, + "learning_rate": 9.163529331234171e-05, + "loss": 0.6294, + "step": 6861 + }, + { + "epoch": 0.4649366488244461, + "grad_norm": 6.988171100616455, + "learning_rate": 9.16339242932439e-05, + "loss": 0.8705, + "step": 6862 + }, + { + "epoch": 0.46500440409241817, + "grad_norm": 6.285641193389893, + "learning_rate": 9.163255527414609e-05, + "loss": 0.7026, + "step": 6863 + }, + { + "epoch": 0.46507215936039026, + "grad_norm": 8.77840518951416, + "learning_rate": 9.163118625504827e-05, + "loss": 0.8766, + "step": 6864 + }, + { + "epoch": 0.46513991462836235, + "grad_norm": 6.199909210205078, + "learning_rate": 9.162981723595045e-05, + "loss": 0.7304, + "step": 6865 + }, + { + "epoch": 0.46520766989633444, + "grad_norm": 6.947317600250244, + "learning_rate": 9.162844821685263e-05, + "loss": 0.8649, + "step": 6866 + }, + { + "epoch": 0.46527542516430653, + "grad_norm": 8.396434783935547, + "learning_rate": 9.162707919775482e-05, + "loss": 1.2161, + "step": 6867 + }, + { + "epoch": 0.4653431804322786, + "grad_norm": 5.935467720031738, + "learning_rate": 9.1625710178657e-05, + "loss": 0.8499, + "step": 6868 + }, + { + "epoch": 0.4654109357002507, + "grad_norm": 6.258296966552734, + "learning_rate": 9.162434115955918e-05, + "loss": 0.9541, + "step": 6869 + }, + { + "epoch": 0.46547869096822275, + "grad_norm": 6.29650354385376, + "learning_rate": 9.162297214046136e-05, + "loss": 0.7774, + "step": 6870 + }, + { + "epoch": 0.46554644623619484, + "grad_norm": 5.948836803436279, + "learning_rate": 9.162160312136356e-05, + "loss": 0.7579, + "step": 6871 + }, + { + "epoch": 0.46561420150416694, + "grad_norm": 5.719659805297852, + "learning_rate": 9.162023410226574e-05, + "loss": 0.7482, + "step": 6872 + }, + { + "epoch": 0.465681956772139, + "grad_norm": 6.145468235015869, + "learning_rate": 9.161886508316792e-05, + "loss": 0.8816, + "step": 6873 + }, + { + "epoch": 0.4657497120401111, + "grad_norm": 6.33701753616333, + "learning_rate": 9.16174960640701e-05, + "loss": 0.7617, + "step": 6874 + }, + { + "epoch": 0.4658174673080832, + "grad_norm": 6.143844127655029, + "learning_rate": 9.161612704497228e-05, + "loss": 0.9695, + "step": 6875 + }, + { + "epoch": 0.4658852225760553, + "grad_norm": 6.316319942474365, + "learning_rate": 9.161475802587447e-05, + "loss": 0.8837, + "step": 6876 + }, + { + "epoch": 0.4659529778440274, + "grad_norm": 6.68782377243042, + "learning_rate": 9.161338900677665e-05, + "loss": 0.7324, + "step": 6877 + }, + { + "epoch": 0.4660207331119995, + "grad_norm": 6.216282844543457, + "learning_rate": 9.161201998767883e-05, + "loss": 0.7787, + "step": 6878 + }, + { + "epoch": 0.4660884883799715, + "grad_norm": 6.8673176765441895, + "learning_rate": 9.161065096858101e-05, + "loss": 1.1118, + "step": 6879 + }, + { + "epoch": 0.4661562436479436, + "grad_norm": 6.71323823928833, + "learning_rate": 9.16092819494832e-05, + "loss": 0.9176, + "step": 6880 + }, + { + "epoch": 0.4662239989159157, + "grad_norm": 8.589679718017578, + "learning_rate": 9.160791293038539e-05, + "loss": 0.9758, + "step": 6881 + }, + { + "epoch": 0.4662917541838878, + "grad_norm": 6.692760467529297, + "learning_rate": 9.160654391128757e-05, + "loss": 0.8867, + "step": 6882 + }, + { + "epoch": 0.4663595094518599, + "grad_norm": 8.462479591369629, + "learning_rate": 9.160517489218975e-05, + "loss": 0.9738, + "step": 6883 + }, + { + "epoch": 0.466427264719832, + "grad_norm": 6.068343162536621, + "learning_rate": 9.160380587309193e-05, + "loss": 0.6652, + "step": 6884 + }, + { + "epoch": 0.46649501998780407, + "grad_norm": 6.04793643951416, + "learning_rate": 9.160243685399412e-05, + "loss": 0.8575, + "step": 6885 + }, + { + "epoch": 0.46656277525577616, + "grad_norm": 7.8360514640808105, + "learning_rate": 9.16010678348963e-05, + "loss": 0.8488, + "step": 6886 + }, + { + "epoch": 0.4666305305237482, + "grad_norm": 6.028532981872559, + "learning_rate": 9.159969881579848e-05, + "loss": 0.8841, + "step": 6887 + }, + { + "epoch": 0.4666982857917203, + "grad_norm": 7.673785209655762, + "learning_rate": 9.159832979670066e-05, + "loss": 0.9035, + "step": 6888 + }, + { + "epoch": 0.4667660410596924, + "grad_norm": 7.042590618133545, + "learning_rate": 9.159696077760284e-05, + "loss": 0.8812, + "step": 6889 + }, + { + "epoch": 0.4668337963276645, + "grad_norm": 6.562109470367432, + "learning_rate": 9.159559175850504e-05, + "loss": 0.7179, + "step": 6890 + }, + { + "epoch": 0.46690155159563657, + "grad_norm": 6.083657264709473, + "learning_rate": 9.159422273940722e-05, + "loss": 0.7322, + "step": 6891 + }, + { + "epoch": 0.46696930686360866, + "grad_norm": 7.408811569213867, + "learning_rate": 9.15928537203094e-05, + "loss": 0.9291, + "step": 6892 + }, + { + "epoch": 0.46703706213158075, + "grad_norm": 6.70425271987915, + "learning_rate": 9.159148470121158e-05, + "loss": 0.9796, + "step": 6893 + }, + { + "epoch": 0.46710481739955284, + "grad_norm": 7.486738681793213, + "learning_rate": 9.159011568211377e-05, + "loss": 1.128, + "step": 6894 + }, + { + "epoch": 0.4671725726675249, + "grad_norm": 6.747304916381836, + "learning_rate": 9.158874666301595e-05, + "loss": 0.8432, + "step": 6895 + }, + { + "epoch": 0.46724032793549697, + "grad_norm": 7.494656085968018, + "learning_rate": 9.158737764391813e-05, + "loss": 0.9587, + "step": 6896 + }, + { + "epoch": 0.46730808320346906, + "grad_norm": 5.556826114654541, + "learning_rate": 9.158600862482031e-05, + "loss": 0.6565, + "step": 6897 + }, + { + "epoch": 0.46737583847144115, + "grad_norm": 6.550345420837402, + "learning_rate": 9.15846396057225e-05, + "loss": 0.7854, + "step": 6898 + }, + { + "epoch": 0.46744359373941324, + "grad_norm": 7.0627899169921875, + "learning_rate": 9.158327058662469e-05, + "loss": 0.9204, + "step": 6899 + }, + { + "epoch": 0.46751134900738534, + "grad_norm": 7.1070661544799805, + "learning_rate": 9.158190156752687e-05, + "loss": 0.9357, + "step": 6900 + }, + { + "epoch": 0.4675791042753574, + "grad_norm": 7.313487529754639, + "learning_rate": 9.158053254842905e-05, + "loss": 0.7561, + "step": 6901 + }, + { + "epoch": 0.4676468595433295, + "grad_norm": 6.839418888092041, + "learning_rate": 9.157916352933123e-05, + "loss": 0.9551, + "step": 6902 + }, + { + "epoch": 0.46771461481130155, + "grad_norm": 6.881319999694824, + "learning_rate": 9.157779451023342e-05, + "loss": 0.8799, + "step": 6903 + }, + { + "epoch": 0.46778237007927365, + "grad_norm": 7.470109462738037, + "learning_rate": 9.15764254911356e-05, + "loss": 0.806, + "step": 6904 + }, + { + "epoch": 0.46785012534724574, + "grad_norm": 7.056912899017334, + "learning_rate": 9.157505647203778e-05, + "loss": 0.7905, + "step": 6905 + }, + { + "epoch": 0.46791788061521783, + "grad_norm": 6.295861721038818, + "learning_rate": 9.157368745293998e-05, + "loss": 0.8256, + "step": 6906 + }, + { + "epoch": 0.4679856358831899, + "grad_norm": 5.80570650100708, + "learning_rate": 9.157231843384216e-05, + "loss": 0.7747, + "step": 6907 + }, + { + "epoch": 0.468053391151162, + "grad_norm": 7.0094218254089355, + "learning_rate": 9.157094941474435e-05, + "loss": 0.9504, + "step": 6908 + }, + { + "epoch": 0.4681211464191341, + "grad_norm": 5.334716320037842, + "learning_rate": 9.156958039564653e-05, + "loss": 0.7608, + "step": 6909 + }, + { + "epoch": 0.4681889016871062, + "grad_norm": 8.058252334594727, + "learning_rate": 9.156821137654871e-05, + "loss": 0.889, + "step": 6910 + }, + { + "epoch": 0.46825665695507823, + "grad_norm": 7.769287586212158, + "learning_rate": 9.156684235745089e-05, + "loss": 1.0391, + "step": 6911 + }, + { + "epoch": 0.4683244122230503, + "grad_norm": 5.96143102645874, + "learning_rate": 9.156547333835307e-05, + "loss": 0.8793, + "step": 6912 + }, + { + "epoch": 0.4683921674910224, + "grad_norm": 5.6451005935668945, + "learning_rate": 9.156410431925527e-05, + "loss": 0.7804, + "step": 6913 + }, + { + "epoch": 0.4684599227589945, + "grad_norm": 7.539672374725342, + "learning_rate": 9.156273530015745e-05, + "loss": 0.8137, + "step": 6914 + }, + { + "epoch": 0.4685276780269666, + "grad_norm": 7.447227954864502, + "learning_rate": 9.156136628105963e-05, + "loss": 1.0692, + "step": 6915 + }, + { + "epoch": 0.4685954332949387, + "grad_norm": 8.100126266479492, + "learning_rate": 9.155999726196181e-05, + "loss": 0.8124, + "step": 6916 + }, + { + "epoch": 0.4686631885629108, + "grad_norm": 8.560744285583496, + "learning_rate": 9.1558628242864e-05, + "loss": 0.8659, + "step": 6917 + }, + { + "epoch": 0.4687309438308829, + "grad_norm": 7.135112285614014, + "learning_rate": 9.155725922376618e-05, + "loss": 0.9009, + "step": 6918 + }, + { + "epoch": 0.4687986990988549, + "grad_norm": 7.749111175537109, + "learning_rate": 9.155589020466836e-05, + "loss": 0.9209, + "step": 6919 + }, + { + "epoch": 0.468866454366827, + "grad_norm": 6.616466999053955, + "learning_rate": 9.155452118557054e-05, + "loss": 0.9283, + "step": 6920 + }, + { + "epoch": 0.4689342096347991, + "grad_norm": 6.961619853973389, + "learning_rate": 9.155315216647272e-05, + "loss": 0.8142, + "step": 6921 + }, + { + "epoch": 0.4690019649027712, + "grad_norm": 7.225759506225586, + "learning_rate": 9.155178314737492e-05, + "loss": 0.7626, + "step": 6922 + }, + { + "epoch": 0.4690697201707433, + "grad_norm": 6.0465922355651855, + "learning_rate": 9.15504141282771e-05, + "loss": 0.8281, + "step": 6923 + }, + { + "epoch": 0.46913747543871537, + "grad_norm": 7.507081985473633, + "learning_rate": 9.154904510917928e-05, + "loss": 1.1642, + "step": 6924 + }, + { + "epoch": 0.46920523070668746, + "grad_norm": 6.281520843505859, + "learning_rate": 9.154767609008146e-05, + "loss": 0.8694, + "step": 6925 + }, + { + "epoch": 0.46927298597465955, + "grad_norm": 6.0980658531188965, + "learning_rate": 9.154630707098365e-05, + "loss": 0.6983, + "step": 6926 + }, + { + "epoch": 0.4693407412426316, + "grad_norm": 5.842291355133057, + "learning_rate": 9.154493805188583e-05, + "loss": 0.8392, + "step": 6927 + }, + { + "epoch": 0.4694084965106037, + "grad_norm": 7.486947536468506, + "learning_rate": 9.154356903278801e-05, + "loss": 0.8363, + "step": 6928 + }, + { + "epoch": 0.46947625177857577, + "grad_norm": 7.279699802398682, + "learning_rate": 9.154220001369019e-05, + "loss": 0.9697, + "step": 6929 + }, + { + "epoch": 0.46954400704654786, + "grad_norm": 6.720832347869873, + "learning_rate": 9.154083099459237e-05, + "loss": 1.0377, + "step": 6930 + }, + { + "epoch": 0.46961176231451995, + "grad_norm": 9.792084693908691, + "learning_rate": 9.153946197549457e-05, + "loss": 1.1697, + "step": 6931 + }, + { + "epoch": 0.46967951758249205, + "grad_norm": 6.329649925231934, + "learning_rate": 9.153809295639675e-05, + "loss": 0.8825, + "step": 6932 + }, + { + "epoch": 0.46974727285046414, + "grad_norm": 5.9268269538879395, + "learning_rate": 9.153672393729893e-05, + "loss": 0.9157, + "step": 6933 + }, + { + "epoch": 0.46981502811843623, + "grad_norm": 7.7385430335998535, + "learning_rate": 9.153535491820111e-05, + "loss": 0.7662, + "step": 6934 + }, + { + "epoch": 0.4698827833864083, + "grad_norm": 6.896132946014404, + "learning_rate": 9.153398589910329e-05, + "loss": 1.004, + "step": 6935 + }, + { + "epoch": 0.46995053865438036, + "grad_norm": 6.822011470794678, + "learning_rate": 9.153261688000548e-05, + "loss": 1.0163, + "step": 6936 + }, + { + "epoch": 0.47001829392235245, + "grad_norm": 6.4482574462890625, + "learning_rate": 9.153124786090766e-05, + "loss": 0.8293, + "step": 6937 + }, + { + "epoch": 0.47008604919032454, + "grad_norm": 6.855703353881836, + "learning_rate": 9.152987884180984e-05, + "loss": 0.9737, + "step": 6938 + }, + { + "epoch": 0.47015380445829663, + "grad_norm": 6.508902549743652, + "learning_rate": 9.152850982271202e-05, + "loss": 1.0892, + "step": 6939 + }, + { + "epoch": 0.4702215597262687, + "grad_norm": 5.452862739562988, + "learning_rate": 9.152714080361422e-05, + "loss": 0.803, + "step": 6940 + }, + { + "epoch": 0.4702893149942408, + "grad_norm": 5.583015441894531, + "learning_rate": 9.15257717845164e-05, + "loss": 0.6624, + "step": 6941 + }, + { + "epoch": 0.4703570702622129, + "grad_norm": 7.870943069458008, + "learning_rate": 9.152440276541858e-05, + "loss": 0.929, + "step": 6942 + }, + { + "epoch": 0.470424825530185, + "grad_norm": 7.723261833190918, + "learning_rate": 9.152303374632076e-05, + "loss": 0.8467, + "step": 6943 + }, + { + "epoch": 0.47049258079815703, + "grad_norm": 8.4433012008667, + "learning_rate": 9.152166472722294e-05, + "loss": 0.9935, + "step": 6944 + }, + { + "epoch": 0.4705603360661291, + "grad_norm": 7.142673015594482, + "learning_rate": 9.152029570812513e-05, + "loss": 0.9821, + "step": 6945 + }, + { + "epoch": 0.4706280913341012, + "grad_norm": 6.800427436828613, + "learning_rate": 9.151892668902731e-05, + "loss": 0.7329, + "step": 6946 + }, + { + "epoch": 0.4706958466020733, + "grad_norm": 6.462594985961914, + "learning_rate": 9.151755766992949e-05, + "loss": 0.6495, + "step": 6947 + }, + { + "epoch": 0.4707636018700454, + "grad_norm": 6.2622294425964355, + "learning_rate": 9.151618865083167e-05, + "loss": 1.0438, + "step": 6948 + }, + { + "epoch": 0.4708313571380175, + "grad_norm": 7.76660680770874, + "learning_rate": 9.151481963173387e-05, + "loss": 0.8855, + "step": 6949 + }, + { + "epoch": 0.4708991124059896, + "grad_norm": 4.535599231719971, + "learning_rate": 9.151345061263605e-05, + "loss": 0.6823, + "step": 6950 + }, + { + "epoch": 0.4709668676739617, + "grad_norm": 5.862152099609375, + "learning_rate": 9.151208159353823e-05, + "loss": 0.7322, + "step": 6951 + }, + { + "epoch": 0.4710346229419337, + "grad_norm": 6.145107746124268, + "learning_rate": 9.151071257444042e-05, + "loss": 0.8258, + "step": 6952 + }, + { + "epoch": 0.4711023782099058, + "grad_norm": 5.889834880828857, + "learning_rate": 9.15093435553426e-05, + "loss": 0.8009, + "step": 6953 + }, + { + "epoch": 0.4711701334778779, + "grad_norm": 8.594234466552734, + "learning_rate": 9.150797453624478e-05, + "loss": 0.8349, + "step": 6954 + }, + { + "epoch": 0.47123788874585, + "grad_norm": 7.7569379806518555, + "learning_rate": 9.150660551714698e-05, + "loss": 0.6711, + "step": 6955 + }, + { + "epoch": 0.4713056440138221, + "grad_norm": 7.4088544845581055, + "learning_rate": 9.150523649804916e-05, + "loss": 0.8936, + "step": 6956 + }, + { + "epoch": 0.47137339928179417, + "grad_norm": 6.610263347625732, + "learning_rate": 9.150386747895134e-05, + "loss": 0.9786, + "step": 6957 + }, + { + "epoch": 0.47144115454976626, + "grad_norm": 6.46956729888916, + "learning_rate": 9.150249845985352e-05, + "loss": 0.6781, + "step": 6958 + }, + { + "epoch": 0.47150890981773835, + "grad_norm": 6.152948379516602, + "learning_rate": 9.150112944075571e-05, + "loss": 0.8375, + "step": 6959 + }, + { + "epoch": 0.4715766650857104, + "grad_norm": 6.96013879776001, + "learning_rate": 9.149976042165789e-05, + "loss": 0.8629, + "step": 6960 + }, + { + "epoch": 0.4716444203536825, + "grad_norm": 6.767139434814453, + "learning_rate": 9.149839140256007e-05, + "loss": 0.8043, + "step": 6961 + }, + { + "epoch": 0.4717121756216546, + "grad_norm": 7.95654296875, + "learning_rate": 9.149702238346225e-05, + "loss": 1.0179, + "step": 6962 + }, + { + "epoch": 0.47177993088962666, + "grad_norm": 8.155875205993652, + "learning_rate": 9.149565336436445e-05, + "loss": 1.0663, + "step": 6963 + }, + { + "epoch": 0.47184768615759876, + "grad_norm": 7.903263092041016, + "learning_rate": 9.149428434526663e-05, + "loss": 1.0214, + "step": 6964 + }, + { + "epoch": 0.47191544142557085, + "grad_norm": 7.510760307312012, + "learning_rate": 9.149291532616881e-05, + "loss": 0.9942, + "step": 6965 + }, + { + "epoch": 0.47198319669354294, + "grad_norm": 5.814423084259033, + "learning_rate": 9.149154630707099e-05, + "loss": 0.7677, + "step": 6966 + }, + { + "epoch": 0.47205095196151503, + "grad_norm": 8.29617977142334, + "learning_rate": 9.149017728797317e-05, + "loss": 0.8268, + "step": 6967 + }, + { + "epoch": 0.47211870722948707, + "grad_norm": 7.393543720245361, + "learning_rate": 9.148880826887536e-05, + "loss": 0.9067, + "step": 6968 + }, + { + "epoch": 0.47218646249745916, + "grad_norm": 7.611250400543213, + "learning_rate": 9.148743924977754e-05, + "loss": 0.8908, + "step": 6969 + }, + { + "epoch": 0.47225421776543125, + "grad_norm": 9.330535888671875, + "learning_rate": 9.148607023067972e-05, + "loss": 0.6368, + "step": 6970 + }, + { + "epoch": 0.47232197303340334, + "grad_norm": 7.130900859832764, + "learning_rate": 9.14847012115819e-05, + "loss": 0.8157, + "step": 6971 + }, + { + "epoch": 0.47238972830137543, + "grad_norm": 6.899352550506592, + "learning_rate": 9.14833321924841e-05, + "loss": 0.7058, + "step": 6972 + }, + { + "epoch": 0.4724574835693475, + "grad_norm": 7.850022792816162, + "learning_rate": 9.148196317338628e-05, + "loss": 0.9622, + "step": 6973 + }, + { + "epoch": 0.4725252388373196, + "grad_norm": 8.331214904785156, + "learning_rate": 9.148059415428846e-05, + "loss": 0.9366, + "step": 6974 + }, + { + "epoch": 0.4725929941052917, + "grad_norm": 6.377279758453369, + "learning_rate": 9.147922513519064e-05, + "loss": 0.8701, + "step": 6975 + }, + { + "epoch": 0.47266074937326374, + "grad_norm": 6.650668621063232, + "learning_rate": 9.147785611609282e-05, + "loss": 0.8066, + "step": 6976 + }, + { + "epoch": 0.47272850464123584, + "grad_norm": 7.406231880187988, + "learning_rate": 9.147648709699501e-05, + "loss": 0.8833, + "step": 6977 + }, + { + "epoch": 0.4727962599092079, + "grad_norm": 6.31017541885376, + "learning_rate": 9.147511807789719e-05, + "loss": 0.8001, + "step": 6978 + }, + { + "epoch": 0.47286401517718, + "grad_norm": 6.4131927490234375, + "learning_rate": 9.147374905879937e-05, + "loss": 0.77, + "step": 6979 + }, + { + "epoch": 0.4729317704451521, + "grad_norm": 9.2667236328125, + "learning_rate": 9.147238003970155e-05, + "loss": 0.9891, + "step": 6980 + }, + { + "epoch": 0.4729995257131242, + "grad_norm": 7.107274055480957, + "learning_rate": 9.147101102060373e-05, + "loss": 1.0087, + "step": 6981 + }, + { + "epoch": 0.4730672809810963, + "grad_norm": 8.102210998535156, + "learning_rate": 9.146964200150593e-05, + "loss": 0.7827, + "step": 6982 + }, + { + "epoch": 0.4731350362490684, + "grad_norm": 6.7619099617004395, + "learning_rate": 9.146827298240811e-05, + "loss": 0.7432, + "step": 6983 + }, + { + "epoch": 0.4732027915170405, + "grad_norm": 7.551081657409668, + "learning_rate": 9.146690396331029e-05, + "loss": 0.6776, + "step": 6984 + }, + { + "epoch": 0.4732705467850125, + "grad_norm": 6.923361301422119, + "learning_rate": 9.146553494421247e-05, + "loss": 0.962, + "step": 6985 + }, + { + "epoch": 0.4733383020529846, + "grad_norm": 6.971046447753906, + "learning_rate": 9.146416592511466e-05, + "loss": 0.9481, + "step": 6986 + }, + { + "epoch": 0.4734060573209567, + "grad_norm": 7.046440601348877, + "learning_rate": 9.146279690601684e-05, + "loss": 1.0244, + "step": 6987 + }, + { + "epoch": 0.4734738125889288, + "grad_norm": 6.281298637390137, + "learning_rate": 9.146142788691902e-05, + "loss": 0.8104, + "step": 6988 + }, + { + "epoch": 0.4735415678569009, + "grad_norm": 10.22514820098877, + "learning_rate": 9.14600588678212e-05, + "loss": 1.1109, + "step": 6989 + }, + { + "epoch": 0.47360932312487297, + "grad_norm": 6.466033458709717, + "learning_rate": 9.145868984872338e-05, + "loss": 0.9219, + "step": 6990 + }, + { + "epoch": 0.47367707839284506, + "grad_norm": 7.837368011474609, + "learning_rate": 9.145732082962558e-05, + "loss": 1.0033, + "step": 6991 + }, + { + "epoch": 0.47374483366081716, + "grad_norm": 7.501054286956787, + "learning_rate": 9.145595181052776e-05, + "loss": 0.919, + "step": 6992 + }, + { + "epoch": 0.4738125889287892, + "grad_norm": 6.243696212768555, + "learning_rate": 9.145458279142994e-05, + "loss": 0.7736, + "step": 6993 + }, + { + "epoch": 0.4738803441967613, + "grad_norm": 7.519147872924805, + "learning_rate": 9.145321377233212e-05, + "loss": 0.8014, + "step": 6994 + }, + { + "epoch": 0.4739480994647334, + "grad_norm": 5.793912887573242, + "learning_rate": 9.145184475323431e-05, + "loss": 0.7508, + "step": 6995 + }, + { + "epoch": 0.47401585473270547, + "grad_norm": 7.050177097320557, + "learning_rate": 9.145047573413649e-05, + "loss": 0.8174, + "step": 6996 + }, + { + "epoch": 0.47408361000067756, + "grad_norm": 5.979698657989502, + "learning_rate": 9.144910671503867e-05, + "loss": 0.9812, + "step": 6997 + }, + { + "epoch": 0.47415136526864965, + "grad_norm": 7.254084587097168, + "learning_rate": 9.144773769594087e-05, + "loss": 0.8015, + "step": 6998 + }, + { + "epoch": 0.47421912053662174, + "grad_norm": 8.168086051940918, + "learning_rate": 9.144636867684305e-05, + "loss": 0.8107, + "step": 6999 + }, + { + "epoch": 0.47428687580459383, + "grad_norm": 6.693539619445801, + "learning_rate": 9.144499965774523e-05, + "loss": 0.7048, + "step": 7000 + }, + { + "epoch": 0.47435463107256587, + "grad_norm": 7.216420650482178, + "learning_rate": 9.144363063864742e-05, + "loss": 0.7434, + "step": 7001 + }, + { + "epoch": 0.47442238634053796, + "grad_norm": 8.014084815979004, + "learning_rate": 9.14422616195496e-05, + "loss": 1.1213, + "step": 7002 + }, + { + "epoch": 0.47449014160851005, + "grad_norm": 6.1906938552856445, + "learning_rate": 9.144089260045178e-05, + "loss": 0.8188, + "step": 7003 + }, + { + "epoch": 0.47455789687648214, + "grad_norm": 8.70464038848877, + "learning_rate": 9.143952358135398e-05, + "loss": 1.0273, + "step": 7004 + }, + { + "epoch": 0.47462565214445424, + "grad_norm": 5.303441524505615, + "learning_rate": 9.143815456225616e-05, + "loss": 0.6782, + "step": 7005 + }, + { + "epoch": 0.4746934074124263, + "grad_norm": 8.176512718200684, + "learning_rate": 9.143678554315834e-05, + "loss": 0.7853, + "step": 7006 + }, + { + "epoch": 0.4747611626803984, + "grad_norm": 6.074409008026123, + "learning_rate": 9.143541652406052e-05, + "loss": 0.9088, + "step": 7007 + }, + { + "epoch": 0.4748289179483705, + "grad_norm": 7.32485294342041, + "learning_rate": 9.14340475049627e-05, + "loss": 1.0558, + "step": 7008 + }, + { + "epoch": 0.47489667321634255, + "grad_norm": 5.4144463539123535, + "learning_rate": 9.143267848586489e-05, + "loss": 0.9064, + "step": 7009 + }, + { + "epoch": 0.47496442848431464, + "grad_norm": 7.717291355133057, + "learning_rate": 9.143130946676707e-05, + "loss": 0.9129, + "step": 7010 + }, + { + "epoch": 0.47503218375228673, + "grad_norm": 7.902177333831787, + "learning_rate": 9.142994044766925e-05, + "loss": 1.1144, + "step": 7011 + }, + { + "epoch": 0.4750999390202588, + "grad_norm": 6.94300651550293, + "learning_rate": 9.142857142857143e-05, + "loss": 0.7982, + "step": 7012 + }, + { + "epoch": 0.4751676942882309, + "grad_norm": 7.08957052230835, + "learning_rate": 9.142720240947361e-05, + "loss": 1.009, + "step": 7013 + }, + { + "epoch": 0.475235449556203, + "grad_norm": 5.148087024688721, + "learning_rate": 9.14258333903758e-05, + "loss": 0.7443, + "step": 7014 + }, + { + "epoch": 0.4753032048241751, + "grad_norm": 6.155871868133545, + "learning_rate": 9.142446437127799e-05, + "loss": 0.8135, + "step": 7015 + }, + { + "epoch": 0.4753709600921472, + "grad_norm": 6.569172382354736, + "learning_rate": 9.142309535218017e-05, + "loss": 0.7684, + "step": 7016 + }, + { + "epoch": 0.4754387153601192, + "grad_norm": 9.110980033874512, + "learning_rate": 9.142172633308235e-05, + "loss": 1.0382, + "step": 7017 + }, + { + "epoch": 0.4755064706280913, + "grad_norm": 6.317762851715088, + "learning_rate": 9.142035731398454e-05, + "loss": 0.7206, + "step": 7018 + }, + { + "epoch": 0.4755742258960634, + "grad_norm": 7.054732799530029, + "learning_rate": 9.141898829488672e-05, + "loss": 0.7442, + "step": 7019 + }, + { + "epoch": 0.4756419811640355, + "grad_norm": 8.904619216918945, + "learning_rate": 9.14176192757889e-05, + "loss": 1.1686, + "step": 7020 + }, + { + "epoch": 0.4757097364320076, + "grad_norm": 7.206265926361084, + "learning_rate": 9.141625025669108e-05, + "loss": 0.8675, + "step": 7021 + }, + { + "epoch": 0.4757774916999797, + "grad_norm": 7.7408318519592285, + "learning_rate": 9.141488123759326e-05, + "loss": 0.9132, + "step": 7022 + }, + { + "epoch": 0.4758452469679518, + "grad_norm": 5.807632923126221, + "learning_rate": 9.141351221849546e-05, + "loss": 0.8356, + "step": 7023 + }, + { + "epoch": 0.47591300223592387, + "grad_norm": 8.07009506225586, + "learning_rate": 9.141214319939764e-05, + "loss": 1.1522, + "step": 7024 + }, + { + "epoch": 0.4759807575038959, + "grad_norm": 7.409401893615723, + "learning_rate": 9.141077418029982e-05, + "loss": 0.7972, + "step": 7025 + }, + { + "epoch": 0.476048512771868, + "grad_norm": 7.14201545715332, + "learning_rate": 9.1409405161202e-05, + "loss": 0.9284, + "step": 7026 + }, + { + "epoch": 0.4761162680398401, + "grad_norm": 6.279862880706787, + "learning_rate": 9.140803614210419e-05, + "loss": 0.7995, + "step": 7027 + }, + { + "epoch": 0.4761840233078122, + "grad_norm": 6.493180274963379, + "learning_rate": 9.140666712300637e-05, + "loss": 0.8848, + "step": 7028 + }, + { + "epoch": 0.47625177857578427, + "grad_norm": 8.528377532958984, + "learning_rate": 9.140529810390855e-05, + "loss": 0.9142, + "step": 7029 + }, + { + "epoch": 0.47631953384375636, + "grad_norm": 6.636556625366211, + "learning_rate": 9.140392908481073e-05, + "loss": 0.6841, + "step": 7030 + }, + { + "epoch": 0.47638728911172845, + "grad_norm": 7.340085983276367, + "learning_rate": 9.140256006571291e-05, + "loss": 1.0198, + "step": 7031 + }, + { + "epoch": 0.47645504437970054, + "grad_norm": 7.510453224182129, + "learning_rate": 9.14011910466151e-05, + "loss": 0.8581, + "step": 7032 + }, + { + "epoch": 0.4765227996476726, + "grad_norm": 5.910886287689209, + "learning_rate": 9.139982202751729e-05, + "loss": 0.7947, + "step": 7033 + }, + { + "epoch": 0.47659055491564467, + "grad_norm": 6.257016181945801, + "learning_rate": 9.139845300841947e-05, + "loss": 0.8759, + "step": 7034 + }, + { + "epoch": 0.47665831018361676, + "grad_norm": 7.840462684631348, + "learning_rate": 9.139708398932165e-05, + "loss": 1.1975, + "step": 7035 + }, + { + "epoch": 0.47672606545158885, + "grad_norm": 9.107316970825195, + "learning_rate": 9.139571497022383e-05, + "loss": 0.8321, + "step": 7036 + }, + { + "epoch": 0.47679382071956095, + "grad_norm": 5.630661964416504, + "learning_rate": 9.139434595112602e-05, + "loss": 0.7302, + "step": 7037 + }, + { + "epoch": 0.47686157598753304, + "grad_norm": 6.390323162078857, + "learning_rate": 9.13929769320282e-05, + "loss": 0.862, + "step": 7038 + }, + { + "epoch": 0.47692933125550513, + "grad_norm": 8.85464096069336, + "learning_rate": 9.139160791293038e-05, + "loss": 0.9283, + "step": 7039 + }, + { + "epoch": 0.4769970865234772, + "grad_norm": 6.312126159667969, + "learning_rate": 9.139023889383256e-05, + "loss": 0.933, + "step": 7040 + }, + { + "epoch": 0.4770648417914493, + "grad_norm": 6.027670383453369, + "learning_rate": 9.138886987473476e-05, + "loss": 0.8221, + "step": 7041 + }, + { + "epoch": 0.47713259705942135, + "grad_norm": 8.296350479125977, + "learning_rate": 9.138750085563694e-05, + "loss": 0.8621, + "step": 7042 + }, + { + "epoch": 0.47720035232739344, + "grad_norm": 8.337299346923828, + "learning_rate": 9.138613183653912e-05, + "loss": 0.9126, + "step": 7043 + }, + { + "epoch": 0.47726810759536553, + "grad_norm": 7.545529842376709, + "learning_rate": 9.138476281744131e-05, + "loss": 1.1327, + "step": 7044 + }, + { + "epoch": 0.4773358628633376, + "grad_norm": 5.35225248336792, + "learning_rate": 9.138339379834349e-05, + "loss": 0.7311, + "step": 7045 + }, + { + "epoch": 0.4774036181313097, + "grad_norm": 7.592402458190918, + "learning_rate": 9.138202477924567e-05, + "loss": 0.9959, + "step": 7046 + }, + { + "epoch": 0.4774713733992818, + "grad_norm": 8.010600090026855, + "learning_rate": 9.138065576014787e-05, + "loss": 0.7788, + "step": 7047 + }, + { + "epoch": 0.4775391286672539, + "grad_norm": 7.170941352844238, + "learning_rate": 9.137928674105005e-05, + "loss": 0.9218, + "step": 7048 + }, + { + "epoch": 0.477606883935226, + "grad_norm": 6.852916240692139, + "learning_rate": 9.137791772195223e-05, + "loss": 0.7614, + "step": 7049 + }, + { + "epoch": 0.477674639203198, + "grad_norm": 8.727351188659668, + "learning_rate": 9.137654870285442e-05, + "loss": 0.8449, + "step": 7050 + }, + { + "epoch": 0.4777423944711701, + "grad_norm": 6.188234329223633, + "learning_rate": 9.13751796837566e-05, + "loss": 0.7001, + "step": 7051 + }, + { + "epoch": 0.4778101497391422, + "grad_norm": 6.4075398445129395, + "learning_rate": 9.137381066465878e-05, + "loss": 0.9363, + "step": 7052 + }, + { + "epoch": 0.4778779050071143, + "grad_norm": 7.744530200958252, + "learning_rate": 9.137244164556096e-05, + "loss": 0.9253, + "step": 7053 + }, + { + "epoch": 0.4779456602750864, + "grad_norm": 6.745645999908447, + "learning_rate": 9.137107262646314e-05, + "loss": 0.9649, + "step": 7054 + }, + { + "epoch": 0.4780134155430585, + "grad_norm": 8.560202598571777, + "learning_rate": 9.136970360736534e-05, + "loss": 1.2453, + "step": 7055 + }, + { + "epoch": 0.4780811708110306, + "grad_norm": 6.941448211669922, + "learning_rate": 9.136833458826752e-05, + "loss": 0.8577, + "step": 7056 + }, + { + "epoch": 0.47814892607900267, + "grad_norm": 6.4724626541137695, + "learning_rate": 9.13669655691697e-05, + "loss": 0.9304, + "step": 7057 + }, + { + "epoch": 0.4782166813469747, + "grad_norm": 6.535644054412842, + "learning_rate": 9.136559655007188e-05, + "loss": 0.8323, + "step": 7058 + }, + { + "epoch": 0.4782844366149468, + "grad_norm": 6.4093217849731445, + "learning_rate": 9.136422753097407e-05, + "loss": 0.9816, + "step": 7059 + }, + { + "epoch": 0.4783521918829189, + "grad_norm": 6.645406246185303, + "learning_rate": 9.136285851187625e-05, + "loss": 0.9507, + "step": 7060 + }, + { + "epoch": 0.478419947150891, + "grad_norm": 8.383099555969238, + "learning_rate": 9.136148949277843e-05, + "loss": 1.0339, + "step": 7061 + }, + { + "epoch": 0.47848770241886307, + "grad_norm": 6.430543422698975, + "learning_rate": 9.136012047368061e-05, + "loss": 0.755, + "step": 7062 + }, + { + "epoch": 0.47855545768683516, + "grad_norm": 6.5899176597595215, + "learning_rate": 9.135875145458279e-05, + "loss": 1.0841, + "step": 7063 + }, + { + "epoch": 0.47862321295480725, + "grad_norm": 9.340789794921875, + "learning_rate": 9.135738243548499e-05, + "loss": 0.7539, + "step": 7064 + }, + { + "epoch": 0.47869096822277934, + "grad_norm": 7.300266742706299, + "learning_rate": 9.135601341638717e-05, + "loss": 0.9049, + "step": 7065 + }, + { + "epoch": 0.4787587234907514, + "grad_norm": 6.495701313018799, + "learning_rate": 9.135464439728935e-05, + "loss": 0.9168, + "step": 7066 + }, + { + "epoch": 0.4788264787587235, + "grad_norm": 8.14792537689209, + "learning_rate": 9.135327537819153e-05, + "loss": 0.7678, + "step": 7067 + }, + { + "epoch": 0.47889423402669556, + "grad_norm": 5.638490200042725, + "learning_rate": 9.135190635909371e-05, + "loss": 0.8914, + "step": 7068 + }, + { + "epoch": 0.47896198929466766, + "grad_norm": 7.043061256408691, + "learning_rate": 9.13505373399959e-05, + "loss": 0.9575, + "step": 7069 + }, + { + "epoch": 0.47902974456263975, + "grad_norm": 7.867429256439209, + "learning_rate": 9.134916832089808e-05, + "loss": 0.9887, + "step": 7070 + }, + { + "epoch": 0.47909749983061184, + "grad_norm": 5.737957000732422, + "learning_rate": 9.134779930180026e-05, + "loss": 0.8605, + "step": 7071 + }, + { + "epoch": 0.47916525509858393, + "grad_norm": 9.436585426330566, + "learning_rate": 9.134643028270244e-05, + "loss": 1.0482, + "step": 7072 + }, + { + "epoch": 0.479233010366556, + "grad_norm": 6.28998327255249, + "learning_rate": 9.134506126360464e-05, + "loss": 1.0235, + "step": 7073 + }, + { + "epoch": 0.47930076563452806, + "grad_norm": 5.121227741241455, + "learning_rate": 9.134369224450682e-05, + "loss": 0.8735, + "step": 7074 + }, + { + "epoch": 0.47936852090250015, + "grad_norm": 5.818413734436035, + "learning_rate": 9.1342323225409e-05, + "loss": 0.738, + "step": 7075 + }, + { + "epoch": 0.47943627617047224, + "grad_norm": 6.716359615325928, + "learning_rate": 9.134095420631118e-05, + "loss": 0.8825, + "step": 7076 + }, + { + "epoch": 0.47950403143844433, + "grad_norm": 5.944983959197998, + "learning_rate": 9.133958518721336e-05, + "loss": 0.9145, + "step": 7077 + }, + { + "epoch": 0.4795717867064164, + "grad_norm": 6.65972375869751, + "learning_rate": 9.133821616811555e-05, + "loss": 0.7746, + "step": 7078 + }, + { + "epoch": 0.4796395419743885, + "grad_norm": 7.595485210418701, + "learning_rate": 9.133684714901773e-05, + "loss": 0.817, + "step": 7079 + }, + { + "epoch": 0.4797072972423606, + "grad_norm": 5.7008280754089355, + "learning_rate": 9.133547812991991e-05, + "loss": 0.7996, + "step": 7080 + }, + { + "epoch": 0.4797750525103327, + "grad_norm": 6.7002668380737305, + "learning_rate": 9.133410911082209e-05, + "loss": 1.0676, + "step": 7081 + }, + { + "epoch": 0.47984280777830474, + "grad_norm": 8.43227481842041, + "learning_rate": 9.133274009172429e-05, + "loss": 0.9844, + "step": 7082 + }, + { + "epoch": 0.47991056304627683, + "grad_norm": 5.9677653312683105, + "learning_rate": 9.133137107262647e-05, + "loss": 0.9158, + "step": 7083 + }, + { + "epoch": 0.4799783183142489, + "grad_norm": 7.774659156799316, + "learning_rate": 9.133000205352865e-05, + "loss": 0.6829, + "step": 7084 + }, + { + "epoch": 0.480046073582221, + "grad_norm": 7.344854354858398, + "learning_rate": 9.132863303443083e-05, + "loss": 0.8579, + "step": 7085 + }, + { + "epoch": 0.4801138288501931, + "grad_norm": 6.4308624267578125, + "learning_rate": 9.132726401533301e-05, + "loss": 0.936, + "step": 7086 + }, + { + "epoch": 0.4801815841181652, + "grad_norm": 7.600352764129639, + "learning_rate": 9.13258949962352e-05, + "loss": 0.8561, + "step": 7087 + }, + { + "epoch": 0.4802493393861373, + "grad_norm": 9.475528717041016, + "learning_rate": 9.132452597713738e-05, + "loss": 0.9684, + "step": 7088 + }, + { + "epoch": 0.4803170946541094, + "grad_norm": 5.927639007568359, + "learning_rate": 9.132315695803956e-05, + "loss": 0.7472, + "step": 7089 + }, + { + "epoch": 0.48038484992208147, + "grad_norm": 7.304515838623047, + "learning_rate": 9.132178793894176e-05, + "loss": 0.8983, + "step": 7090 + }, + { + "epoch": 0.4804526051900535, + "grad_norm": 7.6303391456604, + "learning_rate": 9.132041891984394e-05, + "loss": 0.864, + "step": 7091 + }, + { + "epoch": 0.4805203604580256, + "grad_norm": 6.243314266204834, + "learning_rate": 9.131904990074612e-05, + "loss": 0.8448, + "step": 7092 + }, + { + "epoch": 0.4805881157259977, + "grad_norm": 6.947970867156982, + "learning_rate": 9.131768088164831e-05, + "loss": 0.7853, + "step": 7093 + }, + { + "epoch": 0.4806558709939698, + "grad_norm": 5.687379837036133, + "learning_rate": 9.131631186255049e-05, + "loss": 0.838, + "step": 7094 + }, + { + "epoch": 0.48072362626194187, + "grad_norm": 6.477471351623535, + "learning_rate": 9.131494284345267e-05, + "loss": 0.9849, + "step": 7095 + }, + { + "epoch": 0.48079138152991396, + "grad_norm": 6.526174068450928, + "learning_rate": 9.131357382435487e-05, + "loss": 0.9269, + "step": 7096 + }, + { + "epoch": 0.48085913679788606, + "grad_norm": 7.404792308807373, + "learning_rate": 9.131220480525705e-05, + "loss": 0.8257, + "step": 7097 + }, + { + "epoch": 0.48092689206585815, + "grad_norm": 7.290988445281982, + "learning_rate": 9.131083578615923e-05, + "loss": 1.0275, + "step": 7098 + }, + { + "epoch": 0.4809946473338302, + "grad_norm": 7.744154930114746, + "learning_rate": 9.13094667670614e-05, + "loss": 0.9692, + "step": 7099 + }, + { + "epoch": 0.4810624026018023, + "grad_norm": 5.7665252685546875, + "learning_rate": 9.130809774796359e-05, + "loss": 0.6751, + "step": 7100 + }, + { + "epoch": 0.48113015786977437, + "grad_norm": 6.921631813049316, + "learning_rate": 9.130672872886578e-05, + "loss": 0.8771, + "step": 7101 + }, + { + "epoch": 0.48119791313774646, + "grad_norm": 6.097098350524902, + "learning_rate": 9.130535970976796e-05, + "loss": 0.737, + "step": 7102 + }, + { + "epoch": 0.48126566840571855, + "grad_norm": 7.192615032196045, + "learning_rate": 9.130399069067014e-05, + "loss": 0.9283, + "step": 7103 + }, + { + "epoch": 0.48133342367369064, + "grad_norm": 6.464249610900879, + "learning_rate": 9.130262167157232e-05, + "loss": 1.0467, + "step": 7104 + }, + { + "epoch": 0.48140117894166273, + "grad_norm": 4.616031646728516, + "learning_rate": 9.130125265247452e-05, + "loss": 0.5657, + "step": 7105 + }, + { + "epoch": 0.4814689342096348, + "grad_norm": 6.356307029724121, + "learning_rate": 9.12998836333767e-05, + "loss": 0.722, + "step": 7106 + }, + { + "epoch": 0.48153668947760686, + "grad_norm": 6.398674011230469, + "learning_rate": 9.129851461427888e-05, + "loss": 1.066, + "step": 7107 + }, + { + "epoch": 0.48160444474557895, + "grad_norm": 7.015667915344238, + "learning_rate": 9.129714559518106e-05, + "loss": 0.8681, + "step": 7108 + }, + { + "epoch": 0.48167220001355104, + "grad_norm": 8.212510108947754, + "learning_rate": 9.129577657608324e-05, + "loss": 0.8158, + "step": 7109 + }, + { + "epoch": 0.48173995528152314, + "grad_norm": 8.409659385681152, + "learning_rate": 9.129440755698543e-05, + "loss": 0.7391, + "step": 7110 + }, + { + "epoch": 0.4818077105494952, + "grad_norm": 7.42771053314209, + "learning_rate": 9.129303853788761e-05, + "loss": 0.9876, + "step": 7111 + }, + { + "epoch": 0.4818754658174673, + "grad_norm": 7.816039562225342, + "learning_rate": 9.129166951878979e-05, + "loss": 0.9601, + "step": 7112 + }, + { + "epoch": 0.4819432210854394, + "grad_norm": 8.761859893798828, + "learning_rate": 9.129030049969197e-05, + "loss": 0.6283, + "step": 7113 + }, + { + "epoch": 0.4820109763534115, + "grad_norm": 6.981362342834473, + "learning_rate": 9.128893148059415e-05, + "loss": 0.9901, + "step": 7114 + }, + { + "epoch": 0.48207873162138354, + "grad_norm": 7.718024253845215, + "learning_rate": 9.128756246149635e-05, + "loss": 0.8318, + "step": 7115 + }, + { + "epoch": 0.48214648688935563, + "grad_norm": 8.191770553588867, + "learning_rate": 9.128619344239853e-05, + "loss": 0.9108, + "step": 7116 + }, + { + "epoch": 0.4822142421573277, + "grad_norm": 8.095576286315918, + "learning_rate": 9.12848244233007e-05, + "loss": 0.8404, + "step": 7117 + }, + { + "epoch": 0.4822819974252998, + "grad_norm": 6.66763162612915, + "learning_rate": 9.128345540420289e-05, + "loss": 0.6538, + "step": 7118 + }, + { + "epoch": 0.4823497526932719, + "grad_norm": 7.253853797912598, + "learning_rate": 9.128208638510508e-05, + "loss": 0.8893, + "step": 7119 + }, + { + "epoch": 0.482417507961244, + "grad_norm": 8.162705421447754, + "learning_rate": 9.128071736600726e-05, + "loss": 0.9602, + "step": 7120 + }, + { + "epoch": 0.4824852632292161, + "grad_norm": 7.548867225646973, + "learning_rate": 9.127934834690944e-05, + "loss": 0.8701, + "step": 7121 + }, + { + "epoch": 0.4825530184971882, + "grad_norm": 5.984834671020508, + "learning_rate": 9.127797932781162e-05, + "loss": 1.0416, + "step": 7122 + }, + { + "epoch": 0.4826207737651602, + "grad_norm": 8.716156005859375, + "learning_rate": 9.12766103087138e-05, + "loss": 1.016, + "step": 7123 + }, + { + "epoch": 0.4826885290331323, + "grad_norm": 6.2188873291015625, + "learning_rate": 9.1275241289616e-05, + "loss": 0.7147, + "step": 7124 + }, + { + "epoch": 0.4827562843011044, + "grad_norm": 10.026150703430176, + "learning_rate": 9.127387227051818e-05, + "loss": 0.9221, + "step": 7125 + }, + { + "epoch": 0.4828240395690765, + "grad_norm": 7.320329666137695, + "learning_rate": 9.127250325142036e-05, + "loss": 0.8486, + "step": 7126 + }, + { + "epoch": 0.4828917948370486, + "grad_norm": 6.134348392486572, + "learning_rate": 9.127113423232254e-05, + "loss": 0.7383, + "step": 7127 + }, + { + "epoch": 0.4829595501050207, + "grad_norm": 7.2880778312683105, + "learning_rate": 9.126976521322473e-05, + "loss": 1.059, + "step": 7128 + }, + { + "epoch": 0.48302730537299277, + "grad_norm": 6.100679874420166, + "learning_rate": 9.126839619412691e-05, + "loss": 0.8673, + "step": 7129 + }, + { + "epoch": 0.48309506064096486, + "grad_norm": 6.899023056030273, + "learning_rate": 9.126702717502909e-05, + "loss": 1.1572, + "step": 7130 + }, + { + "epoch": 0.4831628159089369, + "grad_norm": 6.187694549560547, + "learning_rate": 9.126565815593127e-05, + "loss": 0.9584, + "step": 7131 + }, + { + "epoch": 0.483230571176909, + "grad_norm": 7.093903064727783, + "learning_rate": 9.126428913683345e-05, + "loss": 0.969, + "step": 7132 + }, + { + "epoch": 0.4832983264448811, + "grad_norm": 6.003389835357666, + "learning_rate": 9.126292011773565e-05, + "loss": 0.7638, + "step": 7133 + }, + { + "epoch": 0.48336608171285317, + "grad_norm": 5.557130336761475, + "learning_rate": 9.126155109863783e-05, + "loss": 0.8529, + "step": 7134 + }, + { + "epoch": 0.48343383698082526, + "grad_norm": 6.029399871826172, + "learning_rate": 9.126018207954001e-05, + "loss": 0.6359, + "step": 7135 + }, + { + "epoch": 0.48350159224879735, + "grad_norm": 6.123723030090332, + "learning_rate": 9.12588130604422e-05, + "loss": 0.9462, + "step": 7136 + }, + { + "epoch": 0.48356934751676944, + "grad_norm": 6.213245868682861, + "learning_rate": 9.125744404134438e-05, + "loss": 0.8322, + "step": 7137 + }, + { + "epoch": 0.48363710278474153, + "grad_norm": 7.509876251220703, + "learning_rate": 9.125607502224656e-05, + "loss": 1.0556, + "step": 7138 + }, + { + "epoch": 0.48370485805271357, + "grad_norm": 6.355532646179199, + "learning_rate": 9.125470600314876e-05, + "loss": 0.9281, + "step": 7139 + }, + { + "epoch": 0.48377261332068566, + "grad_norm": 7.302781105041504, + "learning_rate": 9.125333698405094e-05, + "loss": 0.846, + "step": 7140 + }, + { + "epoch": 0.48384036858865775, + "grad_norm": 7.081716537475586, + "learning_rate": 9.125196796495312e-05, + "loss": 0.8571, + "step": 7141 + }, + { + "epoch": 0.48390812385662985, + "grad_norm": 7.652805328369141, + "learning_rate": 9.125059894585531e-05, + "loss": 0.7986, + "step": 7142 + }, + { + "epoch": 0.48397587912460194, + "grad_norm": 5.906263828277588, + "learning_rate": 9.124922992675749e-05, + "loss": 0.8058, + "step": 7143 + }, + { + "epoch": 0.48404363439257403, + "grad_norm": 7.398087024688721, + "learning_rate": 9.124786090765967e-05, + "loss": 0.8961, + "step": 7144 + }, + { + "epoch": 0.4841113896605461, + "grad_norm": 9.616337776184082, + "learning_rate": 9.124649188856185e-05, + "loss": 0.9403, + "step": 7145 + }, + { + "epoch": 0.4841791449285182, + "grad_norm": 7.74692440032959, + "learning_rate": 9.124512286946403e-05, + "loss": 0.7659, + "step": 7146 + }, + { + "epoch": 0.4842469001964903, + "grad_norm": 6.067378520965576, + "learning_rate": 9.124375385036623e-05, + "loss": 0.8094, + "step": 7147 + }, + { + "epoch": 0.48431465546446234, + "grad_norm": 7.943274974822998, + "learning_rate": 9.12423848312684e-05, + "loss": 0.8103, + "step": 7148 + }, + { + "epoch": 0.48438241073243443, + "grad_norm": 7.710971355438232, + "learning_rate": 9.124101581217059e-05, + "loss": 1.0818, + "step": 7149 + }, + { + "epoch": 0.4844501660004065, + "grad_norm": 6.904791831970215, + "learning_rate": 9.123964679307277e-05, + "loss": 0.9413, + "step": 7150 + }, + { + "epoch": 0.4845179212683786, + "grad_norm": 8.46650505065918, + "learning_rate": 9.123827777397496e-05, + "loss": 1.1661, + "step": 7151 + }, + { + "epoch": 0.4845856765363507, + "grad_norm": 7.021801471710205, + "learning_rate": 9.123690875487714e-05, + "loss": 0.7305, + "step": 7152 + }, + { + "epoch": 0.4846534318043228, + "grad_norm": 7.553677082061768, + "learning_rate": 9.123553973577932e-05, + "loss": 0.9806, + "step": 7153 + }, + { + "epoch": 0.4847211870722949, + "grad_norm": 6.204870700836182, + "learning_rate": 9.12341707166815e-05, + "loss": 0.8433, + "step": 7154 + }, + { + "epoch": 0.484788942340267, + "grad_norm": 7.81880521774292, + "learning_rate": 9.123280169758368e-05, + "loss": 1.112, + "step": 7155 + }, + { + "epoch": 0.484856697608239, + "grad_norm": 7.029433727264404, + "learning_rate": 9.123143267848588e-05, + "loss": 0.8068, + "step": 7156 + }, + { + "epoch": 0.4849244528762111, + "grad_norm": 6.795009136199951, + "learning_rate": 9.123006365938806e-05, + "loss": 0.766, + "step": 7157 + }, + { + "epoch": 0.4849922081441832, + "grad_norm": 5.924415588378906, + "learning_rate": 9.122869464029024e-05, + "loss": 0.7914, + "step": 7158 + }, + { + "epoch": 0.4850599634121553, + "grad_norm": 5.561947345733643, + "learning_rate": 9.122732562119242e-05, + "loss": 1.1486, + "step": 7159 + }, + { + "epoch": 0.4851277186801274, + "grad_norm": 7.202826976776123, + "learning_rate": 9.122595660209461e-05, + "loss": 0.8802, + "step": 7160 + }, + { + "epoch": 0.4851954739480995, + "grad_norm": 6.435755729675293, + "learning_rate": 9.122458758299679e-05, + "loss": 0.888, + "step": 7161 + }, + { + "epoch": 0.48526322921607157, + "grad_norm": 6.197578430175781, + "learning_rate": 9.122321856389897e-05, + "loss": 0.9139, + "step": 7162 + }, + { + "epoch": 0.48533098448404366, + "grad_norm": 6.596435070037842, + "learning_rate": 9.122184954480115e-05, + "loss": 0.7802, + "step": 7163 + }, + { + "epoch": 0.4853987397520157, + "grad_norm": 7.858447551727295, + "learning_rate": 9.122048052570333e-05, + "loss": 0.8759, + "step": 7164 + }, + { + "epoch": 0.4854664950199878, + "grad_norm": 6.002086162567139, + "learning_rate": 9.121911150660553e-05, + "loss": 0.9621, + "step": 7165 + }, + { + "epoch": 0.4855342502879599, + "grad_norm": 5.917041301727295, + "learning_rate": 9.12177424875077e-05, + "loss": 0.8144, + "step": 7166 + }, + { + "epoch": 0.48560200555593197, + "grad_norm": 6.202271461486816, + "learning_rate": 9.121637346840989e-05, + "loss": 0.9937, + "step": 7167 + }, + { + "epoch": 0.48566976082390406, + "grad_norm": 6.635425567626953, + "learning_rate": 9.121500444931207e-05, + "loss": 0.8103, + "step": 7168 + }, + { + "epoch": 0.48573751609187615, + "grad_norm": 7.288759708404541, + "learning_rate": 9.121363543021425e-05, + "loss": 0.8354, + "step": 7169 + }, + { + "epoch": 0.48580527135984825, + "grad_norm": 6.015995979309082, + "learning_rate": 9.121226641111644e-05, + "loss": 1.0732, + "step": 7170 + }, + { + "epoch": 0.48587302662782034, + "grad_norm": 6.693684101104736, + "learning_rate": 9.121089739201862e-05, + "loss": 0.9792, + "step": 7171 + }, + { + "epoch": 0.4859407818957924, + "grad_norm": 7.2573561668396, + "learning_rate": 9.12095283729208e-05, + "loss": 0.8823, + "step": 7172 + }, + { + "epoch": 0.48600853716376446, + "grad_norm": 6.1881585121154785, + "learning_rate": 9.120815935382298e-05, + "loss": 0.8397, + "step": 7173 + }, + { + "epoch": 0.48607629243173656, + "grad_norm": 7.354151725769043, + "learning_rate": 9.120679033472518e-05, + "loss": 1.0145, + "step": 7174 + }, + { + "epoch": 0.48614404769970865, + "grad_norm": 6.806859016418457, + "learning_rate": 9.120542131562736e-05, + "loss": 0.9289, + "step": 7175 + }, + { + "epoch": 0.48621180296768074, + "grad_norm": 6.098382949829102, + "learning_rate": 9.120405229652954e-05, + "loss": 0.9819, + "step": 7176 + }, + { + "epoch": 0.48627955823565283, + "grad_norm": 6.944835186004639, + "learning_rate": 9.120268327743172e-05, + "loss": 0.8004, + "step": 7177 + }, + { + "epoch": 0.4863473135036249, + "grad_norm": 7.9429497718811035, + "learning_rate": 9.12013142583339e-05, + "loss": 0.9473, + "step": 7178 + }, + { + "epoch": 0.486415068771597, + "grad_norm": 6.831770420074463, + "learning_rate": 9.119994523923609e-05, + "loss": 0.8866, + "step": 7179 + }, + { + "epoch": 0.48648282403956905, + "grad_norm": 7.992087364196777, + "learning_rate": 9.119857622013827e-05, + "loss": 0.6405, + "step": 7180 + }, + { + "epoch": 0.48655057930754114, + "grad_norm": 7.081966400146484, + "learning_rate": 9.119720720104045e-05, + "loss": 0.9166, + "step": 7181 + }, + { + "epoch": 0.48661833457551323, + "grad_norm": 8.257608413696289, + "learning_rate": 9.119583818194263e-05, + "loss": 0.7556, + "step": 7182 + }, + { + "epoch": 0.4866860898434853, + "grad_norm": 6.714028835296631, + "learning_rate": 9.119446916284483e-05, + "loss": 0.784, + "step": 7183 + }, + { + "epoch": 0.4867538451114574, + "grad_norm": 5.71464204788208, + "learning_rate": 9.1193100143747e-05, + "loss": 0.815, + "step": 7184 + }, + { + "epoch": 0.4868216003794295, + "grad_norm": 8.290979385375977, + "learning_rate": 9.119173112464919e-05, + "loss": 0.822, + "step": 7185 + }, + { + "epoch": 0.4868893556474016, + "grad_norm": 6.116361141204834, + "learning_rate": 9.119036210555138e-05, + "loss": 0.7727, + "step": 7186 + }, + { + "epoch": 0.4869571109153737, + "grad_norm": 5.137567043304443, + "learning_rate": 9.118899308645356e-05, + "loss": 0.4916, + "step": 7187 + }, + { + "epoch": 0.48702486618334573, + "grad_norm": 6.987879753112793, + "learning_rate": 9.118762406735574e-05, + "loss": 1.0662, + "step": 7188 + }, + { + "epoch": 0.4870926214513178, + "grad_norm": 7.223506927490234, + "learning_rate": 9.118625504825794e-05, + "loss": 0.7408, + "step": 7189 + }, + { + "epoch": 0.4871603767192899, + "grad_norm": 7.493766784667969, + "learning_rate": 9.118488602916012e-05, + "loss": 0.9625, + "step": 7190 + }, + { + "epoch": 0.487228131987262, + "grad_norm": 7.265352725982666, + "learning_rate": 9.11835170100623e-05, + "loss": 0.9228, + "step": 7191 + }, + { + "epoch": 0.4872958872552341, + "grad_norm": 6.04194450378418, + "learning_rate": 9.118214799096449e-05, + "loss": 0.8081, + "step": 7192 + }, + { + "epoch": 0.4873636425232062, + "grad_norm": 7.033185958862305, + "learning_rate": 9.118077897186667e-05, + "loss": 1.0709, + "step": 7193 + }, + { + "epoch": 0.4874313977911783, + "grad_norm": 5.993305683135986, + "learning_rate": 9.117940995276885e-05, + "loss": 0.8222, + "step": 7194 + }, + { + "epoch": 0.48749915305915037, + "grad_norm": 6.695589542388916, + "learning_rate": 9.117804093367103e-05, + "loss": 0.7981, + "step": 7195 + }, + { + "epoch": 0.48756690832712246, + "grad_norm": 6.6938157081604, + "learning_rate": 9.117667191457321e-05, + "loss": 0.8485, + "step": 7196 + }, + { + "epoch": 0.4876346635950945, + "grad_norm": 6.760074138641357, + "learning_rate": 9.11753028954754e-05, + "loss": 0.6706, + "step": 7197 + }, + { + "epoch": 0.4877024188630666, + "grad_norm": 6.397393703460693, + "learning_rate": 9.117393387637759e-05, + "loss": 0.8247, + "step": 7198 + }, + { + "epoch": 0.4877701741310387, + "grad_norm": 7.138194561004639, + "learning_rate": 9.117256485727977e-05, + "loss": 1.0124, + "step": 7199 + }, + { + "epoch": 0.4878379293990108, + "grad_norm": 6.4646172523498535, + "learning_rate": 9.117119583818195e-05, + "loss": 1.0531, + "step": 7200 + }, + { + "epoch": 0.48790568466698286, + "grad_norm": 6.58534574508667, + "learning_rate": 9.116982681908413e-05, + "loss": 0.962, + "step": 7201 + }, + { + "epoch": 0.48797343993495496, + "grad_norm": 8.237184524536133, + "learning_rate": 9.116845779998632e-05, + "loss": 1.0363, + "step": 7202 + }, + { + "epoch": 0.48804119520292705, + "grad_norm": 9.047645568847656, + "learning_rate": 9.11670887808885e-05, + "loss": 0.8925, + "step": 7203 + }, + { + "epoch": 0.48810895047089914, + "grad_norm": 5.87882137298584, + "learning_rate": 9.116571976179068e-05, + "loss": 0.7843, + "step": 7204 + }, + { + "epoch": 0.4881767057388712, + "grad_norm": 7.792137622833252, + "learning_rate": 9.116435074269286e-05, + "loss": 0.6999, + "step": 7205 + }, + { + "epoch": 0.48824446100684327, + "grad_norm": 6.028510570526123, + "learning_rate": 9.116298172359506e-05, + "loss": 0.9233, + "step": 7206 + }, + { + "epoch": 0.48831221627481536, + "grad_norm": 6.219117164611816, + "learning_rate": 9.116161270449724e-05, + "loss": 0.5807, + "step": 7207 + }, + { + "epoch": 0.48837997154278745, + "grad_norm": 5.99801778793335, + "learning_rate": 9.116024368539942e-05, + "loss": 0.9267, + "step": 7208 + }, + { + "epoch": 0.48844772681075954, + "grad_norm": 7.163285255432129, + "learning_rate": 9.11588746663016e-05, + "loss": 0.8695, + "step": 7209 + }, + { + "epoch": 0.48851548207873163, + "grad_norm": 5.209384441375732, + "learning_rate": 9.115750564720378e-05, + "loss": 0.8395, + "step": 7210 + }, + { + "epoch": 0.4885832373467037, + "grad_norm": 5.942741394042969, + "learning_rate": 9.115613662810597e-05, + "loss": 0.8593, + "step": 7211 + }, + { + "epoch": 0.4886509926146758, + "grad_norm": 6.742303848266602, + "learning_rate": 9.115476760900815e-05, + "loss": 0.7964, + "step": 7212 + }, + { + "epoch": 0.48871874788264785, + "grad_norm": 6.951894283294678, + "learning_rate": 9.115339858991033e-05, + "loss": 0.9285, + "step": 7213 + }, + { + "epoch": 0.48878650315061994, + "grad_norm": 5.752067565917969, + "learning_rate": 9.115202957081251e-05, + "loss": 0.6053, + "step": 7214 + }, + { + "epoch": 0.48885425841859204, + "grad_norm": 6.7097883224487305, + "learning_rate": 9.11506605517147e-05, + "loss": 0.82, + "step": 7215 + }, + { + "epoch": 0.4889220136865641, + "grad_norm": 6.396644592285156, + "learning_rate": 9.114929153261689e-05, + "loss": 0.9438, + "step": 7216 + }, + { + "epoch": 0.4889897689545362, + "grad_norm": 6.378931999206543, + "learning_rate": 9.114792251351907e-05, + "loss": 0.9635, + "step": 7217 + }, + { + "epoch": 0.4890575242225083, + "grad_norm": 5.802820682525635, + "learning_rate": 9.114655349442125e-05, + "loss": 0.7166, + "step": 7218 + }, + { + "epoch": 0.4891252794904804, + "grad_norm": 6.493535041809082, + "learning_rate": 9.114518447532343e-05, + "loss": 0.985, + "step": 7219 + }, + { + "epoch": 0.4891930347584525, + "grad_norm": 7.591537952423096, + "learning_rate": 9.114381545622562e-05, + "loss": 0.8348, + "step": 7220 + }, + { + "epoch": 0.48926079002642453, + "grad_norm": 6.379971027374268, + "learning_rate": 9.11424464371278e-05, + "loss": 0.9761, + "step": 7221 + }, + { + "epoch": 0.4893285452943966, + "grad_norm": 9.187169075012207, + "learning_rate": 9.114107741802998e-05, + "loss": 1.1654, + "step": 7222 + }, + { + "epoch": 0.4893963005623687, + "grad_norm": 6.582739353179932, + "learning_rate": 9.113970839893216e-05, + "loss": 0.8921, + "step": 7223 + }, + { + "epoch": 0.4894640558303408, + "grad_norm": 5.6152544021606445, + "learning_rate": 9.113833937983434e-05, + "loss": 0.7498, + "step": 7224 + }, + { + "epoch": 0.4895318110983129, + "grad_norm": 6.960738182067871, + "learning_rate": 9.113697036073654e-05, + "loss": 0.7443, + "step": 7225 + }, + { + "epoch": 0.489599566366285, + "grad_norm": 6.515749454498291, + "learning_rate": 9.113560134163872e-05, + "loss": 0.903, + "step": 7226 + }, + { + "epoch": 0.4896673216342571, + "grad_norm": 7.327613830566406, + "learning_rate": 9.11342323225409e-05, + "loss": 0.8484, + "step": 7227 + }, + { + "epoch": 0.48973507690222917, + "grad_norm": 6.575616359710693, + "learning_rate": 9.113286330344308e-05, + "loss": 1.001, + "step": 7228 + }, + { + "epoch": 0.4898028321702012, + "grad_norm": 6.429412364959717, + "learning_rate": 9.113149428434527e-05, + "loss": 0.8028, + "step": 7229 + }, + { + "epoch": 0.4898705874381733, + "grad_norm": 6.13348388671875, + "learning_rate": 9.113012526524745e-05, + "loss": 0.8966, + "step": 7230 + }, + { + "epoch": 0.4899383427061454, + "grad_norm": 5.246626377105713, + "learning_rate": 9.112875624614963e-05, + "loss": 0.7548, + "step": 7231 + }, + { + "epoch": 0.4900060979741175, + "grad_norm": 8.197700500488281, + "learning_rate": 9.112738722705183e-05, + "loss": 0.9122, + "step": 7232 + }, + { + "epoch": 0.4900738532420896, + "grad_norm": 7.987671375274658, + "learning_rate": 9.1126018207954e-05, + "loss": 1.0089, + "step": 7233 + }, + { + "epoch": 0.49014160851006167, + "grad_norm": 6.8674211502075195, + "learning_rate": 9.112464918885619e-05, + "loss": 0.7244, + "step": 7234 + }, + { + "epoch": 0.49020936377803376, + "grad_norm": 9.719727516174316, + "learning_rate": 9.112328016975838e-05, + "loss": 0.9079, + "step": 7235 + }, + { + "epoch": 0.49027711904600585, + "grad_norm": 6.982578277587891, + "learning_rate": 9.112191115066056e-05, + "loss": 0.7876, + "step": 7236 + }, + { + "epoch": 0.4903448743139779, + "grad_norm": 5.029915809631348, + "learning_rate": 9.112054213156274e-05, + "loss": 0.7574, + "step": 7237 + }, + { + "epoch": 0.49041262958195, + "grad_norm": 8.605433464050293, + "learning_rate": 9.111917311246493e-05, + "loss": 0.9255, + "step": 7238 + }, + { + "epoch": 0.49048038484992207, + "grad_norm": 7.910008907318115, + "learning_rate": 9.111780409336711e-05, + "loss": 0.9514, + "step": 7239 + }, + { + "epoch": 0.49054814011789416, + "grad_norm": 6.401332855224609, + "learning_rate": 9.11164350742693e-05, + "loss": 0.9243, + "step": 7240 + }, + { + "epoch": 0.49061589538586625, + "grad_norm": 6.722992420196533, + "learning_rate": 9.111506605517148e-05, + "loss": 0.7595, + "step": 7241 + }, + { + "epoch": 0.49068365065383834, + "grad_norm": 6.60951566696167, + "learning_rate": 9.111369703607366e-05, + "loss": 0.9381, + "step": 7242 + }, + { + "epoch": 0.49075140592181044, + "grad_norm": 7.443787574768066, + "learning_rate": 9.111232801697585e-05, + "loss": 1.0037, + "step": 7243 + }, + { + "epoch": 0.4908191611897825, + "grad_norm": 7.43011999130249, + "learning_rate": 9.111095899787803e-05, + "loss": 0.8213, + "step": 7244 + }, + { + "epoch": 0.49088691645775456, + "grad_norm": 8.865147590637207, + "learning_rate": 9.110958997878021e-05, + "loss": 0.9547, + "step": 7245 + }, + { + "epoch": 0.49095467172572665, + "grad_norm": 5.83010721206665, + "learning_rate": 9.110822095968239e-05, + "loss": 0.7253, + "step": 7246 + }, + { + "epoch": 0.49102242699369875, + "grad_norm": 6.1638946533203125, + "learning_rate": 9.110685194058458e-05, + "loss": 0.7643, + "step": 7247 + }, + { + "epoch": 0.49109018226167084, + "grad_norm": 6.534294128417969, + "learning_rate": 9.110548292148677e-05, + "loss": 0.9102, + "step": 7248 + }, + { + "epoch": 0.49115793752964293, + "grad_norm": 8.244972229003906, + "learning_rate": 9.110411390238895e-05, + "loss": 0.9615, + "step": 7249 + }, + { + "epoch": 0.491225692797615, + "grad_norm": 7.7991461753845215, + "learning_rate": 9.110274488329113e-05, + "loss": 0.9697, + "step": 7250 + }, + { + "epoch": 0.4912934480655871, + "grad_norm": 7.489588737487793, + "learning_rate": 9.11013758641933e-05, + "loss": 0.7339, + "step": 7251 + }, + { + "epoch": 0.4913612033335592, + "grad_norm": 6.112767696380615, + "learning_rate": 9.11000068450955e-05, + "loss": 1.1147, + "step": 7252 + }, + { + "epoch": 0.4914289586015313, + "grad_norm": 6.319901943206787, + "learning_rate": 9.109863782599768e-05, + "loss": 0.9777, + "step": 7253 + }, + { + "epoch": 0.49149671386950333, + "grad_norm": 7.175682544708252, + "learning_rate": 9.109726880689986e-05, + "loss": 0.7403, + "step": 7254 + }, + { + "epoch": 0.4915644691374754, + "grad_norm": 7.254464149475098, + "learning_rate": 9.109589978780204e-05, + "loss": 0.8533, + "step": 7255 + }, + { + "epoch": 0.4916322244054475, + "grad_norm": 6.266726493835449, + "learning_rate": 9.109453076870422e-05, + "loss": 0.8648, + "step": 7256 + }, + { + "epoch": 0.4916999796734196, + "grad_norm": 7.290742874145508, + "learning_rate": 9.109316174960642e-05, + "loss": 0.7274, + "step": 7257 + }, + { + "epoch": 0.4917677349413917, + "grad_norm": 6.882185459136963, + "learning_rate": 9.10917927305086e-05, + "loss": 0.9184, + "step": 7258 + }, + { + "epoch": 0.4918354902093638, + "grad_norm": 6.796192169189453, + "learning_rate": 9.109042371141078e-05, + "loss": 0.9734, + "step": 7259 + }, + { + "epoch": 0.4919032454773359, + "grad_norm": 7.461274147033691, + "learning_rate": 9.108905469231296e-05, + "loss": 0.8931, + "step": 7260 + }, + { + "epoch": 0.491971000745308, + "grad_norm": 6.522415637969971, + "learning_rate": 9.108768567321515e-05, + "loss": 0.7529, + "step": 7261 + }, + { + "epoch": 0.49203875601328, + "grad_norm": 7.108310699462891, + "learning_rate": 9.108631665411733e-05, + "loss": 0.7551, + "step": 7262 + }, + { + "epoch": 0.4921065112812521, + "grad_norm": 6.65360164642334, + "learning_rate": 9.108494763501951e-05, + "loss": 1.1178, + "step": 7263 + }, + { + "epoch": 0.4921742665492242, + "grad_norm": 8.197813034057617, + "learning_rate": 9.108357861592169e-05, + "loss": 1.2079, + "step": 7264 + }, + { + "epoch": 0.4922420218171963, + "grad_norm": 6.898741245269775, + "learning_rate": 9.108220959682387e-05, + "loss": 0.7838, + "step": 7265 + }, + { + "epoch": 0.4923097770851684, + "grad_norm": 7.363327980041504, + "learning_rate": 9.108084057772607e-05, + "loss": 0.763, + "step": 7266 + }, + { + "epoch": 0.49237753235314047, + "grad_norm": 7.18175745010376, + "learning_rate": 9.107947155862825e-05, + "loss": 0.7809, + "step": 7267 + }, + { + "epoch": 0.49244528762111256, + "grad_norm": 5.595573902130127, + "learning_rate": 9.107810253953043e-05, + "loss": 0.8471, + "step": 7268 + }, + { + "epoch": 0.49251304288908465, + "grad_norm": 6.333422660827637, + "learning_rate": 9.10767335204326e-05, + "loss": 0.7616, + "step": 7269 + }, + { + "epoch": 0.4925807981570567, + "grad_norm": 8.157796859741211, + "learning_rate": 9.10753645013348e-05, + "loss": 0.8233, + "step": 7270 + }, + { + "epoch": 0.4926485534250288, + "grad_norm": 6.123366832733154, + "learning_rate": 9.107399548223698e-05, + "loss": 0.7266, + "step": 7271 + }, + { + "epoch": 0.49271630869300087, + "grad_norm": 10.229715347290039, + "learning_rate": 9.107262646313916e-05, + "loss": 0.9589, + "step": 7272 + }, + { + "epoch": 0.49278406396097296, + "grad_norm": 5.360836505889893, + "learning_rate": 9.107125744404134e-05, + "loss": 0.7874, + "step": 7273 + }, + { + "epoch": 0.49285181922894505, + "grad_norm": 6.273800373077393, + "learning_rate": 9.106988842494352e-05, + "loss": 0.9512, + "step": 7274 + }, + { + "epoch": 0.49291957449691715, + "grad_norm": 7.902069568634033, + "learning_rate": 9.106851940584572e-05, + "loss": 0.799, + "step": 7275 + }, + { + "epoch": 0.49298732976488924, + "grad_norm": 9.35932731628418, + "learning_rate": 9.10671503867479e-05, + "loss": 0.8973, + "step": 7276 + }, + { + "epoch": 0.49305508503286133, + "grad_norm": 7.119052410125732, + "learning_rate": 9.106578136765008e-05, + "loss": 1.2001, + "step": 7277 + }, + { + "epoch": 0.49312284030083336, + "grad_norm": 6.111217498779297, + "learning_rate": 9.106441234855227e-05, + "loss": 0.7685, + "step": 7278 + }, + { + "epoch": 0.49319059556880546, + "grad_norm": 6.093493461608887, + "learning_rate": 9.106304332945445e-05, + "loss": 0.7097, + "step": 7279 + }, + { + "epoch": 0.49325835083677755, + "grad_norm": 6.722117900848389, + "learning_rate": 9.106167431035663e-05, + "loss": 0.805, + "step": 7280 + }, + { + "epoch": 0.49332610610474964, + "grad_norm": 6.489585876464844, + "learning_rate": 9.106030529125882e-05, + "loss": 0.9748, + "step": 7281 + }, + { + "epoch": 0.49339386137272173, + "grad_norm": 6.1473236083984375, + "learning_rate": 9.1058936272161e-05, + "loss": 0.8299, + "step": 7282 + }, + { + "epoch": 0.4934616166406938, + "grad_norm": 7.472615718841553, + "learning_rate": 9.105756725306319e-05, + "loss": 1.0012, + "step": 7283 + }, + { + "epoch": 0.4935293719086659, + "grad_norm": 7.1405463218688965, + "learning_rate": 9.105619823396538e-05, + "loss": 0.9959, + "step": 7284 + }, + { + "epoch": 0.493597127176638, + "grad_norm": 6.21019172668457, + "learning_rate": 9.105482921486756e-05, + "loss": 0.867, + "step": 7285 + }, + { + "epoch": 0.49366488244461004, + "grad_norm": 6.836954593658447, + "learning_rate": 9.105346019576974e-05, + "loss": 0.8058, + "step": 7286 + }, + { + "epoch": 0.49373263771258213, + "grad_norm": 6.4547319412231445, + "learning_rate": 9.105209117667192e-05, + "loss": 0.8028, + "step": 7287 + }, + { + "epoch": 0.4938003929805542, + "grad_norm": 6.335334300994873, + "learning_rate": 9.10507221575741e-05, + "loss": 0.763, + "step": 7288 + }, + { + "epoch": 0.4938681482485263, + "grad_norm": 7.21290397644043, + "learning_rate": 9.10493531384763e-05, + "loss": 0.9477, + "step": 7289 + }, + { + "epoch": 0.4939359035164984, + "grad_norm": 7.856054782867432, + "learning_rate": 9.104798411937847e-05, + "loss": 1.1117, + "step": 7290 + }, + { + "epoch": 0.4940036587844705, + "grad_norm": 6.308975696563721, + "learning_rate": 9.104661510028066e-05, + "loss": 1.0397, + "step": 7291 + }, + { + "epoch": 0.4940714140524426, + "grad_norm": 6.111830711364746, + "learning_rate": 9.104524608118284e-05, + "loss": 0.8082, + "step": 7292 + }, + { + "epoch": 0.4941391693204147, + "grad_norm": 7.741870403289795, + "learning_rate": 9.104387706208503e-05, + "loss": 0.804, + "step": 7293 + }, + { + "epoch": 0.4942069245883867, + "grad_norm": 7.447502613067627, + "learning_rate": 9.104250804298721e-05, + "loss": 0.9074, + "step": 7294 + }, + { + "epoch": 0.4942746798563588, + "grad_norm": 4.931535243988037, + "learning_rate": 9.104113902388939e-05, + "loss": 0.7519, + "step": 7295 + }, + { + "epoch": 0.4943424351243309, + "grad_norm": 9.830883026123047, + "learning_rate": 9.103977000479157e-05, + "loss": 0.8659, + "step": 7296 + }, + { + "epoch": 0.494410190392303, + "grad_norm": 6.182522296905518, + "learning_rate": 9.103840098569375e-05, + "loss": 0.7021, + "step": 7297 + }, + { + "epoch": 0.4944779456602751, + "grad_norm": 8.73188591003418, + "learning_rate": 9.103703196659594e-05, + "loss": 0.8496, + "step": 7298 + }, + { + "epoch": 0.4945457009282472, + "grad_norm": 7.2238640785217285, + "learning_rate": 9.103566294749813e-05, + "loss": 1.0778, + "step": 7299 + }, + { + "epoch": 0.49461345619621927, + "grad_norm": 6.101573944091797, + "learning_rate": 9.10342939284003e-05, + "loss": 0.9264, + "step": 7300 + }, + { + "epoch": 0.49468121146419136, + "grad_norm": 7.327548503875732, + "learning_rate": 9.103292490930249e-05, + "loss": 0.7973, + "step": 7301 + }, + { + "epoch": 0.49474896673216345, + "grad_norm": 7.1809186935424805, + "learning_rate": 9.103155589020467e-05, + "loss": 0.9586, + "step": 7302 + }, + { + "epoch": 0.4948167220001355, + "grad_norm": 5.164478778839111, + "learning_rate": 9.103018687110686e-05, + "loss": 0.6197, + "step": 7303 + }, + { + "epoch": 0.4948844772681076, + "grad_norm": 6.387687683105469, + "learning_rate": 9.102881785200904e-05, + "loss": 0.9509, + "step": 7304 + }, + { + "epoch": 0.4949522325360797, + "grad_norm": 8.202160835266113, + "learning_rate": 9.102744883291122e-05, + "loss": 0.8557, + "step": 7305 + }, + { + "epoch": 0.49501998780405176, + "grad_norm": 6.164126873016357, + "learning_rate": 9.10260798138134e-05, + "loss": 0.7898, + "step": 7306 + }, + { + "epoch": 0.49508774307202386, + "grad_norm": 6.448176383972168, + "learning_rate": 9.10247107947156e-05, + "loss": 0.9094, + "step": 7307 + }, + { + "epoch": 0.49515549833999595, + "grad_norm": 6.582845687866211, + "learning_rate": 9.102334177561778e-05, + "loss": 0.8778, + "step": 7308 + }, + { + "epoch": 0.49522325360796804, + "grad_norm": 7.6885552406311035, + "learning_rate": 9.102197275651996e-05, + "loss": 1.0229, + "step": 7309 + }, + { + "epoch": 0.49529100887594013, + "grad_norm": 6.840595722198486, + "learning_rate": 9.102060373742214e-05, + "loss": 0.972, + "step": 7310 + }, + { + "epoch": 0.49535876414391217, + "grad_norm": 6.486509323120117, + "learning_rate": 9.101923471832432e-05, + "loss": 0.8788, + "step": 7311 + }, + { + "epoch": 0.49542651941188426, + "grad_norm": 9.052862167358398, + "learning_rate": 9.101786569922651e-05, + "loss": 0.8917, + "step": 7312 + }, + { + "epoch": 0.49549427467985635, + "grad_norm": 5.528444766998291, + "learning_rate": 9.101649668012869e-05, + "loss": 0.6167, + "step": 7313 + }, + { + "epoch": 0.49556202994782844, + "grad_norm": 6.7654032707214355, + "learning_rate": 9.101512766103087e-05, + "loss": 0.9848, + "step": 7314 + }, + { + "epoch": 0.49562978521580053, + "grad_norm": 6.247506618499756, + "learning_rate": 9.101375864193305e-05, + "loss": 0.5868, + "step": 7315 + }, + { + "epoch": 0.4956975404837726, + "grad_norm": 6.155362129211426, + "learning_rate": 9.101238962283525e-05, + "loss": 0.8344, + "step": 7316 + }, + { + "epoch": 0.4957652957517447, + "grad_norm": 8.219099998474121, + "learning_rate": 9.101102060373743e-05, + "loss": 0.8839, + "step": 7317 + }, + { + "epoch": 0.4958330510197168, + "grad_norm": 5.146651744842529, + "learning_rate": 9.10096515846396e-05, + "loss": 0.7872, + "step": 7318 + }, + { + "epoch": 0.49590080628768884, + "grad_norm": 7.989886283874512, + "learning_rate": 9.100828256554179e-05, + "loss": 1.0485, + "step": 7319 + }, + { + "epoch": 0.49596856155566094, + "grad_norm": 7.214312553405762, + "learning_rate": 9.100691354644397e-05, + "loss": 0.9304, + "step": 7320 + }, + { + "epoch": 0.49603631682363303, + "grad_norm": 6.856055736541748, + "learning_rate": 9.100554452734616e-05, + "loss": 0.8934, + "step": 7321 + }, + { + "epoch": 0.4961040720916051, + "grad_norm": 6.771759986877441, + "learning_rate": 9.100417550824834e-05, + "loss": 0.9349, + "step": 7322 + }, + { + "epoch": 0.4961718273595772, + "grad_norm": 8.19810962677002, + "learning_rate": 9.100280648915052e-05, + "loss": 1.0005, + "step": 7323 + }, + { + "epoch": 0.4962395826275493, + "grad_norm": 6.946470737457275, + "learning_rate": 9.100143747005271e-05, + "loss": 0.7505, + "step": 7324 + }, + { + "epoch": 0.4963073378955214, + "grad_norm": 6.6737518310546875, + "learning_rate": 9.10000684509549e-05, + "loss": 0.8722, + "step": 7325 + }, + { + "epoch": 0.4963750931634935, + "grad_norm": 5.637021064758301, + "learning_rate": 9.099869943185708e-05, + "loss": 0.9587, + "step": 7326 + }, + { + "epoch": 0.4964428484314655, + "grad_norm": 5.846226692199707, + "learning_rate": 9.099733041275927e-05, + "loss": 0.7726, + "step": 7327 + }, + { + "epoch": 0.4965106036994376, + "grad_norm": 7.612580299377441, + "learning_rate": 9.099596139366145e-05, + "loss": 0.7203, + "step": 7328 + }, + { + "epoch": 0.4965783589674097, + "grad_norm": 5.490561485290527, + "learning_rate": 9.099459237456363e-05, + "loss": 0.9457, + "step": 7329 + }, + { + "epoch": 0.4966461142353818, + "grad_norm": 5.573283672332764, + "learning_rate": 9.099322335546582e-05, + "loss": 0.8116, + "step": 7330 + }, + { + "epoch": 0.4967138695033539, + "grad_norm": 7.063168048858643, + "learning_rate": 9.0991854336368e-05, + "loss": 0.906, + "step": 7331 + }, + { + "epoch": 0.496781624771326, + "grad_norm": 7.066551685333252, + "learning_rate": 9.099048531727018e-05, + "loss": 0.8515, + "step": 7332 + }, + { + "epoch": 0.49684938003929807, + "grad_norm": 6.462795257568359, + "learning_rate": 9.098911629817237e-05, + "loss": 0.6325, + "step": 7333 + }, + { + "epoch": 0.49691713530727016, + "grad_norm": 6.59752893447876, + "learning_rate": 9.098774727907455e-05, + "loss": 0.7356, + "step": 7334 + }, + { + "epoch": 0.4969848905752422, + "grad_norm": 5.600460052490234, + "learning_rate": 9.098637825997674e-05, + "loss": 0.6307, + "step": 7335 + }, + { + "epoch": 0.4970526458432143, + "grad_norm": 6.0045270919799805, + "learning_rate": 9.098500924087892e-05, + "loss": 1.0137, + "step": 7336 + }, + { + "epoch": 0.4971204011111864, + "grad_norm": 7.3824143409729, + "learning_rate": 9.09836402217811e-05, + "loss": 0.8982, + "step": 7337 + }, + { + "epoch": 0.4971881563791585, + "grad_norm": 6.573738098144531, + "learning_rate": 9.098227120268328e-05, + "loss": 0.7594, + "step": 7338 + }, + { + "epoch": 0.49725591164713057, + "grad_norm": 8.101619720458984, + "learning_rate": 9.098090218358547e-05, + "loss": 1.1994, + "step": 7339 + }, + { + "epoch": 0.49732366691510266, + "grad_norm": 5.76462459564209, + "learning_rate": 9.097953316448765e-05, + "loss": 0.7082, + "step": 7340 + }, + { + "epoch": 0.49739142218307475, + "grad_norm": 6.358243465423584, + "learning_rate": 9.097816414538983e-05, + "loss": 0.8459, + "step": 7341 + }, + { + "epoch": 0.49745917745104684, + "grad_norm": 7.145965576171875, + "learning_rate": 9.097679512629202e-05, + "loss": 0.9953, + "step": 7342 + }, + { + "epoch": 0.4975269327190189, + "grad_norm": 8.23405933380127, + "learning_rate": 9.09754261071942e-05, + "loss": 1.1466, + "step": 7343 + }, + { + "epoch": 0.49759468798699097, + "grad_norm": 6.5327982902526855, + "learning_rate": 9.097405708809639e-05, + "loss": 1.0226, + "step": 7344 + }, + { + "epoch": 0.49766244325496306, + "grad_norm": 7.101400375366211, + "learning_rate": 9.097268806899857e-05, + "loss": 1.0476, + "step": 7345 + }, + { + "epoch": 0.49773019852293515, + "grad_norm": 7.278493404388428, + "learning_rate": 9.097131904990075e-05, + "loss": 0.9499, + "step": 7346 + }, + { + "epoch": 0.49779795379090724, + "grad_norm": 6.826780796051025, + "learning_rate": 9.096995003080293e-05, + "loss": 0.8371, + "step": 7347 + }, + { + "epoch": 0.49786570905887934, + "grad_norm": 6.522684097290039, + "learning_rate": 9.096858101170512e-05, + "loss": 0.9318, + "step": 7348 + }, + { + "epoch": 0.4979334643268514, + "grad_norm": 6.954566478729248, + "learning_rate": 9.09672119926073e-05, + "loss": 0.9174, + "step": 7349 + }, + { + "epoch": 0.4980012195948235, + "grad_norm": 6.442493438720703, + "learning_rate": 9.096584297350949e-05, + "loss": 0.8619, + "step": 7350 + }, + { + "epoch": 0.49806897486279555, + "grad_norm": 7.186161041259766, + "learning_rate": 9.096447395441167e-05, + "loss": 0.661, + "step": 7351 + }, + { + "epoch": 0.49813673013076765, + "grad_norm": 6.716146945953369, + "learning_rate": 9.096310493531385e-05, + "loss": 0.9148, + "step": 7352 + }, + { + "epoch": 0.49820448539873974, + "grad_norm": 5.643620014190674, + "learning_rate": 9.096173591621604e-05, + "loss": 0.8393, + "step": 7353 + }, + { + "epoch": 0.49827224066671183, + "grad_norm": 7.107893466949463, + "learning_rate": 9.096036689711822e-05, + "loss": 0.972, + "step": 7354 + }, + { + "epoch": 0.4983399959346839, + "grad_norm": 6.967519283294678, + "learning_rate": 9.09589978780204e-05, + "loss": 0.7033, + "step": 7355 + }, + { + "epoch": 0.498407751202656, + "grad_norm": 7.291131496429443, + "learning_rate": 9.095762885892258e-05, + "loss": 0.8853, + "step": 7356 + }, + { + "epoch": 0.4984755064706281, + "grad_norm": 7.630476474761963, + "learning_rate": 9.095625983982476e-05, + "loss": 0.8116, + "step": 7357 + }, + { + "epoch": 0.4985432617386002, + "grad_norm": 6.2367167472839355, + "learning_rate": 9.095489082072695e-05, + "loss": 0.8344, + "step": 7358 + }, + { + "epoch": 0.4986110170065723, + "grad_norm": 9.436936378479004, + "learning_rate": 9.095352180162914e-05, + "loss": 1.043, + "step": 7359 + }, + { + "epoch": 0.4986787722745443, + "grad_norm": 5.330153942108154, + "learning_rate": 9.095215278253132e-05, + "loss": 0.7025, + "step": 7360 + }, + { + "epoch": 0.4987465275425164, + "grad_norm": 5.117184162139893, + "learning_rate": 9.09507837634335e-05, + "loss": 0.8207, + "step": 7361 + }, + { + "epoch": 0.4988142828104885, + "grad_norm": 7.965060234069824, + "learning_rate": 9.094941474433569e-05, + "loss": 0.9344, + "step": 7362 + }, + { + "epoch": 0.4988820380784606, + "grad_norm": 6.985347747802734, + "learning_rate": 9.094804572523787e-05, + "loss": 0.8708, + "step": 7363 + }, + { + "epoch": 0.4989497933464327, + "grad_norm": 8.380836486816406, + "learning_rate": 9.094667670614005e-05, + "loss": 0.8901, + "step": 7364 + }, + { + "epoch": 0.4990175486144048, + "grad_norm": 6.336101055145264, + "learning_rate": 9.094530768704223e-05, + "loss": 0.9709, + "step": 7365 + }, + { + "epoch": 0.4990853038823769, + "grad_norm": 6.435248851776123, + "learning_rate": 9.094393866794441e-05, + "loss": 0.8913, + "step": 7366 + }, + { + "epoch": 0.49915305915034897, + "grad_norm": 7.674434661865234, + "learning_rate": 9.09425696488466e-05, + "loss": 1.0581, + "step": 7367 + }, + { + "epoch": 0.499220814418321, + "grad_norm": 7.2185211181640625, + "learning_rate": 9.094120062974879e-05, + "loss": 0.9202, + "step": 7368 + }, + { + "epoch": 0.4992885696862931, + "grad_norm": 6.717129230499268, + "learning_rate": 9.093983161065097e-05, + "loss": 0.8664, + "step": 7369 + }, + { + "epoch": 0.4993563249542652, + "grad_norm": 6.229526996612549, + "learning_rate": 9.093846259155316e-05, + "loss": 0.7374, + "step": 7370 + }, + { + "epoch": 0.4994240802222373, + "grad_norm": 7.177096843719482, + "learning_rate": 9.093709357245534e-05, + "loss": 1.0426, + "step": 7371 + }, + { + "epoch": 0.49949183549020937, + "grad_norm": 7.289033889770508, + "learning_rate": 9.093572455335752e-05, + "loss": 0.8705, + "step": 7372 + }, + { + "epoch": 0.49955959075818146, + "grad_norm": 8.260116577148438, + "learning_rate": 9.093435553425971e-05, + "loss": 0.6928, + "step": 7373 + }, + { + "epoch": 0.49962734602615355, + "grad_norm": 5.730698108673096, + "learning_rate": 9.09329865151619e-05, + "loss": 0.7852, + "step": 7374 + }, + { + "epoch": 0.49969510129412564, + "grad_norm": 6.90589714050293, + "learning_rate": 9.093161749606407e-05, + "loss": 0.9258, + "step": 7375 + }, + { + "epoch": 0.4997628565620977, + "grad_norm": 5.358353137969971, + "learning_rate": 9.093024847696627e-05, + "loss": 0.7327, + "step": 7376 + }, + { + "epoch": 0.49983061183006977, + "grad_norm": 5.4898505210876465, + "learning_rate": 9.092887945786845e-05, + "loss": 0.7371, + "step": 7377 + }, + { + "epoch": 0.49989836709804186, + "grad_norm": 7.045073986053467, + "learning_rate": 9.092751043877063e-05, + "loss": 0.8785, + "step": 7378 + }, + { + "epoch": 0.49989836709804186, + "eval_loss": 0.8431733846664429, + "eval_noise_accuracy": 0.0, + "eval_runtime": 1466.1466, + "eval_samples_per_second": 3.505, + "eval_steps_per_second": 0.22, + "eval_wer": 75.89224292121845, + "step": 7378 + }, + { + "epoch": 0.49996612236601395, + "grad_norm": 6.144540309906006, + "learning_rate": 9.092614141967281e-05, + "loss": 0.8467, + "step": 7379 + }, + { + "epoch": 0.500033877633986, + "grad_norm": 5.819301605224609, + "learning_rate": 9.0924772400575e-05, + "loss": 0.7813, + "step": 7380 + }, + { + "epoch": 0.5001016329019581, + "grad_norm": 5.759615421295166, + "learning_rate": 9.092340338147718e-05, + "loss": 0.7834, + "step": 7381 + }, + { + "epoch": 0.5001693881699302, + "grad_norm": 5.7733917236328125, + "learning_rate": 9.092203436237936e-05, + "loss": 0.6671, + "step": 7382 + }, + { + "epoch": 0.5002371434379023, + "grad_norm": 8.171788215637207, + "learning_rate": 9.092066534328154e-05, + "loss": 0.9039, + "step": 7383 + }, + { + "epoch": 0.5003048987058744, + "grad_norm": 6.261331081390381, + "learning_rate": 9.091929632418373e-05, + "loss": 0.9185, + "step": 7384 + }, + { + "epoch": 0.5003726539738464, + "grad_norm": 4.867089748382568, + "learning_rate": 9.091792730508592e-05, + "loss": 0.8967, + "step": 7385 + }, + { + "epoch": 0.5004404092418185, + "grad_norm": 8.540884971618652, + "learning_rate": 9.09165582859881e-05, + "loss": 1.0576, + "step": 7386 + }, + { + "epoch": 0.5005081645097906, + "grad_norm": 6.116450786590576, + "learning_rate": 9.091518926689028e-05, + "loss": 0.8109, + "step": 7387 + }, + { + "epoch": 0.5005759197777627, + "grad_norm": 5.271210670471191, + "learning_rate": 9.091382024779246e-05, + "loss": 1.0116, + "step": 7388 + }, + { + "epoch": 0.5006436750457348, + "grad_norm": 5.851868152618408, + "learning_rate": 9.091245122869464e-05, + "loss": 0.8285, + "step": 7389 + }, + { + "epoch": 0.5007114303137069, + "grad_norm": 6.349635124206543, + "learning_rate": 9.091108220959683e-05, + "loss": 1.0065, + "step": 7390 + }, + { + "epoch": 0.500779185581679, + "grad_norm": 7.653061389923096, + "learning_rate": 9.090971319049901e-05, + "loss": 0.8314, + "step": 7391 + }, + { + "epoch": 0.5008469408496511, + "grad_norm": 7.560361385345459, + "learning_rate": 9.09083441714012e-05, + "loss": 0.9674, + "step": 7392 + }, + { + "epoch": 0.5009146961176232, + "grad_norm": 5.517054080963135, + "learning_rate": 9.090697515230338e-05, + "loss": 0.6423, + "step": 7393 + }, + { + "epoch": 0.5009824513855953, + "grad_norm": 6.644406795501709, + "learning_rate": 9.090560613320557e-05, + "loss": 0.7256, + "step": 7394 + }, + { + "epoch": 0.5010502066535674, + "grad_norm": 6.9502854347229, + "learning_rate": 9.090423711410775e-05, + "loss": 0.8194, + "step": 7395 + }, + { + "epoch": 0.5011179619215393, + "grad_norm": 5.900984287261963, + "learning_rate": 9.090286809500993e-05, + "loss": 1.1089, + "step": 7396 + }, + { + "epoch": 0.5011857171895114, + "grad_norm": 5.964234352111816, + "learning_rate": 9.090149907591211e-05, + "loss": 0.9525, + "step": 7397 + }, + { + "epoch": 0.5012534724574835, + "grad_norm": 7.2671895027160645, + "learning_rate": 9.090013005681429e-05, + "loss": 0.7658, + "step": 7398 + }, + { + "epoch": 0.5013212277254556, + "grad_norm": 6.128476619720459, + "learning_rate": 9.089876103771648e-05, + "loss": 0.8413, + "step": 7399 + }, + { + "epoch": 0.5013889829934277, + "grad_norm": 5.146761894226074, + "learning_rate": 9.089739201861866e-05, + "loss": 0.7552, + "step": 7400 + }, + { + "epoch": 0.5014567382613998, + "grad_norm": 7.736568927764893, + "learning_rate": 9.089602299952085e-05, + "loss": 0.5941, + "step": 7401 + }, + { + "epoch": 0.5015244935293719, + "grad_norm": 7.113034725189209, + "learning_rate": 9.089465398042303e-05, + "loss": 0.8414, + "step": 7402 + }, + { + "epoch": 0.501592248797344, + "grad_norm": 6.421565055847168, + "learning_rate": 9.089328496132522e-05, + "loss": 0.7477, + "step": 7403 + }, + { + "epoch": 0.5016600040653161, + "grad_norm": 6.609166145324707, + "learning_rate": 9.08919159422274e-05, + "loss": 0.8404, + "step": 7404 + }, + { + "epoch": 0.5017277593332882, + "grad_norm": 5.771233558654785, + "learning_rate": 9.089054692312958e-05, + "loss": 0.8446, + "step": 7405 + }, + { + "epoch": 0.5017955146012603, + "grad_norm": 5.4080119132995605, + "learning_rate": 9.088917790403176e-05, + "loss": 0.7046, + "step": 7406 + }, + { + "epoch": 0.5018632698692324, + "grad_norm": 7.0025248527526855, + "learning_rate": 9.088780888493394e-05, + "loss": 1.0269, + "step": 7407 + }, + { + "epoch": 0.5019310251372044, + "grad_norm": 6.687203884124756, + "learning_rate": 9.088643986583613e-05, + "loss": 0.9794, + "step": 7408 + }, + { + "epoch": 0.5019987804051765, + "grad_norm": 7.627871036529541, + "learning_rate": 9.088507084673831e-05, + "loss": 1.0068, + "step": 7409 + }, + { + "epoch": 0.5020665356731486, + "grad_norm": 6.824975490570068, + "learning_rate": 9.08837018276405e-05, + "loss": 0.9582, + "step": 7410 + }, + { + "epoch": 0.5021342909411207, + "grad_norm": 5.561855792999268, + "learning_rate": 9.088233280854268e-05, + "loss": 0.7373, + "step": 7411 + }, + { + "epoch": 0.5022020462090928, + "grad_norm": 6.751492023468018, + "learning_rate": 9.088096378944486e-05, + "loss": 1.1699, + "step": 7412 + }, + { + "epoch": 0.5022698014770648, + "grad_norm": 6.15092658996582, + "learning_rate": 9.087959477034705e-05, + "loss": 0.656, + "step": 7413 + }, + { + "epoch": 0.5023375567450369, + "grad_norm": 7.125277042388916, + "learning_rate": 9.087822575124923e-05, + "loss": 0.8737, + "step": 7414 + }, + { + "epoch": 0.502405312013009, + "grad_norm": 8.87856674194336, + "learning_rate": 9.087685673215141e-05, + "loss": 0.9961, + "step": 7415 + }, + { + "epoch": 0.5024730672809811, + "grad_norm": 5.436285495758057, + "learning_rate": 9.087548771305359e-05, + "loss": 0.8834, + "step": 7416 + }, + { + "epoch": 0.5025408225489532, + "grad_norm": 6.28549861907959, + "learning_rate": 9.087411869395578e-05, + "loss": 0.6354, + "step": 7417 + }, + { + "epoch": 0.5026085778169253, + "grad_norm": 5.951661109924316, + "learning_rate": 9.087274967485797e-05, + "loss": 0.8347, + "step": 7418 + }, + { + "epoch": 0.5026763330848973, + "grad_norm": 5.268624305725098, + "learning_rate": 9.087138065576015e-05, + "loss": 0.9966, + "step": 7419 + }, + { + "epoch": 0.5027440883528694, + "grad_norm": 6.974735260009766, + "learning_rate": 9.087001163666234e-05, + "loss": 0.9685, + "step": 7420 + }, + { + "epoch": 0.5028118436208415, + "grad_norm": 6.689586639404297, + "learning_rate": 9.086864261756452e-05, + "loss": 0.8843, + "step": 7421 + }, + { + "epoch": 0.5028795988888136, + "grad_norm": 6.510265827178955, + "learning_rate": 9.086727359846671e-05, + "loss": 1.0125, + "step": 7422 + }, + { + "epoch": 0.5029473541567857, + "grad_norm": 7.039668560028076, + "learning_rate": 9.08659045793689e-05, + "loss": 0.7289, + "step": 7423 + }, + { + "epoch": 0.5030151094247578, + "grad_norm": 6.1734442710876465, + "learning_rate": 9.086453556027107e-05, + "loss": 0.8023, + "step": 7424 + }, + { + "epoch": 0.5030828646927299, + "grad_norm": 7.895476818084717, + "learning_rate": 9.086316654117325e-05, + "loss": 1.0043, + "step": 7425 + }, + { + "epoch": 0.503150619960702, + "grad_norm": 6.599829196929932, + "learning_rate": 9.086179752207545e-05, + "loss": 0.8537, + "step": 7426 + }, + { + "epoch": 0.5032183752286741, + "grad_norm": 8.60390567779541, + "learning_rate": 9.086042850297763e-05, + "loss": 1.1038, + "step": 7427 + }, + { + "epoch": 0.5032861304966462, + "grad_norm": 6.656140327453613, + "learning_rate": 9.085905948387981e-05, + "loss": 0.9533, + "step": 7428 + }, + { + "epoch": 0.5033538857646181, + "grad_norm": 5.768946170806885, + "learning_rate": 9.085769046478199e-05, + "loss": 0.6271, + "step": 7429 + }, + { + "epoch": 0.5034216410325902, + "grad_norm": 5.938215732574463, + "learning_rate": 9.085632144568417e-05, + "loss": 1.0308, + "step": 7430 + }, + { + "epoch": 0.5034893963005623, + "grad_norm": 7.145301342010498, + "learning_rate": 9.085495242658636e-05, + "loss": 0.8695, + "step": 7431 + }, + { + "epoch": 0.5035571515685344, + "grad_norm": 5.975915908813477, + "learning_rate": 9.085358340748854e-05, + "loss": 0.9071, + "step": 7432 + }, + { + "epoch": 0.5036249068365065, + "grad_norm": 5.689105033874512, + "learning_rate": 9.085221438839072e-05, + "loss": 0.9037, + "step": 7433 + }, + { + "epoch": 0.5036926621044786, + "grad_norm": 6.513401985168457, + "learning_rate": 9.08508453692929e-05, + "loss": 0.7534, + "step": 7434 + }, + { + "epoch": 0.5037604173724507, + "grad_norm": 7.220860958099365, + "learning_rate": 9.084947635019509e-05, + "loss": 1.0833, + "step": 7435 + }, + { + "epoch": 0.5038281726404228, + "grad_norm": 10.640632629394531, + "learning_rate": 9.084810733109728e-05, + "loss": 0.9143, + "step": 7436 + }, + { + "epoch": 0.5038959279083949, + "grad_norm": 6.900107383728027, + "learning_rate": 9.084673831199946e-05, + "loss": 0.8239, + "step": 7437 + }, + { + "epoch": 0.503963683176367, + "grad_norm": 6.290066719055176, + "learning_rate": 9.084536929290164e-05, + "loss": 0.8982, + "step": 7438 + }, + { + "epoch": 0.5040314384443391, + "grad_norm": 7.073644638061523, + "learning_rate": 9.084400027380382e-05, + "loss": 1.0932, + "step": 7439 + }, + { + "epoch": 0.5040991937123112, + "grad_norm": 7.144145965576172, + "learning_rate": 9.084263125470601e-05, + "loss": 0.7772, + "step": 7440 + }, + { + "epoch": 0.5041669489802832, + "grad_norm": 5.8309326171875, + "learning_rate": 9.08412622356082e-05, + "loss": 0.8945, + "step": 7441 + }, + { + "epoch": 0.5042347042482553, + "grad_norm": 7.0719218254089355, + "learning_rate": 9.083989321651037e-05, + "loss": 0.7513, + "step": 7442 + }, + { + "epoch": 0.5043024595162274, + "grad_norm": 7.7847795486450195, + "learning_rate": 9.083852419741255e-05, + "loss": 0.9553, + "step": 7443 + }, + { + "epoch": 0.5043702147841995, + "grad_norm": 6.7899169921875, + "learning_rate": 9.083715517831474e-05, + "loss": 0.7908, + "step": 7444 + }, + { + "epoch": 0.5044379700521715, + "grad_norm": 7.399930000305176, + "learning_rate": 9.083578615921693e-05, + "loss": 0.8104, + "step": 7445 + }, + { + "epoch": 0.5045057253201436, + "grad_norm": 6.694761753082275, + "learning_rate": 9.083441714011911e-05, + "loss": 0.8481, + "step": 7446 + }, + { + "epoch": 0.5045734805881157, + "grad_norm": 5.661715030670166, + "learning_rate": 9.083304812102129e-05, + "loss": 0.7598, + "step": 7447 + }, + { + "epoch": 0.5046412358560878, + "grad_norm": 6.502758979797363, + "learning_rate": 9.083167910192347e-05, + "loss": 0.8789, + "step": 7448 + }, + { + "epoch": 0.5047089911240599, + "grad_norm": 7.428299427032471, + "learning_rate": 9.083031008282566e-05, + "loss": 1.0323, + "step": 7449 + }, + { + "epoch": 0.504776746392032, + "grad_norm": 7.600015163421631, + "learning_rate": 9.082894106372784e-05, + "loss": 0.8985, + "step": 7450 + }, + { + "epoch": 0.504844501660004, + "grad_norm": 5.523435592651367, + "learning_rate": 9.082757204463002e-05, + "loss": 0.7891, + "step": 7451 + }, + { + "epoch": 0.5049122569279761, + "grad_norm": 7.293107509613037, + "learning_rate": 9.08262030255322e-05, + "loss": 1.0804, + "step": 7452 + }, + { + "epoch": 0.5049800121959482, + "grad_norm": 5.669400215148926, + "learning_rate": 9.082483400643439e-05, + "loss": 0.9136, + "step": 7453 + }, + { + "epoch": 0.5050477674639203, + "grad_norm": 6.409341812133789, + "learning_rate": 9.082346498733658e-05, + "loss": 1.1784, + "step": 7454 + }, + { + "epoch": 0.5051155227318924, + "grad_norm": 5.636824607849121, + "learning_rate": 9.082209596823876e-05, + "loss": 0.7079, + "step": 7455 + }, + { + "epoch": 0.5051832779998645, + "grad_norm": 6.060736179351807, + "learning_rate": 9.082072694914094e-05, + "loss": 0.9279, + "step": 7456 + }, + { + "epoch": 0.5052510332678366, + "grad_norm": 8.116156578063965, + "learning_rate": 9.081935793004312e-05, + "loss": 0.919, + "step": 7457 + }, + { + "epoch": 0.5053187885358087, + "grad_norm": 7.159115791320801, + "learning_rate": 9.081798891094531e-05, + "loss": 0.8237, + "step": 7458 + }, + { + "epoch": 0.5053865438037808, + "grad_norm": 7.625302314758301, + "learning_rate": 9.08166198918475e-05, + "loss": 0.8204, + "step": 7459 + }, + { + "epoch": 0.5054542990717529, + "grad_norm": 7.276190280914307, + "learning_rate": 9.081525087274967e-05, + "loss": 0.6219, + "step": 7460 + }, + { + "epoch": 0.505522054339725, + "grad_norm": 8.660572052001953, + "learning_rate": 9.081388185365186e-05, + "loss": 0.9772, + "step": 7461 + }, + { + "epoch": 0.505589809607697, + "grad_norm": 8.218971252441406, + "learning_rate": 9.081251283455404e-05, + "loss": 0.7771, + "step": 7462 + }, + { + "epoch": 0.505657564875669, + "grad_norm": 5.246251106262207, + "learning_rate": 9.081114381545623e-05, + "loss": 0.7706, + "step": 7463 + }, + { + "epoch": 0.5057253201436411, + "grad_norm": 8.52219009399414, + "learning_rate": 9.080977479635841e-05, + "loss": 0.986, + "step": 7464 + }, + { + "epoch": 0.5057930754116132, + "grad_norm": 5.994356632232666, + "learning_rate": 9.080840577726059e-05, + "loss": 0.783, + "step": 7465 + }, + { + "epoch": 0.5058608306795853, + "grad_norm": 7.397661209106445, + "learning_rate": 9.080703675816278e-05, + "loss": 0.8338, + "step": 7466 + }, + { + "epoch": 0.5059285859475574, + "grad_norm": 7.875096797943115, + "learning_rate": 9.080566773906496e-05, + "loss": 1.0615, + "step": 7467 + }, + { + "epoch": 0.5059963412155295, + "grad_norm": 5.616501331329346, + "learning_rate": 9.080429871996714e-05, + "loss": 0.8735, + "step": 7468 + }, + { + "epoch": 0.5060640964835016, + "grad_norm": 7.229982852935791, + "learning_rate": 9.080292970086934e-05, + "loss": 0.6812, + "step": 7469 + }, + { + "epoch": 0.5061318517514737, + "grad_norm": 6.370266437530518, + "learning_rate": 9.080156068177152e-05, + "loss": 0.999, + "step": 7470 + }, + { + "epoch": 0.5061996070194458, + "grad_norm": 7.752940654754639, + "learning_rate": 9.08001916626737e-05, + "loss": 1.0239, + "step": 7471 + }, + { + "epoch": 0.5062673622874179, + "grad_norm": 6.016890048980713, + "learning_rate": 9.079882264357589e-05, + "loss": 0.8494, + "step": 7472 + }, + { + "epoch": 0.50633511755539, + "grad_norm": 5.739022731781006, + "learning_rate": 9.079745362447807e-05, + "loss": 0.8505, + "step": 7473 + }, + { + "epoch": 0.506402872823362, + "grad_norm": 5.543849468231201, + "learning_rate": 9.079608460538025e-05, + "loss": 0.884, + "step": 7474 + }, + { + "epoch": 0.5064706280913341, + "grad_norm": 6.608273506164551, + "learning_rate": 9.079471558628243e-05, + "loss": 0.832, + "step": 7475 + }, + { + "epoch": 0.5065383833593062, + "grad_norm": 5.356375217437744, + "learning_rate": 9.079334656718461e-05, + "loss": 0.7631, + "step": 7476 + }, + { + "epoch": 0.5066061386272783, + "grad_norm": 6.19942045211792, + "learning_rate": 9.079197754808681e-05, + "loss": 0.8971, + "step": 7477 + }, + { + "epoch": 0.5066738938952503, + "grad_norm": 6.8287506103515625, + "learning_rate": 9.079060852898899e-05, + "loss": 1.1027, + "step": 7478 + }, + { + "epoch": 0.5067416491632224, + "grad_norm": 5.98441219329834, + "learning_rate": 9.078923950989117e-05, + "loss": 0.8476, + "step": 7479 + }, + { + "epoch": 0.5068094044311945, + "grad_norm": 5.990217208862305, + "learning_rate": 9.078787049079335e-05, + "loss": 0.7411, + "step": 7480 + }, + { + "epoch": 0.5068771596991666, + "grad_norm": 7.649387836456299, + "learning_rate": 9.078650147169554e-05, + "loss": 0.8498, + "step": 7481 + }, + { + "epoch": 0.5069449149671387, + "grad_norm": 5.848696708679199, + "learning_rate": 9.078513245259772e-05, + "loss": 0.9977, + "step": 7482 + }, + { + "epoch": 0.5070126702351108, + "grad_norm": 7.848155498504639, + "learning_rate": 9.07837634334999e-05, + "loss": 1.0863, + "step": 7483 + }, + { + "epoch": 0.5070804255030829, + "grad_norm": 6.667200088500977, + "learning_rate": 9.078239441440208e-05, + "loss": 0.8052, + "step": 7484 + }, + { + "epoch": 0.507148180771055, + "grad_norm": 7.332635879516602, + "learning_rate": 9.078102539530426e-05, + "loss": 0.8855, + "step": 7485 + }, + { + "epoch": 0.507215936039027, + "grad_norm": 8.586542129516602, + "learning_rate": 9.077965637620646e-05, + "loss": 1.0024, + "step": 7486 + }, + { + "epoch": 0.5072836913069991, + "grad_norm": 7.780113697052002, + "learning_rate": 9.077828735710864e-05, + "loss": 0.9129, + "step": 7487 + }, + { + "epoch": 0.5073514465749712, + "grad_norm": 6.412726879119873, + "learning_rate": 9.077691833801082e-05, + "loss": 0.6586, + "step": 7488 + }, + { + "epoch": 0.5074192018429433, + "grad_norm": 7.84084415435791, + "learning_rate": 9.0775549318913e-05, + "loss": 0.8709, + "step": 7489 + }, + { + "epoch": 0.5074869571109154, + "grad_norm": 6.463359355926514, + "learning_rate": 9.077418029981518e-05, + "loss": 0.7824, + "step": 7490 + }, + { + "epoch": 0.5075547123788875, + "grad_norm": 7.006936550140381, + "learning_rate": 9.077281128071737e-05, + "loss": 0.8868, + "step": 7491 + }, + { + "epoch": 0.5076224676468596, + "grad_norm": 6.153975486755371, + "learning_rate": 9.077144226161955e-05, + "loss": 0.8607, + "step": 7492 + }, + { + "epoch": 0.5076902229148317, + "grad_norm": 6.791597366333008, + "learning_rate": 9.077007324252173e-05, + "loss": 0.8782, + "step": 7493 + }, + { + "epoch": 0.5077579781828037, + "grad_norm": 8.730172157287598, + "learning_rate": 9.076870422342391e-05, + "loss": 0.9439, + "step": 7494 + }, + { + "epoch": 0.5078257334507758, + "grad_norm": 8.325760841369629, + "learning_rate": 9.076733520432611e-05, + "loss": 0.9425, + "step": 7495 + }, + { + "epoch": 0.5078934887187478, + "grad_norm": 5.549458026885986, + "learning_rate": 9.076596618522829e-05, + "loss": 0.741, + "step": 7496 + }, + { + "epoch": 0.5079612439867199, + "grad_norm": 6.16536283493042, + "learning_rate": 9.076459716613047e-05, + "loss": 0.9809, + "step": 7497 + }, + { + "epoch": 0.508028999254692, + "grad_norm": 7.393336772918701, + "learning_rate": 9.076322814703265e-05, + "loss": 0.7392, + "step": 7498 + }, + { + "epoch": 0.5080967545226641, + "grad_norm": 6.697513580322266, + "learning_rate": 9.076185912793483e-05, + "loss": 0.9413, + "step": 7499 + }, + { + "epoch": 0.5081645097906362, + "grad_norm": 5.854069709777832, + "learning_rate": 9.076049010883702e-05, + "loss": 0.8432, + "step": 7500 + }, + { + "epoch": 0.5082322650586083, + "grad_norm": 5.70686149597168, + "learning_rate": 9.07591210897392e-05, + "loss": 0.762, + "step": 7501 + }, + { + "epoch": 0.5083000203265804, + "grad_norm": 7.012457370758057, + "learning_rate": 9.075775207064138e-05, + "loss": 0.8047, + "step": 7502 + }, + { + "epoch": 0.5083677755945525, + "grad_norm": 6.485003471374512, + "learning_rate": 9.075638305154357e-05, + "loss": 0.6968, + "step": 7503 + }, + { + "epoch": 0.5084355308625246, + "grad_norm": 6.104341506958008, + "learning_rate": 9.075501403244576e-05, + "loss": 0.82, + "step": 7504 + }, + { + "epoch": 0.5085032861304967, + "grad_norm": 7.397383689880371, + "learning_rate": 9.075364501334794e-05, + "loss": 0.7462, + "step": 7505 + }, + { + "epoch": 0.5085710413984688, + "grad_norm": 6.942671775817871, + "learning_rate": 9.075227599425012e-05, + "loss": 0.8829, + "step": 7506 + }, + { + "epoch": 0.5086387966664409, + "grad_norm": 7.897338390350342, + "learning_rate": 9.07509069751523e-05, + "loss": 0.8725, + "step": 7507 + }, + { + "epoch": 0.5087065519344129, + "grad_norm": 6.410269260406494, + "learning_rate": 9.074953795605448e-05, + "loss": 0.6648, + "step": 7508 + }, + { + "epoch": 0.508774307202385, + "grad_norm": 7.2909955978393555, + "learning_rate": 9.074816893695667e-05, + "loss": 0.8097, + "step": 7509 + }, + { + "epoch": 0.508842062470357, + "grad_norm": 7.618723392486572, + "learning_rate": 9.074679991785885e-05, + "loss": 1.0666, + "step": 7510 + }, + { + "epoch": 0.5089098177383291, + "grad_norm": 6.482639789581299, + "learning_rate": 9.074543089876103e-05, + "loss": 0.7946, + "step": 7511 + }, + { + "epoch": 0.5089775730063012, + "grad_norm": 8.747861862182617, + "learning_rate": 9.074406187966323e-05, + "loss": 0.9464, + "step": 7512 + }, + { + "epoch": 0.5090453282742733, + "grad_norm": 8.655475616455078, + "learning_rate": 9.074269286056541e-05, + "loss": 1.0564, + "step": 7513 + }, + { + "epoch": 0.5091130835422454, + "grad_norm": 5.97476053237915, + "learning_rate": 9.074132384146759e-05, + "loss": 0.8152, + "step": 7514 + }, + { + "epoch": 0.5091808388102175, + "grad_norm": 7.176423072814941, + "learning_rate": 9.073995482236978e-05, + "loss": 0.9338, + "step": 7515 + }, + { + "epoch": 0.5092485940781896, + "grad_norm": 6.205722808837891, + "learning_rate": 9.073858580327196e-05, + "loss": 0.9404, + "step": 7516 + }, + { + "epoch": 0.5093163493461617, + "grad_norm": 6.857678413391113, + "learning_rate": 9.073721678417414e-05, + "loss": 0.7308, + "step": 7517 + }, + { + "epoch": 0.5093841046141337, + "grad_norm": 5.610182285308838, + "learning_rate": 9.073584776507634e-05, + "loss": 0.7608, + "step": 7518 + }, + { + "epoch": 0.5094518598821058, + "grad_norm": 5.618816375732422, + "learning_rate": 9.073447874597852e-05, + "loss": 0.6947, + "step": 7519 + }, + { + "epoch": 0.5095196151500779, + "grad_norm": 5.652774810791016, + "learning_rate": 9.07331097268807e-05, + "loss": 0.6527, + "step": 7520 + }, + { + "epoch": 0.50958737041805, + "grad_norm": 6.597334861755371, + "learning_rate": 9.073174070778288e-05, + "loss": 0.8084, + "step": 7521 + }, + { + "epoch": 0.5096551256860221, + "grad_norm": 6.7087202072143555, + "learning_rate": 9.073037168868506e-05, + "loss": 0.9075, + "step": 7522 + }, + { + "epoch": 0.5097228809539942, + "grad_norm": 6.56160306930542, + "learning_rate": 9.072900266958725e-05, + "loss": 0.9683, + "step": 7523 + }, + { + "epoch": 0.5097906362219663, + "grad_norm": 7.624117851257324, + "learning_rate": 9.072763365048943e-05, + "loss": 1.0495, + "step": 7524 + }, + { + "epoch": 0.5098583914899384, + "grad_norm": 5.690593719482422, + "learning_rate": 9.072626463139161e-05, + "loss": 1.0348, + "step": 7525 + }, + { + "epoch": 0.5099261467579105, + "grad_norm": 5.954509258270264, + "learning_rate": 9.07248956122938e-05, + "loss": 0.8622, + "step": 7526 + }, + { + "epoch": 0.5099939020258825, + "grad_norm": 6.603368759155273, + "learning_rate": 9.072352659319599e-05, + "loss": 0.6596, + "step": 7527 + }, + { + "epoch": 0.5100616572938546, + "grad_norm": 6.074961185455322, + "learning_rate": 9.072215757409817e-05, + "loss": 1.1204, + "step": 7528 + }, + { + "epoch": 0.5101294125618266, + "grad_norm": 7.444427967071533, + "learning_rate": 9.072078855500035e-05, + "loss": 0.8495, + "step": 7529 + }, + { + "epoch": 0.5101971678297987, + "grad_norm": 6.496705532073975, + "learning_rate": 9.071941953590253e-05, + "loss": 0.8796, + "step": 7530 + }, + { + "epoch": 0.5102649230977708, + "grad_norm": 6.413107872009277, + "learning_rate": 9.071805051680471e-05, + "loss": 0.8713, + "step": 7531 + }, + { + "epoch": 0.5103326783657429, + "grad_norm": 5.7435221672058105, + "learning_rate": 9.07166814977069e-05, + "loss": 0.7322, + "step": 7532 + }, + { + "epoch": 0.510400433633715, + "grad_norm": 5.807244777679443, + "learning_rate": 9.071531247860908e-05, + "loss": 0.8808, + "step": 7533 + }, + { + "epoch": 0.5104681889016871, + "grad_norm": 7.4514617919921875, + "learning_rate": 9.071394345951126e-05, + "loss": 0.9344, + "step": 7534 + }, + { + "epoch": 0.5105359441696592, + "grad_norm": 8.864927291870117, + "learning_rate": 9.071257444041344e-05, + "loss": 1.1036, + "step": 7535 + }, + { + "epoch": 0.5106036994376313, + "grad_norm": 6.26414155960083, + "learning_rate": 9.071120542131564e-05, + "loss": 0.7743, + "step": 7536 + }, + { + "epoch": 0.5106714547056034, + "grad_norm": 6.583436012268066, + "learning_rate": 9.070983640221782e-05, + "loss": 0.8079, + "step": 7537 + }, + { + "epoch": 0.5107392099735755, + "grad_norm": 6.833841323852539, + "learning_rate": 9.070846738312e-05, + "loss": 0.9583, + "step": 7538 + }, + { + "epoch": 0.5108069652415476, + "grad_norm": 7.538537502288818, + "learning_rate": 9.070709836402218e-05, + "loss": 0.9455, + "step": 7539 + }, + { + "epoch": 0.5108747205095197, + "grad_norm": 6.710206031799316, + "learning_rate": 9.070572934492436e-05, + "loss": 0.8598, + "step": 7540 + }, + { + "epoch": 0.5109424757774917, + "grad_norm": 5.586297035217285, + "learning_rate": 9.070436032582655e-05, + "loss": 0.7335, + "step": 7541 + }, + { + "epoch": 0.5110102310454638, + "grad_norm": 6.353386878967285, + "learning_rate": 9.070299130672873e-05, + "loss": 0.7107, + "step": 7542 + }, + { + "epoch": 0.5110779863134358, + "grad_norm": 7.016844272613525, + "learning_rate": 9.070162228763091e-05, + "loss": 0.8202, + "step": 7543 + }, + { + "epoch": 0.5111457415814079, + "grad_norm": 7.088860988616943, + "learning_rate": 9.07002532685331e-05, + "loss": 0.7402, + "step": 7544 + }, + { + "epoch": 0.51121349684938, + "grad_norm": 7.208921432495117, + "learning_rate": 9.069888424943527e-05, + "loss": 0.8991, + "step": 7545 + }, + { + "epoch": 0.5112812521173521, + "grad_norm": 7.308844566345215, + "learning_rate": 9.069751523033747e-05, + "loss": 0.9908, + "step": 7546 + }, + { + "epoch": 0.5113490073853242, + "grad_norm": 6.0522308349609375, + "learning_rate": 9.069614621123965e-05, + "loss": 0.788, + "step": 7547 + }, + { + "epoch": 0.5114167626532963, + "grad_norm": 6.078645706176758, + "learning_rate": 9.069477719214183e-05, + "loss": 0.7327, + "step": 7548 + }, + { + "epoch": 0.5114845179212684, + "grad_norm": 7.585491180419922, + "learning_rate": 9.069340817304401e-05, + "loss": 1.0976, + "step": 7549 + }, + { + "epoch": 0.5115522731892405, + "grad_norm": 5.420160293579102, + "learning_rate": 9.06920391539462e-05, + "loss": 0.5896, + "step": 7550 + }, + { + "epoch": 0.5116200284572126, + "grad_norm": 7.09340763092041, + "learning_rate": 9.069067013484838e-05, + "loss": 0.8464, + "step": 7551 + }, + { + "epoch": 0.5116877837251846, + "grad_norm": 5.813265323638916, + "learning_rate": 9.068930111575056e-05, + "loss": 0.685, + "step": 7552 + }, + { + "epoch": 0.5117555389931567, + "grad_norm": 8.996179580688477, + "learning_rate": 9.068793209665274e-05, + "loss": 1.0089, + "step": 7553 + }, + { + "epoch": 0.5118232942611288, + "grad_norm": 5.605385780334473, + "learning_rate": 9.068656307755493e-05, + "loss": 0.8688, + "step": 7554 + }, + { + "epoch": 0.5118910495291009, + "grad_norm": 7.07156229019165, + "learning_rate": 9.068519405845712e-05, + "loss": 0.7926, + "step": 7555 + }, + { + "epoch": 0.511958804797073, + "grad_norm": 6.07735538482666, + "learning_rate": 9.06838250393593e-05, + "loss": 0.7721, + "step": 7556 + }, + { + "epoch": 0.5120265600650451, + "grad_norm": 8.173517227172852, + "learning_rate": 9.068245602026148e-05, + "loss": 0.8789, + "step": 7557 + }, + { + "epoch": 0.5120943153330172, + "grad_norm": 6.656474590301514, + "learning_rate": 9.068108700116367e-05, + "loss": 0.9221, + "step": 7558 + }, + { + "epoch": 0.5121620706009892, + "grad_norm": 7.5140061378479, + "learning_rate": 9.067971798206585e-05, + "loss": 0.7789, + "step": 7559 + }, + { + "epoch": 0.5122298258689613, + "grad_norm": 6.517942428588867, + "learning_rate": 9.067834896296803e-05, + "loss": 0.7376, + "step": 7560 + }, + { + "epoch": 0.5122975811369334, + "grad_norm": 6.736027717590332, + "learning_rate": 9.067697994387023e-05, + "loss": 0.8051, + "step": 7561 + }, + { + "epoch": 0.5123653364049054, + "grad_norm": 11.268937110900879, + "learning_rate": 9.067561092477241e-05, + "loss": 0.7261, + "step": 7562 + }, + { + "epoch": 0.5124330916728775, + "grad_norm": 6.4452667236328125, + "learning_rate": 9.067424190567459e-05, + "loss": 0.9289, + "step": 7563 + }, + { + "epoch": 0.5125008469408496, + "grad_norm": 6.565738201141357, + "learning_rate": 9.067287288657678e-05, + "loss": 0.8039, + "step": 7564 + }, + { + "epoch": 0.5125686022088217, + "grad_norm": 5.919821739196777, + "learning_rate": 9.067150386747896e-05, + "loss": 0.7832, + "step": 7565 + }, + { + "epoch": 0.5126363574767938, + "grad_norm": 7.885809898376465, + "learning_rate": 9.067013484838114e-05, + "loss": 0.7081, + "step": 7566 + }, + { + "epoch": 0.5127041127447659, + "grad_norm": 6.188759803771973, + "learning_rate": 9.066876582928332e-05, + "loss": 0.9531, + "step": 7567 + }, + { + "epoch": 0.512771868012738, + "grad_norm": 7.5221452713012695, + "learning_rate": 9.06673968101855e-05, + "loss": 0.9727, + "step": 7568 + }, + { + "epoch": 0.5128396232807101, + "grad_norm": 6.461081027984619, + "learning_rate": 9.06660277910877e-05, + "loss": 0.9477, + "step": 7569 + }, + { + "epoch": 0.5129073785486822, + "grad_norm": 7.04016637802124, + "learning_rate": 9.066465877198988e-05, + "loss": 0.7655, + "step": 7570 + }, + { + "epoch": 0.5129751338166543, + "grad_norm": 6.721169471740723, + "learning_rate": 9.066328975289206e-05, + "loss": 0.7675, + "step": 7571 + }, + { + "epoch": 0.5130428890846264, + "grad_norm": 6.552652835845947, + "learning_rate": 9.066192073379424e-05, + "loss": 0.6938, + "step": 7572 + }, + { + "epoch": 0.5131106443525985, + "grad_norm": 5.2619452476501465, + "learning_rate": 9.066055171469643e-05, + "loss": 0.8072, + "step": 7573 + }, + { + "epoch": 0.5131783996205705, + "grad_norm": 6.426028728485107, + "learning_rate": 9.065918269559861e-05, + "loss": 0.8989, + "step": 7574 + }, + { + "epoch": 0.5132461548885426, + "grad_norm": 7.762108325958252, + "learning_rate": 9.06578136765008e-05, + "loss": 1.0512, + "step": 7575 + }, + { + "epoch": 0.5133139101565146, + "grad_norm": 5.862361431121826, + "learning_rate": 9.065644465740297e-05, + "loss": 0.8233, + "step": 7576 + }, + { + "epoch": 0.5133816654244867, + "grad_norm": 6.885676860809326, + "learning_rate": 9.065507563830515e-05, + "loss": 0.9328, + "step": 7577 + }, + { + "epoch": 0.5134494206924588, + "grad_norm": 6.061431884765625, + "learning_rate": 9.065370661920735e-05, + "loss": 0.6901, + "step": 7578 + }, + { + "epoch": 0.5135171759604309, + "grad_norm": 5.702830791473389, + "learning_rate": 9.065233760010953e-05, + "loss": 0.8137, + "step": 7579 + }, + { + "epoch": 0.513584931228403, + "grad_norm": 7.937398910522461, + "learning_rate": 9.065096858101171e-05, + "loss": 1.1357, + "step": 7580 + }, + { + "epoch": 0.5136526864963751, + "grad_norm": 6.353341102600098, + "learning_rate": 9.064959956191389e-05, + "loss": 0.8848, + "step": 7581 + }, + { + "epoch": 0.5137204417643472, + "grad_norm": 5.204100608825684, + "learning_rate": 9.064823054281608e-05, + "loss": 0.6812, + "step": 7582 + }, + { + "epoch": 0.5137881970323193, + "grad_norm": 6.726477146148682, + "learning_rate": 9.064686152371826e-05, + "loss": 0.8012, + "step": 7583 + }, + { + "epoch": 0.5138559523002914, + "grad_norm": 8.397887229919434, + "learning_rate": 9.064549250462044e-05, + "loss": 0.6674, + "step": 7584 + }, + { + "epoch": 0.5139237075682634, + "grad_norm": 6.773873329162598, + "learning_rate": 9.064412348552262e-05, + "loss": 0.7718, + "step": 7585 + }, + { + "epoch": 0.5139914628362355, + "grad_norm": 5.611907005310059, + "learning_rate": 9.06427544664248e-05, + "loss": 0.9307, + "step": 7586 + }, + { + "epoch": 0.5140592181042076, + "grad_norm": 7.3962225914001465, + "learning_rate": 9.0641385447327e-05, + "loss": 0.8438, + "step": 7587 + }, + { + "epoch": 0.5141269733721797, + "grad_norm": 7.1288580894470215, + "learning_rate": 9.064001642822918e-05, + "loss": 0.8185, + "step": 7588 + }, + { + "epoch": 0.5141947286401518, + "grad_norm": 6.204834461212158, + "learning_rate": 9.063864740913136e-05, + "loss": 0.8201, + "step": 7589 + }, + { + "epoch": 0.5142624839081239, + "grad_norm": 6.229215145111084, + "learning_rate": 9.063727839003354e-05, + "loss": 0.811, + "step": 7590 + }, + { + "epoch": 0.514330239176096, + "grad_norm": 7.3174285888671875, + "learning_rate": 9.063590937093573e-05, + "loss": 0.9745, + "step": 7591 + }, + { + "epoch": 0.514397994444068, + "grad_norm": 7.100773334503174, + "learning_rate": 9.063454035183791e-05, + "loss": 0.8132, + "step": 7592 + }, + { + "epoch": 0.5144657497120401, + "grad_norm": 7.439940929412842, + "learning_rate": 9.06331713327401e-05, + "loss": 1.042, + "step": 7593 + }, + { + "epoch": 0.5145335049800122, + "grad_norm": 6.337569713592529, + "learning_rate": 9.063180231364227e-05, + "loss": 0.6381, + "step": 7594 + }, + { + "epoch": 0.5146012602479842, + "grad_norm": 8.016763687133789, + "learning_rate": 9.063043329454445e-05, + "loss": 0.944, + "step": 7595 + }, + { + "epoch": 0.5146690155159563, + "grad_norm": 7.090301036834717, + "learning_rate": 9.062906427544665e-05, + "loss": 1.0519, + "step": 7596 + }, + { + "epoch": 0.5147367707839284, + "grad_norm": 6.24937629699707, + "learning_rate": 9.062769525634883e-05, + "loss": 0.916, + "step": 7597 + }, + { + "epoch": 0.5148045260519005, + "grad_norm": 6.316051959991455, + "learning_rate": 9.062632623725101e-05, + "loss": 0.8483, + "step": 7598 + }, + { + "epoch": 0.5148722813198726, + "grad_norm": 7.601284027099609, + "learning_rate": 9.062495721815319e-05, + "loss": 0.9532, + "step": 7599 + }, + { + "epoch": 0.5149400365878447, + "grad_norm": 6.206179618835449, + "learning_rate": 9.062358819905537e-05, + "loss": 0.8767, + "step": 7600 + }, + { + "epoch": 0.5150077918558168, + "grad_norm": 7.3104119300842285, + "learning_rate": 9.062221917995756e-05, + "loss": 0.9337, + "step": 7601 + }, + { + "epoch": 0.5150755471237889, + "grad_norm": 7.43108606338501, + "learning_rate": 9.062085016085974e-05, + "loss": 0.9725, + "step": 7602 + }, + { + "epoch": 0.515143302391761, + "grad_norm": 8.126846313476562, + "learning_rate": 9.061948114176192e-05, + "loss": 1.0367, + "step": 7603 + }, + { + "epoch": 0.5152110576597331, + "grad_norm": 6.7592926025390625, + "learning_rate": 9.061811212266412e-05, + "loss": 0.9557, + "step": 7604 + }, + { + "epoch": 0.5152788129277052, + "grad_norm": 5.046600341796875, + "learning_rate": 9.06167431035663e-05, + "loss": 0.601, + "step": 7605 + }, + { + "epoch": 0.5153465681956773, + "grad_norm": 7.141878604888916, + "learning_rate": 9.061537408446848e-05, + "loss": 0.8938, + "step": 7606 + }, + { + "epoch": 0.5154143234636493, + "grad_norm": 8.64307689666748, + "learning_rate": 9.061400506537067e-05, + "loss": 1.1668, + "step": 7607 + }, + { + "epoch": 0.5154820787316213, + "grad_norm": 6.312736988067627, + "learning_rate": 9.061263604627285e-05, + "loss": 0.7902, + "step": 7608 + }, + { + "epoch": 0.5155498339995934, + "grad_norm": 8.432990074157715, + "learning_rate": 9.061126702717503e-05, + "loss": 0.9889, + "step": 7609 + }, + { + "epoch": 0.5156175892675655, + "grad_norm": 9.558629989624023, + "learning_rate": 9.060989800807723e-05, + "loss": 0.8032, + "step": 7610 + }, + { + "epoch": 0.5156853445355376, + "grad_norm": 5.524839878082275, + "learning_rate": 9.060852898897941e-05, + "loss": 0.8536, + "step": 7611 + }, + { + "epoch": 0.5157530998035097, + "grad_norm": 6.718236923217773, + "learning_rate": 9.060715996988159e-05, + "loss": 1.1837, + "step": 7612 + }, + { + "epoch": 0.5158208550714818, + "grad_norm": 4.946925163269043, + "learning_rate": 9.060579095078377e-05, + "loss": 0.8149, + "step": 7613 + }, + { + "epoch": 0.5158886103394539, + "grad_norm": 8.684269905090332, + "learning_rate": 9.060442193168596e-05, + "loss": 1.1112, + "step": 7614 + }, + { + "epoch": 0.515956365607426, + "grad_norm": 5.708873271942139, + "learning_rate": 9.060305291258814e-05, + "loss": 0.9813, + "step": 7615 + }, + { + "epoch": 0.5160241208753981, + "grad_norm": 7.8680419921875, + "learning_rate": 9.060168389349032e-05, + "loss": 0.8572, + "step": 7616 + }, + { + "epoch": 0.5160918761433702, + "grad_norm": 7.092006206512451, + "learning_rate": 9.06003148743925e-05, + "loss": 1.0532, + "step": 7617 + }, + { + "epoch": 0.5161596314113422, + "grad_norm": 6.504335880279541, + "learning_rate": 9.059894585529468e-05, + "loss": 0.969, + "step": 7618 + }, + { + "epoch": 0.5162273866793143, + "grad_norm": 7.514725208282471, + "learning_rate": 9.059757683619688e-05, + "loss": 0.7966, + "step": 7619 + }, + { + "epoch": 0.5162951419472864, + "grad_norm": 5.896969318389893, + "learning_rate": 9.059620781709906e-05, + "loss": 0.6818, + "step": 7620 + }, + { + "epoch": 0.5163628972152585, + "grad_norm": 11.024290084838867, + "learning_rate": 9.059483879800124e-05, + "loss": 0.8365, + "step": 7621 + }, + { + "epoch": 0.5164306524832306, + "grad_norm": 6.390562534332275, + "learning_rate": 9.059346977890342e-05, + "loss": 0.9039, + "step": 7622 + }, + { + "epoch": 0.5164984077512027, + "grad_norm": 6.528719425201416, + "learning_rate": 9.05921007598056e-05, + "loss": 0.8762, + "step": 7623 + }, + { + "epoch": 0.5165661630191748, + "grad_norm": 6.40498685836792, + "learning_rate": 9.059073174070779e-05, + "loss": 1.1455, + "step": 7624 + }, + { + "epoch": 0.5166339182871468, + "grad_norm": 6.250789165496826, + "learning_rate": 9.058936272160997e-05, + "loss": 0.8012, + "step": 7625 + }, + { + "epoch": 0.5167016735551189, + "grad_norm": 5.280569076538086, + "learning_rate": 9.058799370251215e-05, + "loss": 0.5803, + "step": 7626 + }, + { + "epoch": 0.516769428823091, + "grad_norm": 6.143814563751221, + "learning_rate": 9.058662468341433e-05, + "loss": 0.8817, + "step": 7627 + }, + { + "epoch": 0.516837184091063, + "grad_norm": 5.753292560577393, + "learning_rate": 9.058525566431653e-05, + "loss": 0.6251, + "step": 7628 + }, + { + "epoch": 0.5169049393590351, + "grad_norm": 7.431488037109375, + "learning_rate": 9.058388664521871e-05, + "loss": 0.807, + "step": 7629 + }, + { + "epoch": 0.5169726946270072, + "grad_norm": 7.9407548904418945, + "learning_rate": 9.058251762612089e-05, + "loss": 0.9708, + "step": 7630 + }, + { + "epoch": 0.5170404498949793, + "grad_norm": 9.618108749389648, + "learning_rate": 9.058114860702307e-05, + "loss": 0.769, + "step": 7631 + }, + { + "epoch": 0.5171082051629514, + "grad_norm": 7.05092716217041, + "learning_rate": 9.057977958792525e-05, + "loss": 0.8202, + "step": 7632 + }, + { + "epoch": 0.5171759604309235, + "grad_norm": 8.478489875793457, + "learning_rate": 9.057841056882744e-05, + "loss": 0.836, + "step": 7633 + }, + { + "epoch": 0.5172437156988956, + "grad_norm": 9.781081199645996, + "learning_rate": 9.057704154972962e-05, + "loss": 1.1411, + "step": 7634 + }, + { + "epoch": 0.5173114709668677, + "grad_norm": 6.536010265350342, + "learning_rate": 9.05756725306318e-05, + "loss": 1.0199, + "step": 7635 + }, + { + "epoch": 0.5173792262348398, + "grad_norm": 6.633424758911133, + "learning_rate": 9.057430351153398e-05, + "loss": 0.8167, + "step": 7636 + }, + { + "epoch": 0.5174469815028119, + "grad_norm": 8.581771850585938, + "learning_rate": 9.057293449243618e-05, + "loss": 0.9926, + "step": 7637 + }, + { + "epoch": 0.517514736770784, + "grad_norm": 5.538379669189453, + "learning_rate": 9.057156547333836e-05, + "loss": 0.7093, + "step": 7638 + }, + { + "epoch": 0.5175824920387561, + "grad_norm": 6.037271022796631, + "learning_rate": 9.057019645424054e-05, + "loss": 0.9901, + "step": 7639 + }, + { + "epoch": 0.5176502473067282, + "grad_norm": 5.663453578948975, + "learning_rate": 9.056882743514272e-05, + "loss": 0.929, + "step": 7640 + }, + { + "epoch": 0.5177180025747001, + "grad_norm": 9.440011978149414, + "learning_rate": 9.05674584160449e-05, + "loss": 0.6656, + "step": 7641 + }, + { + "epoch": 0.5177857578426722, + "grad_norm": 7.314742088317871, + "learning_rate": 9.056608939694709e-05, + "loss": 1.0248, + "step": 7642 + }, + { + "epoch": 0.5178535131106443, + "grad_norm": 6.524215221405029, + "learning_rate": 9.056472037784927e-05, + "loss": 0.7533, + "step": 7643 + }, + { + "epoch": 0.5179212683786164, + "grad_norm": 7.727807998657227, + "learning_rate": 9.056335135875145e-05, + "loss": 0.8106, + "step": 7644 + }, + { + "epoch": 0.5179890236465885, + "grad_norm": 6.237912654876709, + "learning_rate": 9.056198233965363e-05, + "loss": 0.8731, + "step": 7645 + }, + { + "epoch": 0.5180567789145606, + "grad_norm": 5.7623796463012695, + "learning_rate": 9.056061332055583e-05, + "loss": 0.7102, + "step": 7646 + }, + { + "epoch": 0.5181245341825327, + "grad_norm": 7.946072578430176, + "learning_rate": 9.055924430145801e-05, + "loss": 0.9145, + "step": 7647 + }, + { + "epoch": 0.5181922894505048, + "grad_norm": 6.882560729980469, + "learning_rate": 9.055787528236019e-05, + "loss": 0.9543, + "step": 7648 + }, + { + "epoch": 0.5182600447184769, + "grad_norm": 5.425245761871338, + "learning_rate": 9.055650626326237e-05, + "loss": 0.5889, + "step": 7649 + }, + { + "epoch": 0.518327799986449, + "grad_norm": 5.8002142906188965, + "learning_rate": 9.055513724416456e-05, + "loss": 0.8603, + "step": 7650 + }, + { + "epoch": 0.518395555254421, + "grad_norm": 6.522377014160156, + "learning_rate": 9.055376822506674e-05, + "loss": 0.8288, + "step": 7651 + }, + { + "epoch": 0.5184633105223931, + "grad_norm": 6.6606221199035645, + "learning_rate": 9.055239920596892e-05, + "loss": 0.8647, + "step": 7652 + }, + { + "epoch": 0.5185310657903652, + "grad_norm": 7.66829776763916, + "learning_rate": 9.055103018687112e-05, + "loss": 1.0252, + "step": 7653 + }, + { + "epoch": 0.5185988210583373, + "grad_norm": 5.306484222412109, + "learning_rate": 9.05496611677733e-05, + "loss": 0.7053, + "step": 7654 + }, + { + "epoch": 0.5186665763263094, + "grad_norm": 5.434155464172363, + "learning_rate": 9.054829214867548e-05, + "loss": 0.7684, + "step": 7655 + }, + { + "epoch": 0.5187343315942815, + "grad_norm": 6.3002495765686035, + "learning_rate": 9.054692312957767e-05, + "loss": 0.8797, + "step": 7656 + }, + { + "epoch": 0.5188020868622535, + "grad_norm": 7.075455665588379, + "learning_rate": 9.054555411047985e-05, + "loss": 1.0045, + "step": 7657 + }, + { + "epoch": 0.5188698421302256, + "grad_norm": 8.508553504943848, + "learning_rate": 9.054418509138203e-05, + "loss": 0.8166, + "step": 7658 + }, + { + "epoch": 0.5189375973981977, + "grad_norm": 6.156308174133301, + "learning_rate": 9.054281607228421e-05, + "loss": 0.5456, + "step": 7659 + }, + { + "epoch": 0.5190053526661698, + "grad_norm": 5.993220806121826, + "learning_rate": 9.054144705318641e-05, + "loss": 0.8783, + "step": 7660 + }, + { + "epoch": 0.5190731079341419, + "grad_norm": 7.312931537628174, + "learning_rate": 9.054007803408859e-05, + "loss": 0.8758, + "step": 7661 + }, + { + "epoch": 0.519140863202114, + "grad_norm": 6.825846195220947, + "learning_rate": 9.053870901499077e-05, + "loss": 0.9189, + "step": 7662 + }, + { + "epoch": 0.519208618470086, + "grad_norm": 6.013767242431641, + "learning_rate": 9.053733999589295e-05, + "loss": 0.9579, + "step": 7663 + }, + { + "epoch": 0.5192763737380581, + "grad_norm": 7.817983627319336, + "learning_rate": 9.053597097679513e-05, + "loss": 0.7481, + "step": 7664 + }, + { + "epoch": 0.5193441290060302, + "grad_norm": 6.868429183959961, + "learning_rate": 9.053460195769732e-05, + "loss": 0.9323, + "step": 7665 + }, + { + "epoch": 0.5194118842740023, + "grad_norm": 8.720475196838379, + "learning_rate": 9.05332329385995e-05, + "loss": 0.6936, + "step": 7666 + }, + { + "epoch": 0.5194796395419744, + "grad_norm": 4.687426567077637, + "learning_rate": 9.053186391950168e-05, + "loss": 0.6088, + "step": 7667 + }, + { + "epoch": 0.5195473948099465, + "grad_norm": 7.063420295715332, + "learning_rate": 9.053049490040386e-05, + "loss": 1.0478, + "step": 7668 + }, + { + "epoch": 0.5196151500779186, + "grad_norm": 6.082928657531738, + "learning_rate": 9.052912588130606e-05, + "loss": 0.7404, + "step": 7669 + }, + { + "epoch": 0.5196829053458907, + "grad_norm": 6.03659200668335, + "learning_rate": 9.052775686220824e-05, + "loss": 0.9515, + "step": 7670 + }, + { + "epoch": 0.5197506606138628, + "grad_norm": 5.9596333503723145, + "learning_rate": 9.052638784311042e-05, + "loss": 0.789, + "step": 7671 + }, + { + "epoch": 0.5198184158818349, + "grad_norm": 6.595233917236328, + "learning_rate": 9.05250188240126e-05, + "loss": 0.8488, + "step": 7672 + }, + { + "epoch": 0.519886171149807, + "grad_norm": 5.341801643371582, + "learning_rate": 9.052364980491478e-05, + "loss": 0.7815, + "step": 7673 + }, + { + "epoch": 0.5199539264177789, + "grad_norm": 9.91911506652832, + "learning_rate": 9.052228078581697e-05, + "loss": 0.9121, + "step": 7674 + }, + { + "epoch": 0.520021681685751, + "grad_norm": 6.1603498458862305, + "learning_rate": 9.052091176671915e-05, + "loss": 0.9465, + "step": 7675 + }, + { + "epoch": 0.5200894369537231, + "grad_norm": 7.671917915344238, + "learning_rate": 9.051954274762133e-05, + "loss": 0.8225, + "step": 7676 + }, + { + "epoch": 0.5201571922216952, + "grad_norm": 7.148565769195557, + "learning_rate": 9.051817372852351e-05, + "loss": 0.6911, + "step": 7677 + }, + { + "epoch": 0.5202249474896673, + "grad_norm": 5.975964546203613, + "learning_rate": 9.05168047094257e-05, + "loss": 0.6338, + "step": 7678 + }, + { + "epoch": 0.5202927027576394, + "grad_norm": 6.8202924728393555, + "learning_rate": 9.051543569032789e-05, + "loss": 0.8961, + "step": 7679 + }, + { + "epoch": 0.5203604580256115, + "grad_norm": 6.167017459869385, + "learning_rate": 9.051406667123007e-05, + "loss": 0.7479, + "step": 7680 + }, + { + "epoch": 0.5204282132935836, + "grad_norm": 6.801616668701172, + "learning_rate": 9.051269765213225e-05, + "loss": 0.7939, + "step": 7681 + }, + { + "epoch": 0.5204959685615557, + "grad_norm": 5.709447860717773, + "learning_rate": 9.051132863303443e-05, + "loss": 0.74, + "step": 7682 + }, + { + "epoch": 0.5205637238295278, + "grad_norm": 7.522529602050781, + "learning_rate": 9.050995961393662e-05, + "loss": 0.8341, + "step": 7683 + }, + { + "epoch": 0.5206314790974998, + "grad_norm": 7.840425491333008, + "learning_rate": 9.05085905948388e-05, + "loss": 0.6591, + "step": 7684 + }, + { + "epoch": 0.5206992343654719, + "grad_norm": 5.461009979248047, + "learning_rate": 9.050722157574098e-05, + "loss": 0.6172, + "step": 7685 + }, + { + "epoch": 0.520766989633444, + "grad_norm": 6.047004222869873, + "learning_rate": 9.050585255664316e-05, + "loss": 0.8012, + "step": 7686 + }, + { + "epoch": 0.5208347449014161, + "grad_norm": 7.2090630531311035, + "learning_rate": 9.050448353754534e-05, + "loss": 0.9606, + "step": 7687 + }, + { + "epoch": 0.5209025001693882, + "grad_norm": 6.153731822967529, + "learning_rate": 9.050311451844754e-05, + "loss": 0.8408, + "step": 7688 + }, + { + "epoch": 0.5209702554373603, + "grad_norm": 6.858744144439697, + "learning_rate": 9.050174549934972e-05, + "loss": 0.9427, + "step": 7689 + }, + { + "epoch": 0.5210380107053323, + "grad_norm": 6.979424476623535, + "learning_rate": 9.05003764802519e-05, + "loss": 0.7301, + "step": 7690 + }, + { + "epoch": 0.5211057659733044, + "grad_norm": 6.714144706726074, + "learning_rate": 9.049900746115408e-05, + "loss": 1.0239, + "step": 7691 + }, + { + "epoch": 0.5211735212412765, + "grad_norm": 6.382346153259277, + "learning_rate": 9.049763844205627e-05, + "loss": 0.7648, + "step": 7692 + }, + { + "epoch": 0.5212412765092486, + "grad_norm": 6.722228050231934, + "learning_rate": 9.049626942295845e-05, + "loss": 0.5747, + "step": 7693 + }, + { + "epoch": 0.5213090317772207, + "grad_norm": 8.618867874145508, + "learning_rate": 9.049490040386063e-05, + "loss": 0.9178, + "step": 7694 + }, + { + "epoch": 0.5213767870451927, + "grad_norm": 7.99964714050293, + "learning_rate": 9.049353138476281e-05, + "loss": 0.989, + "step": 7695 + }, + { + "epoch": 0.5214445423131648, + "grad_norm": 8.504440307617188, + "learning_rate": 9.0492162365665e-05, + "loss": 0.7935, + "step": 7696 + }, + { + "epoch": 0.5215122975811369, + "grad_norm": 6.088517665863037, + "learning_rate": 9.049079334656719e-05, + "loss": 0.894, + "step": 7697 + }, + { + "epoch": 0.521580052849109, + "grad_norm": 7.038356304168701, + "learning_rate": 9.048942432746937e-05, + "loss": 0.8796, + "step": 7698 + }, + { + "epoch": 0.5216478081170811, + "grad_norm": 6.817752361297607, + "learning_rate": 9.048805530837155e-05, + "loss": 0.8724, + "step": 7699 + }, + { + "epoch": 0.5217155633850532, + "grad_norm": 6.776190280914307, + "learning_rate": 9.048668628927374e-05, + "loss": 0.8975, + "step": 7700 + }, + { + "epoch": 0.5217833186530253, + "grad_norm": 6.522316932678223, + "learning_rate": 9.048531727017592e-05, + "loss": 0.7861, + "step": 7701 + }, + { + "epoch": 0.5218510739209974, + "grad_norm": 8.256267547607422, + "learning_rate": 9.04839482510781e-05, + "loss": 1.1714, + "step": 7702 + }, + { + "epoch": 0.5219188291889695, + "grad_norm": 6.675931930541992, + "learning_rate": 9.04825792319803e-05, + "loss": 0.8718, + "step": 7703 + }, + { + "epoch": 0.5219865844569416, + "grad_norm": 7.407254695892334, + "learning_rate": 9.048121021288248e-05, + "loss": 1.1478, + "step": 7704 + }, + { + "epoch": 0.5220543397249137, + "grad_norm": 7.207369327545166, + "learning_rate": 9.047984119378466e-05, + "loss": 0.9569, + "step": 7705 + }, + { + "epoch": 0.5221220949928856, + "grad_norm": 5.364320278167725, + "learning_rate": 9.047847217468685e-05, + "loss": 0.7651, + "step": 7706 + }, + { + "epoch": 0.5221898502608577, + "grad_norm": 6.586124897003174, + "learning_rate": 9.047710315558903e-05, + "loss": 0.8322, + "step": 7707 + }, + { + "epoch": 0.5222576055288298, + "grad_norm": 6.659854412078857, + "learning_rate": 9.047573413649121e-05, + "loss": 0.8912, + "step": 7708 + }, + { + "epoch": 0.5223253607968019, + "grad_norm": 7.059124946594238, + "learning_rate": 9.047436511739339e-05, + "loss": 1.012, + "step": 7709 + }, + { + "epoch": 0.522393116064774, + "grad_norm": 6.54586124420166, + "learning_rate": 9.047299609829557e-05, + "loss": 0.9704, + "step": 7710 + }, + { + "epoch": 0.5224608713327461, + "grad_norm": 5.587647438049316, + "learning_rate": 9.047162707919777e-05, + "loss": 0.6115, + "step": 7711 + }, + { + "epoch": 0.5225286266007182, + "grad_norm": 6.951663970947266, + "learning_rate": 9.047025806009995e-05, + "loss": 0.7751, + "step": 7712 + }, + { + "epoch": 0.5225963818686903, + "grad_norm": 7.021738052368164, + "learning_rate": 9.046888904100213e-05, + "loss": 1.1632, + "step": 7713 + }, + { + "epoch": 0.5226641371366624, + "grad_norm": 6.46588134765625, + "learning_rate": 9.046752002190431e-05, + "loss": 0.7517, + "step": 7714 + }, + { + "epoch": 0.5227318924046345, + "grad_norm": 6.638917922973633, + "learning_rate": 9.04661510028065e-05, + "loss": 1.0171, + "step": 7715 + }, + { + "epoch": 0.5227996476726066, + "grad_norm": 5.955547332763672, + "learning_rate": 9.046478198370868e-05, + "loss": 0.9812, + "step": 7716 + }, + { + "epoch": 0.5228674029405787, + "grad_norm": 6.903220176696777, + "learning_rate": 9.046341296461086e-05, + "loss": 0.7894, + "step": 7717 + }, + { + "epoch": 0.5229351582085507, + "grad_norm": 7.656357288360596, + "learning_rate": 9.046204394551304e-05, + "loss": 1.0162, + "step": 7718 + }, + { + "epoch": 0.5230029134765228, + "grad_norm": 8.025165557861328, + "learning_rate": 9.046067492641522e-05, + "loss": 0.9158, + "step": 7719 + }, + { + "epoch": 0.5230706687444949, + "grad_norm": 7.652121067047119, + "learning_rate": 9.045930590731742e-05, + "loss": 0.9347, + "step": 7720 + }, + { + "epoch": 0.523138424012467, + "grad_norm": 7.028977870941162, + "learning_rate": 9.04579368882196e-05, + "loss": 0.9914, + "step": 7721 + }, + { + "epoch": 0.523206179280439, + "grad_norm": 6.23792028427124, + "learning_rate": 9.045656786912178e-05, + "loss": 0.7154, + "step": 7722 + }, + { + "epoch": 0.5232739345484111, + "grad_norm": 8.56432056427002, + "learning_rate": 9.045519885002396e-05, + "loss": 0.9627, + "step": 7723 + }, + { + "epoch": 0.5233416898163832, + "grad_norm": 6.733086109161377, + "learning_rate": 9.045382983092615e-05, + "loss": 0.8932, + "step": 7724 + }, + { + "epoch": 0.5234094450843553, + "grad_norm": 6.419559955596924, + "learning_rate": 9.045246081182833e-05, + "loss": 0.6503, + "step": 7725 + }, + { + "epoch": 0.5234772003523274, + "grad_norm": 6.498406887054443, + "learning_rate": 9.045109179273051e-05, + "loss": 1.0333, + "step": 7726 + }, + { + "epoch": 0.5235449556202995, + "grad_norm": 6.396055698394775, + "learning_rate": 9.044972277363269e-05, + "loss": 0.7628, + "step": 7727 + }, + { + "epoch": 0.5236127108882715, + "grad_norm": 5.671381950378418, + "learning_rate": 9.044835375453487e-05, + "loss": 0.7091, + "step": 7728 + }, + { + "epoch": 0.5236804661562436, + "grad_norm": 7.36892557144165, + "learning_rate": 9.044698473543707e-05, + "loss": 0.8571, + "step": 7729 + }, + { + "epoch": 0.5237482214242157, + "grad_norm": 7.574361324310303, + "learning_rate": 9.044561571633925e-05, + "loss": 0.9784, + "step": 7730 + }, + { + "epoch": 0.5238159766921878, + "grad_norm": 7.9222540855407715, + "learning_rate": 9.044424669724143e-05, + "loss": 0.8634, + "step": 7731 + }, + { + "epoch": 0.5238837319601599, + "grad_norm": 6.013153076171875, + "learning_rate": 9.044287767814361e-05, + "loss": 0.9073, + "step": 7732 + }, + { + "epoch": 0.523951487228132, + "grad_norm": 7.480499744415283, + "learning_rate": 9.044150865904579e-05, + "loss": 0.9001, + "step": 7733 + }, + { + "epoch": 0.5240192424961041, + "grad_norm": 5.559460163116455, + "learning_rate": 9.044013963994798e-05, + "loss": 0.7644, + "step": 7734 + }, + { + "epoch": 0.5240869977640762, + "grad_norm": 5.924765110015869, + "learning_rate": 9.043877062085016e-05, + "loss": 0.6714, + "step": 7735 + }, + { + "epoch": 0.5241547530320483, + "grad_norm": 5.304020404815674, + "learning_rate": 9.043740160175234e-05, + "loss": 0.6169, + "step": 7736 + }, + { + "epoch": 0.5242225083000204, + "grad_norm": 6.051050662994385, + "learning_rate": 9.043603258265452e-05, + "loss": 0.9661, + "step": 7737 + }, + { + "epoch": 0.5242902635679925, + "grad_norm": 7.776313304901123, + "learning_rate": 9.043466356355672e-05, + "loss": 0.9922, + "step": 7738 + }, + { + "epoch": 0.5243580188359644, + "grad_norm": 6.945296287536621, + "learning_rate": 9.04332945444589e-05, + "loss": 0.8607, + "step": 7739 + }, + { + "epoch": 0.5244257741039365, + "grad_norm": 7.384977340698242, + "learning_rate": 9.043192552536108e-05, + "loss": 1.1737, + "step": 7740 + }, + { + "epoch": 0.5244935293719086, + "grad_norm": 7.252483367919922, + "learning_rate": 9.043055650626326e-05, + "loss": 0.8508, + "step": 7741 + }, + { + "epoch": 0.5245612846398807, + "grad_norm": 6.793081283569336, + "learning_rate": 9.042918748716544e-05, + "loss": 0.8632, + "step": 7742 + }, + { + "epoch": 0.5246290399078528, + "grad_norm": 6.1907057762146, + "learning_rate": 9.042781846806763e-05, + "loss": 0.8447, + "step": 7743 + }, + { + "epoch": 0.5246967951758249, + "grad_norm": 6.956373691558838, + "learning_rate": 9.042644944896981e-05, + "loss": 0.7821, + "step": 7744 + }, + { + "epoch": 0.524764550443797, + "grad_norm": 6.5989484786987305, + "learning_rate": 9.0425080429872e-05, + "loss": 0.9129, + "step": 7745 + }, + { + "epoch": 0.5248323057117691, + "grad_norm": 6.416126728057861, + "learning_rate": 9.042371141077419e-05, + "loss": 0.7622, + "step": 7746 + }, + { + "epoch": 0.5249000609797412, + "grad_norm": 7.08583402633667, + "learning_rate": 9.042234239167637e-05, + "loss": 0.868, + "step": 7747 + }, + { + "epoch": 0.5249678162477133, + "grad_norm": 6.688178539276123, + "learning_rate": 9.042097337257855e-05, + "loss": 0.8158, + "step": 7748 + }, + { + "epoch": 0.5250355715156854, + "grad_norm": 6.829257488250732, + "learning_rate": 9.041960435348074e-05, + "loss": 1.0495, + "step": 7749 + }, + { + "epoch": 0.5251033267836575, + "grad_norm": 9.319602966308594, + "learning_rate": 9.041823533438292e-05, + "loss": 1.1006, + "step": 7750 + }, + { + "epoch": 0.5251710820516295, + "grad_norm": 5.709659576416016, + "learning_rate": 9.04168663152851e-05, + "loss": 0.8396, + "step": 7751 + }, + { + "epoch": 0.5252388373196016, + "grad_norm": 7.805490493774414, + "learning_rate": 9.04154972961873e-05, + "loss": 0.8824, + "step": 7752 + }, + { + "epoch": 0.5253065925875737, + "grad_norm": 6.146732330322266, + "learning_rate": 9.041412827708948e-05, + "loss": 0.6136, + "step": 7753 + }, + { + "epoch": 0.5253743478555458, + "grad_norm": 7.149491310119629, + "learning_rate": 9.041275925799166e-05, + "loss": 0.8545, + "step": 7754 + }, + { + "epoch": 0.5254421031235178, + "grad_norm": 6.13656759262085, + "learning_rate": 9.041139023889384e-05, + "loss": 0.6775, + "step": 7755 + }, + { + "epoch": 0.5255098583914899, + "grad_norm": 7.198244571685791, + "learning_rate": 9.041002121979602e-05, + "loss": 0.8732, + "step": 7756 + }, + { + "epoch": 0.525577613659462, + "grad_norm": 7.588368892669678, + "learning_rate": 9.040865220069821e-05, + "loss": 0.8651, + "step": 7757 + }, + { + "epoch": 0.5256453689274341, + "grad_norm": 6.695016860961914, + "learning_rate": 9.040728318160039e-05, + "loss": 0.8592, + "step": 7758 + }, + { + "epoch": 0.5257131241954062, + "grad_norm": 6.87295389175415, + "learning_rate": 9.040591416250257e-05, + "loss": 0.8187, + "step": 7759 + }, + { + "epoch": 0.5257808794633783, + "grad_norm": 6.2048163414001465, + "learning_rate": 9.040454514340475e-05, + "loss": 0.9454, + "step": 7760 + }, + { + "epoch": 0.5258486347313504, + "grad_norm": 6.1639933586120605, + "learning_rate": 9.040317612430695e-05, + "loss": 0.7686, + "step": 7761 + }, + { + "epoch": 0.5259163899993224, + "grad_norm": 6.537137508392334, + "learning_rate": 9.040180710520913e-05, + "loss": 0.8203, + "step": 7762 + }, + { + "epoch": 0.5259841452672945, + "grad_norm": 6.688928127288818, + "learning_rate": 9.040043808611131e-05, + "loss": 0.7651, + "step": 7763 + }, + { + "epoch": 0.5260519005352666, + "grad_norm": 7.225207805633545, + "learning_rate": 9.039906906701349e-05, + "loss": 0.7461, + "step": 7764 + }, + { + "epoch": 0.5261196558032387, + "grad_norm": 6.173831939697266, + "learning_rate": 9.039770004791567e-05, + "loss": 0.6654, + "step": 7765 + }, + { + "epoch": 0.5261874110712108, + "grad_norm": 6.1730146408081055, + "learning_rate": 9.039633102881786e-05, + "loss": 0.7595, + "step": 7766 + }, + { + "epoch": 0.5262551663391829, + "grad_norm": 6.485074520111084, + "learning_rate": 9.039496200972004e-05, + "loss": 0.8252, + "step": 7767 + }, + { + "epoch": 0.526322921607155, + "grad_norm": 5.83876895904541, + "learning_rate": 9.039359299062222e-05, + "loss": 0.7358, + "step": 7768 + }, + { + "epoch": 0.5263906768751271, + "grad_norm": 6.3718953132629395, + "learning_rate": 9.03922239715244e-05, + "loss": 0.8205, + "step": 7769 + }, + { + "epoch": 0.5264584321430992, + "grad_norm": 7.796750068664551, + "learning_rate": 9.03908549524266e-05, + "loss": 1.285, + "step": 7770 + }, + { + "epoch": 0.5265261874110712, + "grad_norm": 6.802064418792725, + "learning_rate": 9.038948593332878e-05, + "loss": 0.8769, + "step": 7771 + }, + { + "epoch": 0.5265939426790432, + "grad_norm": 6.941751956939697, + "learning_rate": 9.038811691423096e-05, + "loss": 0.8375, + "step": 7772 + }, + { + "epoch": 0.5266616979470153, + "grad_norm": 4.891567707061768, + "learning_rate": 9.038674789513314e-05, + "loss": 0.7047, + "step": 7773 + }, + { + "epoch": 0.5267294532149874, + "grad_norm": 6.294151306152344, + "learning_rate": 9.038537887603532e-05, + "loss": 0.9026, + "step": 7774 + }, + { + "epoch": 0.5267972084829595, + "grad_norm": 8.100536346435547, + "learning_rate": 9.038400985693751e-05, + "loss": 0.912, + "step": 7775 + }, + { + "epoch": 0.5268649637509316, + "grad_norm": 6.022440433502197, + "learning_rate": 9.038264083783969e-05, + "loss": 0.9056, + "step": 7776 + }, + { + "epoch": 0.5269327190189037, + "grad_norm": 5.4183454513549805, + "learning_rate": 9.038127181874187e-05, + "loss": 0.883, + "step": 7777 + }, + { + "epoch": 0.5270004742868758, + "grad_norm": 7.119368076324463, + "learning_rate": 9.037990279964405e-05, + "loss": 0.8019, + "step": 7778 + }, + { + "epoch": 0.5270682295548479, + "grad_norm": 7.654230117797852, + "learning_rate": 9.037853378054625e-05, + "loss": 0.9149, + "step": 7779 + }, + { + "epoch": 0.52713598482282, + "grad_norm": 8.382317543029785, + "learning_rate": 9.037716476144843e-05, + "loss": 0.8678, + "step": 7780 + }, + { + "epoch": 0.5272037400907921, + "grad_norm": 6.171057224273682, + "learning_rate": 9.037579574235061e-05, + "loss": 0.7199, + "step": 7781 + }, + { + "epoch": 0.5272714953587642, + "grad_norm": 7.584953784942627, + "learning_rate": 9.037442672325279e-05, + "loss": 1.1612, + "step": 7782 + }, + { + "epoch": 0.5273392506267363, + "grad_norm": 8.255087852478027, + "learning_rate": 9.037305770415497e-05, + "loss": 1.0615, + "step": 7783 + }, + { + "epoch": 0.5274070058947083, + "grad_norm": 5.925463676452637, + "learning_rate": 9.037168868505716e-05, + "loss": 0.5921, + "step": 7784 + }, + { + "epoch": 0.5274747611626804, + "grad_norm": 6.816803932189941, + "learning_rate": 9.037031966595934e-05, + "loss": 0.7954, + "step": 7785 + }, + { + "epoch": 0.5275425164306525, + "grad_norm": 5.511174201965332, + "learning_rate": 9.036895064686152e-05, + "loss": 0.6013, + "step": 7786 + }, + { + "epoch": 0.5276102716986246, + "grad_norm": 8.431069374084473, + "learning_rate": 9.03675816277637e-05, + "loss": 1.1493, + "step": 7787 + }, + { + "epoch": 0.5276780269665966, + "grad_norm": 7.5087666511535645, + "learning_rate": 9.036621260866588e-05, + "loss": 0.9416, + "step": 7788 + }, + { + "epoch": 0.5277457822345687, + "grad_norm": 6.478548049926758, + "learning_rate": 9.036484358956808e-05, + "loss": 0.7571, + "step": 7789 + }, + { + "epoch": 0.5278135375025408, + "grad_norm": 6.319628715515137, + "learning_rate": 9.036347457047026e-05, + "loss": 1.076, + "step": 7790 + }, + { + "epoch": 0.5278812927705129, + "grad_norm": 7.512424945831299, + "learning_rate": 9.036210555137244e-05, + "loss": 1.2705, + "step": 7791 + }, + { + "epoch": 0.527949048038485, + "grad_norm": 9.076953887939453, + "learning_rate": 9.036073653227463e-05, + "loss": 0.723, + "step": 7792 + }, + { + "epoch": 0.5280168033064571, + "grad_norm": 7.088381767272949, + "learning_rate": 9.035936751317681e-05, + "loss": 0.8198, + "step": 7793 + }, + { + "epoch": 0.5280845585744292, + "grad_norm": 5.79082727432251, + "learning_rate": 9.035799849407899e-05, + "loss": 0.6164, + "step": 7794 + }, + { + "epoch": 0.5281523138424012, + "grad_norm": 5.18831729888916, + "learning_rate": 9.035662947498119e-05, + "loss": 0.7178, + "step": 7795 + }, + { + "epoch": 0.5282200691103733, + "grad_norm": 5.686639308929443, + "learning_rate": 9.035526045588337e-05, + "loss": 0.8515, + "step": 7796 + }, + { + "epoch": 0.5282878243783454, + "grad_norm": 5.51866340637207, + "learning_rate": 9.035389143678555e-05, + "loss": 0.7451, + "step": 7797 + }, + { + "epoch": 0.5283555796463175, + "grad_norm": 5.142022609710693, + "learning_rate": 9.035252241768774e-05, + "loss": 0.8063, + "step": 7798 + }, + { + "epoch": 0.5284233349142896, + "grad_norm": 8.204577445983887, + "learning_rate": 9.035115339858992e-05, + "loss": 1.0189, + "step": 7799 + }, + { + "epoch": 0.5284910901822617, + "grad_norm": 7.62236213684082, + "learning_rate": 9.03497843794921e-05, + "loss": 1.0905, + "step": 7800 + }, + { + "epoch": 0.5285588454502338, + "grad_norm": 7.067042827606201, + "learning_rate": 9.034841536039428e-05, + "loss": 0.8651, + "step": 7801 + }, + { + "epoch": 0.5286266007182059, + "grad_norm": 5.675161361694336, + "learning_rate": 9.034704634129648e-05, + "loss": 0.8717, + "step": 7802 + }, + { + "epoch": 0.528694355986178, + "grad_norm": 6.024029731750488, + "learning_rate": 9.034567732219866e-05, + "loss": 0.8326, + "step": 7803 + }, + { + "epoch": 0.52876211125415, + "grad_norm": 6.060807228088379, + "learning_rate": 9.034430830310084e-05, + "loss": 0.7887, + "step": 7804 + }, + { + "epoch": 0.528829866522122, + "grad_norm": 7.034177780151367, + "learning_rate": 9.034293928400302e-05, + "loss": 0.8416, + "step": 7805 + }, + { + "epoch": 0.5288976217900941, + "grad_norm": 6.663289546966553, + "learning_rate": 9.03415702649052e-05, + "loss": 0.8078, + "step": 7806 + }, + { + "epoch": 0.5289653770580662, + "grad_norm": 5.6733880043029785, + "learning_rate": 9.034020124580739e-05, + "loss": 0.8106, + "step": 7807 + }, + { + "epoch": 0.5290331323260383, + "grad_norm": 7.0584940910339355, + "learning_rate": 9.033883222670957e-05, + "loss": 0.8558, + "step": 7808 + }, + { + "epoch": 0.5291008875940104, + "grad_norm": 7.715261459350586, + "learning_rate": 9.033746320761175e-05, + "loss": 0.9663, + "step": 7809 + }, + { + "epoch": 0.5291686428619825, + "grad_norm": 6.987175464630127, + "learning_rate": 9.033609418851393e-05, + "loss": 0.8837, + "step": 7810 + }, + { + "epoch": 0.5292363981299546, + "grad_norm": 6.454022407531738, + "learning_rate": 9.033472516941611e-05, + "loss": 0.7697, + "step": 7811 + }, + { + "epoch": 0.5293041533979267, + "grad_norm": 6.083451271057129, + "learning_rate": 9.03333561503183e-05, + "loss": 0.7759, + "step": 7812 + }, + { + "epoch": 0.5293719086658988, + "grad_norm": 6.736910343170166, + "learning_rate": 9.033198713122049e-05, + "loss": 0.7654, + "step": 7813 + }, + { + "epoch": 0.5294396639338709, + "grad_norm": 7.494657516479492, + "learning_rate": 9.033061811212267e-05, + "loss": 0.9894, + "step": 7814 + }, + { + "epoch": 0.529507419201843, + "grad_norm": 5.999954700469971, + "learning_rate": 9.032924909302485e-05, + "loss": 0.7583, + "step": 7815 + }, + { + "epoch": 0.5295751744698151, + "grad_norm": 6.741001605987549, + "learning_rate": 9.032788007392704e-05, + "loss": 0.6704, + "step": 7816 + }, + { + "epoch": 0.5296429297377871, + "grad_norm": 7.073209762573242, + "learning_rate": 9.032651105482922e-05, + "loss": 0.8104, + "step": 7817 + }, + { + "epoch": 0.5297106850057592, + "grad_norm": 6.424438953399658, + "learning_rate": 9.03251420357314e-05, + "loss": 0.8042, + "step": 7818 + }, + { + "epoch": 0.5297784402737313, + "grad_norm": 5.467334270477295, + "learning_rate": 9.032377301663358e-05, + "loss": 0.7108, + "step": 7819 + }, + { + "epoch": 0.5298461955417033, + "grad_norm": 7.322316646575928, + "learning_rate": 9.032240399753576e-05, + "loss": 0.9461, + "step": 7820 + }, + { + "epoch": 0.5299139508096754, + "grad_norm": 6.560715198516846, + "learning_rate": 9.032103497843796e-05, + "loss": 1.135, + "step": 7821 + }, + { + "epoch": 0.5299817060776475, + "grad_norm": 6.125954627990723, + "learning_rate": 9.031966595934014e-05, + "loss": 1.0004, + "step": 7822 + }, + { + "epoch": 0.5300494613456196, + "grad_norm": 6.26869535446167, + "learning_rate": 9.031829694024232e-05, + "loss": 0.7594, + "step": 7823 + }, + { + "epoch": 0.5301172166135917, + "grad_norm": 7.5978193283081055, + "learning_rate": 9.03169279211445e-05, + "loss": 0.7051, + "step": 7824 + }, + { + "epoch": 0.5301849718815638, + "grad_norm": 9.408920288085938, + "learning_rate": 9.031555890204669e-05, + "loss": 0.7884, + "step": 7825 + }, + { + "epoch": 0.5302527271495359, + "grad_norm": 7.134509086608887, + "learning_rate": 9.031418988294887e-05, + "loss": 0.844, + "step": 7826 + }, + { + "epoch": 0.530320482417508, + "grad_norm": 9.650331497192383, + "learning_rate": 9.031282086385105e-05, + "loss": 0.9636, + "step": 7827 + }, + { + "epoch": 0.53038823768548, + "grad_norm": 8.131028175354004, + "learning_rate": 9.031145184475323e-05, + "loss": 1.1321, + "step": 7828 + }, + { + "epoch": 0.5304559929534521, + "grad_norm": 6.363440036773682, + "learning_rate": 9.031008282565541e-05, + "loss": 0.7437, + "step": 7829 + }, + { + "epoch": 0.5305237482214242, + "grad_norm": 6.6181440353393555, + "learning_rate": 9.030871380655761e-05, + "loss": 1.1998, + "step": 7830 + }, + { + "epoch": 0.5305915034893963, + "grad_norm": 6.380220413208008, + "learning_rate": 9.030734478745979e-05, + "loss": 0.8995, + "step": 7831 + }, + { + "epoch": 0.5306592587573684, + "grad_norm": 7.590890407562256, + "learning_rate": 9.030597576836197e-05, + "loss": 0.8736, + "step": 7832 + }, + { + "epoch": 0.5307270140253405, + "grad_norm": 7.453338146209717, + "learning_rate": 9.030460674926415e-05, + "loss": 0.7418, + "step": 7833 + }, + { + "epoch": 0.5307947692933126, + "grad_norm": 5.30055570602417, + "learning_rate": 9.030323773016634e-05, + "loss": 0.7002, + "step": 7834 + }, + { + "epoch": 0.5308625245612847, + "grad_norm": 7.599035739898682, + "learning_rate": 9.030186871106852e-05, + "loss": 1.0054, + "step": 7835 + }, + { + "epoch": 0.5309302798292568, + "grad_norm": 7.773530006408691, + "learning_rate": 9.03004996919707e-05, + "loss": 0.8603, + "step": 7836 + }, + { + "epoch": 0.5309980350972288, + "grad_norm": 7.378914833068848, + "learning_rate": 9.029913067287288e-05, + "loss": 0.8082, + "step": 7837 + }, + { + "epoch": 0.5310657903652009, + "grad_norm": 5.327150344848633, + "learning_rate": 9.029776165377508e-05, + "loss": 0.7642, + "step": 7838 + }, + { + "epoch": 0.5311335456331729, + "grad_norm": 6.01016092300415, + "learning_rate": 9.029639263467726e-05, + "loss": 1.0466, + "step": 7839 + }, + { + "epoch": 0.531201300901145, + "grad_norm": 4.763113498687744, + "learning_rate": 9.029502361557944e-05, + "loss": 0.7451, + "step": 7840 + }, + { + "epoch": 0.5312690561691171, + "grad_norm": 5.175201416015625, + "learning_rate": 9.029365459648163e-05, + "loss": 0.6928, + "step": 7841 + }, + { + "epoch": 0.5313368114370892, + "grad_norm": 9.04288387298584, + "learning_rate": 9.029228557738381e-05, + "loss": 0.7087, + "step": 7842 + }, + { + "epoch": 0.5314045667050613, + "grad_norm": 6.132844924926758, + "learning_rate": 9.029091655828599e-05, + "loss": 0.8231, + "step": 7843 + }, + { + "epoch": 0.5314723219730334, + "grad_norm": 7.074441909790039, + "learning_rate": 9.028954753918819e-05, + "loss": 0.7966, + "step": 7844 + }, + { + "epoch": 0.5315400772410055, + "grad_norm": 7.046668529510498, + "learning_rate": 9.028817852009037e-05, + "loss": 0.9588, + "step": 7845 + }, + { + "epoch": 0.5316078325089776, + "grad_norm": 6.8423662185668945, + "learning_rate": 9.028680950099255e-05, + "loss": 0.9413, + "step": 7846 + }, + { + "epoch": 0.5316755877769497, + "grad_norm": 5.093846797943115, + "learning_rate": 9.028544048189473e-05, + "loss": 0.6967, + "step": 7847 + }, + { + "epoch": 0.5317433430449218, + "grad_norm": 7.116734027862549, + "learning_rate": 9.028407146279692e-05, + "loss": 0.8084, + "step": 7848 + }, + { + "epoch": 0.5318110983128939, + "grad_norm": 6.06544828414917, + "learning_rate": 9.02827024436991e-05, + "loss": 0.8041, + "step": 7849 + }, + { + "epoch": 0.531878853580866, + "grad_norm": 6.238218307495117, + "learning_rate": 9.028133342460128e-05, + "loss": 0.9562, + "step": 7850 + }, + { + "epoch": 0.531946608848838, + "grad_norm": 5.939441680908203, + "learning_rate": 9.027996440550346e-05, + "loss": 0.8226, + "step": 7851 + }, + { + "epoch": 0.5320143641168101, + "grad_norm": 5.409734725952148, + "learning_rate": 9.027859538640564e-05, + "loss": 0.7485, + "step": 7852 + }, + { + "epoch": 0.5320821193847821, + "grad_norm": 5.72265625, + "learning_rate": 9.027722636730784e-05, + "loss": 0.691, + "step": 7853 + }, + { + "epoch": 0.5321498746527542, + "grad_norm": 5.743767261505127, + "learning_rate": 9.027585734821002e-05, + "loss": 0.8193, + "step": 7854 + }, + { + "epoch": 0.5322176299207263, + "grad_norm": 7.155900955200195, + "learning_rate": 9.02744883291122e-05, + "loss": 1.1046, + "step": 7855 + }, + { + "epoch": 0.5322853851886984, + "grad_norm": 5.2816996574401855, + "learning_rate": 9.027311931001438e-05, + "loss": 0.5548, + "step": 7856 + }, + { + "epoch": 0.5323531404566705, + "grad_norm": 8.357234001159668, + "learning_rate": 9.027175029091657e-05, + "loss": 0.9597, + "step": 7857 + }, + { + "epoch": 0.5324208957246426, + "grad_norm": 6.937591075897217, + "learning_rate": 9.027038127181875e-05, + "loss": 1.0591, + "step": 7858 + }, + { + "epoch": 0.5324886509926147, + "grad_norm": 9.02571964263916, + "learning_rate": 9.026901225272093e-05, + "loss": 0.9186, + "step": 7859 + }, + { + "epoch": 0.5325564062605868, + "grad_norm": 6.292126178741455, + "learning_rate": 9.026764323362311e-05, + "loss": 0.9691, + "step": 7860 + }, + { + "epoch": 0.5326241615285588, + "grad_norm": 6.714123725891113, + "learning_rate": 9.026627421452529e-05, + "loss": 0.6972, + "step": 7861 + }, + { + "epoch": 0.5326919167965309, + "grad_norm": 8.798256874084473, + "learning_rate": 9.026490519542749e-05, + "loss": 1.237, + "step": 7862 + }, + { + "epoch": 0.532759672064503, + "grad_norm": 7.367827892303467, + "learning_rate": 9.026353617632967e-05, + "loss": 0.8725, + "step": 7863 + }, + { + "epoch": 0.5328274273324751, + "grad_norm": 7.99265193939209, + "learning_rate": 9.026216715723185e-05, + "loss": 0.7761, + "step": 7864 + }, + { + "epoch": 0.5328951826004472, + "grad_norm": 6.861504554748535, + "learning_rate": 9.026079813813403e-05, + "loss": 1.0259, + "step": 7865 + }, + { + "epoch": 0.5329629378684193, + "grad_norm": 6.661327838897705, + "learning_rate": 9.025942911903621e-05, + "loss": 0.7245, + "step": 7866 + }, + { + "epoch": 0.5330306931363914, + "grad_norm": 6.973964691162109, + "learning_rate": 9.02580600999384e-05, + "loss": 1.0628, + "step": 7867 + }, + { + "epoch": 0.5330984484043635, + "grad_norm": 6.732471466064453, + "learning_rate": 9.025669108084058e-05, + "loss": 0.7972, + "step": 7868 + }, + { + "epoch": 0.5331662036723355, + "grad_norm": 5.995905876159668, + "learning_rate": 9.025532206174276e-05, + "loss": 0.831, + "step": 7869 + }, + { + "epoch": 0.5332339589403076, + "grad_norm": 7.870401382446289, + "learning_rate": 9.025395304264494e-05, + "loss": 0.9793, + "step": 7870 + }, + { + "epoch": 0.5333017142082797, + "grad_norm": 5.577757835388184, + "learning_rate": 9.025258402354714e-05, + "loss": 0.7032, + "step": 7871 + }, + { + "epoch": 0.5333694694762517, + "grad_norm": 7.106391429901123, + "learning_rate": 9.025121500444932e-05, + "loss": 0.8424, + "step": 7872 + }, + { + "epoch": 0.5334372247442238, + "grad_norm": 5.373198986053467, + "learning_rate": 9.02498459853515e-05, + "loss": 0.8357, + "step": 7873 + }, + { + "epoch": 0.5335049800121959, + "grad_norm": 6.655058860778809, + "learning_rate": 9.024847696625368e-05, + "loss": 1.0405, + "step": 7874 + }, + { + "epoch": 0.533572735280168, + "grad_norm": 5.896321773529053, + "learning_rate": 9.024710794715586e-05, + "loss": 1.0676, + "step": 7875 + }, + { + "epoch": 0.5336404905481401, + "grad_norm": 7.391895771026611, + "learning_rate": 9.024573892805805e-05, + "loss": 0.9807, + "step": 7876 + }, + { + "epoch": 0.5337082458161122, + "grad_norm": 8.472131729125977, + "learning_rate": 9.024436990896023e-05, + "loss": 1.0328, + "step": 7877 + }, + { + "epoch": 0.5337760010840843, + "grad_norm": 5.47896671295166, + "learning_rate": 9.024300088986241e-05, + "loss": 0.6698, + "step": 7878 + }, + { + "epoch": 0.5338437563520564, + "grad_norm": 7.560204982757568, + "learning_rate": 9.024163187076459e-05, + "loss": 0.7431, + "step": 7879 + }, + { + "epoch": 0.5339115116200285, + "grad_norm": 7.789547920227051, + "learning_rate": 9.024026285166679e-05, + "loss": 0.7852, + "step": 7880 + }, + { + "epoch": 0.5339792668880006, + "grad_norm": 6.868436336517334, + "learning_rate": 9.023889383256897e-05, + "loss": 0.8807, + "step": 7881 + }, + { + "epoch": 0.5340470221559727, + "grad_norm": 5.439123153686523, + "learning_rate": 9.023752481347115e-05, + "loss": 0.8914, + "step": 7882 + }, + { + "epoch": 0.5341147774239448, + "grad_norm": 6.825755596160889, + "learning_rate": 9.023615579437333e-05, + "loss": 0.9643, + "step": 7883 + }, + { + "epoch": 0.5341825326919168, + "grad_norm": 8.002971649169922, + "learning_rate": 9.023478677527552e-05, + "loss": 0.8881, + "step": 7884 + }, + { + "epoch": 0.5342502879598889, + "grad_norm": 6.1144022941589355, + "learning_rate": 9.02334177561777e-05, + "loss": 0.9887, + "step": 7885 + }, + { + "epoch": 0.5343180432278609, + "grad_norm": 6.414967060089111, + "learning_rate": 9.023204873707988e-05, + "loss": 0.8882, + "step": 7886 + }, + { + "epoch": 0.534385798495833, + "grad_norm": 6.222415447235107, + "learning_rate": 9.023067971798208e-05, + "loss": 0.7824, + "step": 7887 + }, + { + "epoch": 0.5344535537638051, + "grad_norm": 6.128655910491943, + "learning_rate": 9.022931069888426e-05, + "loss": 1.0061, + "step": 7888 + }, + { + "epoch": 0.5345213090317772, + "grad_norm": 8.61054515838623, + "learning_rate": 9.022794167978644e-05, + "loss": 0.8921, + "step": 7889 + }, + { + "epoch": 0.5345890642997493, + "grad_norm": 7.387160301208496, + "learning_rate": 9.022657266068863e-05, + "loss": 1.0001, + "step": 7890 + }, + { + "epoch": 0.5346568195677214, + "grad_norm": 5.878726005554199, + "learning_rate": 9.022520364159081e-05, + "loss": 0.8865, + "step": 7891 + }, + { + "epoch": 0.5347245748356935, + "grad_norm": 6.011378288269043, + "learning_rate": 9.022383462249299e-05, + "loss": 0.8961, + "step": 7892 + }, + { + "epoch": 0.5347923301036656, + "grad_norm": 6.258980751037598, + "learning_rate": 9.022246560339517e-05, + "loss": 0.8744, + "step": 7893 + }, + { + "epoch": 0.5348600853716377, + "grad_norm": 8.403583526611328, + "learning_rate": 9.022109658429737e-05, + "loss": 0.7601, + "step": 7894 + }, + { + "epoch": 0.5349278406396097, + "grad_norm": 9.266546249389648, + "learning_rate": 9.021972756519955e-05, + "loss": 1.0684, + "step": 7895 + }, + { + "epoch": 0.5349955959075818, + "grad_norm": 6.160810470581055, + "learning_rate": 9.021835854610173e-05, + "loss": 0.6533, + "step": 7896 + }, + { + "epoch": 0.5350633511755539, + "grad_norm": 6.975162982940674, + "learning_rate": 9.02169895270039e-05, + "loss": 0.8974, + "step": 7897 + }, + { + "epoch": 0.535131106443526, + "grad_norm": 6.557673931121826, + "learning_rate": 9.021562050790609e-05, + "loss": 0.8291, + "step": 7898 + }, + { + "epoch": 0.5351988617114981, + "grad_norm": 6.393524169921875, + "learning_rate": 9.021425148880828e-05, + "loss": 0.8898, + "step": 7899 + }, + { + "epoch": 0.5352666169794702, + "grad_norm": 6.089913368225098, + "learning_rate": 9.021288246971046e-05, + "loss": 0.9246, + "step": 7900 + }, + { + "epoch": 0.5353343722474423, + "grad_norm": 5.542110919952393, + "learning_rate": 9.021151345061264e-05, + "loss": 0.7785, + "step": 7901 + }, + { + "epoch": 0.5354021275154143, + "grad_norm": 6.648521900177002, + "learning_rate": 9.021014443151482e-05, + "loss": 0.7992, + "step": 7902 + }, + { + "epoch": 0.5354698827833864, + "grad_norm": 5.972752571105957, + "learning_rate": 9.020877541241702e-05, + "loss": 0.8436, + "step": 7903 + }, + { + "epoch": 0.5355376380513585, + "grad_norm": 8.538671493530273, + "learning_rate": 9.02074063933192e-05, + "loss": 0.7983, + "step": 7904 + }, + { + "epoch": 0.5356053933193305, + "grad_norm": 6.8005690574646, + "learning_rate": 9.020603737422138e-05, + "loss": 0.6566, + "step": 7905 + }, + { + "epoch": 0.5356731485873026, + "grad_norm": 9.595316886901855, + "learning_rate": 9.020466835512356e-05, + "loss": 0.9392, + "step": 7906 + }, + { + "epoch": 0.5357409038552747, + "grad_norm": 6.57702112197876, + "learning_rate": 9.020329933602574e-05, + "loss": 0.8176, + "step": 7907 + }, + { + "epoch": 0.5358086591232468, + "grad_norm": 6.209599018096924, + "learning_rate": 9.020193031692793e-05, + "loss": 0.8261, + "step": 7908 + }, + { + "epoch": 0.5358764143912189, + "grad_norm": 8.036966323852539, + "learning_rate": 9.020056129783011e-05, + "loss": 0.8524, + "step": 7909 + }, + { + "epoch": 0.535944169659191, + "grad_norm": 7.813173770904541, + "learning_rate": 9.019919227873229e-05, + "loss": 1.0166, + "step": 7910 + }, + { + "epoch": 0.5360119249271631, + "grad_norm": 6.065460205078125, + "learning_rate": 9.019782325963447e-05, + "loss": 0.9869, + "step": 7911 + }, + { + "epoch": 0.5360796801951352, + "grad_norm": 5.9636993408203125, + "learning_rate": 9.019645424053667e-05, + "loss": 0.8227, + "step": 7912 + }, + { + "epoch": 0.5361474354631073, + "grad_norm": 7.270611763000488, + "learning_rate": 9.019508522143885e-05, + "loss": 0.7653, + "step": 7913 + }, + { + "epoch": 0.5362151907310794, + "grad_norm": 6.054271221160889, + "learning_rate": 9.019371620234103e-05, + "loss": 0.6753, + "step": 7914 + }, + { + "epoch": 0.5362829459990515, + "grad_norm": 6.736015796661377, + "learning_rate": 9.019234718324321e-05, + "loss": 0.868, + "step": 7915 + }, + { + "epoch": 0.5363507012670236, + "grad_norm": 6.971399307250977, + "learning_rate": 9.019097816414539e-05, + "loss": 0.9097, + "step": 7916 + }, + { + "epoch": 0.5364184565349956, + "grad_norm": 7.001399040222168, + "learning_rate": 9.018960914504758e-05, + "loss": 0.9245, + "step": 7917 + }, + { + "epoch": 0.5364862118029676, + "grad_norm": 6.465878009796143, + "learning_rate": 9.018824012594976e-05, + "loss": 0.8308, + "step": 7918 + }, + { + "epoch": 0.5365539670709397, + "grad_norm": 6.622878551483154, + "learning_rate": 9.018687110685194e-05, + "loss": 0.6466, + "step": 7919 + }, + { + "epoch": 0.5366217223389118, + "grad_norm": 7.167222023010254, + "learning_rate": 9.018550208775412e-05, + "loss": 0.9521, + "step": 7920 + }, + { + "epoch": 0.5366894776068839, + "grad_norm": 7.362137794494629, + "learning_rate": 9.01841330686563e-05, + "loss": 0.7561, + "step": 7921 + }, + { + "epoch": 0.536757232874856, + "grad_norm": 6.314590930938721, + "learning_rate": 9.01827640495585e-05, + "loss": 0.8892, + "step": 7922 + }, + { + "epoch": 0.5368249881428281, + "grad_norm": 5.427424430847168, + "learning_rate": 9.018139503046068e-05, + "loss": 0.7648, + "step": 7923 + }, + { + "epoch": 0.5368927434108002, + "grad_norm": 8.388633728027344, + "learning_rate": 9.018002601136286e-05, + "loss": 1.0141, + "step": 7924 + }, + { + "epoch": 0.5369604986787723, + "grad_norm": 5.852472305297852, + "learning_rate": 9.017865699226504e-05, + "loss": 0.7744, + "step": 7925 + }, + { + "epoch": 0.5370282539467444, + "grad_norm": 6.68207311630249, + "learning_rate": 9.017728797316723e-05, + "loss": 0.9003, + "step": 7926 + }, + { + "epoch": 0.5370960092147165, + "grad_norm": 6.475498199462891, + "learning_rate": 9.017591895406941e-05, + "loss": 1.0294, + "step": 7927 + }, + { + "epoch": 0.5371637644826885, + "grad_norm": 8.690805435180664, + "learning_rate": 9.017454993497159e-05, + "loss": 1.0175, + "step": 7928 + }, + { + "epoch": 0.5372315197506606, + "grad_norm": 5.874716758728027, + "learning_rate": 9.017318091587377e-05, + "loss": 0.7761, + "step": 7929 + }, + { + "epoch": 0.5372992750186327, + "grad_norm": 5.937534332275391, + "learning_rate": 9.017181189677595e-05, + "loss": 0.6782, + "step": 7930 + }, + { + "epoch": 0.5373670302866048, + "grad_norm": 6.882140159606934, + "learning_rate": 9.017044287767815e-05, + "loss": 0.8815, + "step": 7931 + }, + { + "epoch": 0.5374347855545769, + "grad_norm": 5.27649450302124, + "learning_rate": 9.016907385858033e-05, + "loss": 0.6816, + "step": 7932 + }, + { + "epoch": 0.537502540822549, + "grad_norm": 5.861900329589844, + "learning_rate": 9.016770483948251e-05, + "loss": 0.7224, + "step": 7933 + }, + { + "epoch": 0.537570296090521, + "grad_norm": 5.196268558502197, + "learning_rate": 9.01663358203847e-05, + "loss": 0.7526, + "step": 7934 + }, + { + "epoch": 0.5376380513584931, + "grad_norm": 7.050784111022949, + "learning_rate": 9.016496680128688e-05, + "loss": 0.7451, + "step": 7935 + }, + { + "epoch": 0.5377058066264652, + "grad_norm": 6.952356338500977, + "learning_rate": 9.016359778218906e-05, + "loss": 0.8074, + "step": 7936 + }, + { + "epoch": 0.5377735618944373, + "grad_norm": 6.38202428817749, + "learning_rate": 9.016222876309126e-05, + "loss": 0.8738, + "step": 7937 + }, + { + "epoch": 0.5378413171624093, + "grad_norm": 7.252553462982178, + "learning_rate": 9.016085974399344e-05, + "loss": 1.001, + "step": 7938 + }, + { + "epoch": 0.5379090724303814, + "grad_norm": 7.240688323974609, + "learning_rate": 9.015949072489562e-05, + "loss": 0.818, + "step": 7939 + }, + { + "epoch": 0.5379768276983535, + "grad_norm": 8.255992889404297, + "learning_rate": 9.015812170579781e-05, + "loss": 0.9443, + "step": 7940 + }, + { + "epoch": 0.5380445829663256, + "grad_norm": 8.154399871826172, + "learning_rate": 9.015675268669999e-05, + "loss": 1.2508, + "step": 7941 + }, + { + "epoch": 0.5381123382342977, + "grad_norm": 6.9914069175720215, + "learning_rate": 9.015538366760217e-05, + "loss": 0.8552, + "step": 7942 + }, + { + "epoch": 0.5381800935022698, + "grad_norm": 6.447218894958496, + "learning_rate": 9.015401464850435e-05, + "loss": 0.9249, + "step": 7943 + }, + { + "epoch": 0.5382478487702419, + "grad_norm": 6.867689609527588, + "learning_rate": 9.015264562940653e-05, + "loss": 1.0145, + "step": 7944 + }, + { + "epoch": 0.538315604038214, + "grad_norm": 5.5277419090271, + "learning_rate": 9.015127661030873e-05, + "loss": 0.5239, + "step": 7945 + }, + { + "epoch": 0.5383833593061861, + "grad_norm": 6.092846393585205, + "learning_rate": 9.01499075912109e-05, + "loss": 0.7933, + "step": 7946 + }, + { + "epoch": 0.5384511145741582, + "grad_norm": 5.636915683746338, + "learning_rate": 9.014853857211309e-05, + "loss": 1.0177, + "step": 7947 + }, + { + "epoch": 0.5385188698421303, + "grad_norm": 6.031810760498047, + "learning_rate": 9.014716955301527e-05, + "loss": 0.7189, + "step": 7948 + }, + { + "epoch": 0.5385866251101024, + "grad_norm": 6.822633266448975, + "learning_rate": 9.014580053391746e-05, + "loss": 0.8511, + "step": 7949 + }, + { + "epoch": 0.5386543803780744, + "grad_norm": 6.105456352233887, + "learning_rate": 9.014443151481964e-05, + "loss": 0.8941, + "step": 7950 + }, + { + "epoch": 0.5387221356460464, + "grad_norm": 6.250092029571533, + "learning_rate": 9.014306249572182e-05, + "loss": 0.7968, + "step": 7951 + }, + { + "epoch": 0.5387898909140185, + "grad_norm": 7.53422212600708, + "learning_rate": 9.0141693476624e-05, + "loss": 0.7581, + "step": 7952 + }, + { + "epoch": 0.5388576461819906, + "grad_norm": 5.800909042358398, + "learning_rate": 9.014032445752618e-05, + "loss": 0.5946, + "step": 7953 + }, + { + "epoch": 0.5389254014499627, + "grad_norm": 6.202663898468018, + "learning_rate": 9.013895543842838e-05, + "loss": 0.9158, + "step": 7954 + }, + { + "epoch": 0.5389931567179348, + "grad_norm": 6.153894424438477, + "learning_rate": 9.013758641933056e-05, + "loss": 0.921, + "step": 7955 + }, + { + "epoch": 0.5390609119859069, + "grad_norm": 7.0276384353637695, + "learning_rate": 9.013621740023274e-05, + "loss": 0.8353, + "step": 7956 + }, + { + "epoch": 0.539128667253879, + "grad_norm": 7.378089427947998, + "learning_rate": 9.013484838113492e-05, + "loss": 0.7336, + "step": 7957 + }, + { + "epoch": 0.5391964225218511, + "grad_norm": 7.097971439361572, + "learning_rate": 9.013347936203711e-05, + "loss": 0.7911, + "step": 7958 + }, + { + "epoch": 0.5392641777898232, + "grad_norm": 6.771259307861328, + "learning_rate": 9.013211034293929e-05, + "loss": 0.8515, + "step": 7959 + }, + { + "epoch": 0.5393319330577953, + "grad_norm": 6.949087142944336, + "learning_rate": 9.013074132384147e-05, + "loss": 1.0146, + "step": 7960 + }, + { + "epoch": 0.5393996883257673, + "grad_norm": 7.584465026855469, + "learning_rate": 9.012937230474365e-05, + "loss": 0.681, + "step": 7961 + }, + { + "epoch": 0.5394674435937394, + "grad_norm": 7.227475643157959, + "learning_rate": 9.012800328564583e-05, + "loss": 1.085, + "step": 7962 + }, + { + "epoch": 0.5395351988617115, + "grad_norm": 5.863607406616211, + "learning_rate": 9.012663426654803e-05, + "loss": 0.7184, + "step": 7963 + }, + { + "epoch": 0.5396029541296836, + "grad_norm": 6.440633296966553, + "learning_rate": 9.01252652474502e-05, + "loss": 0.7896, + "step": 7964 + }, + { + "epoch": 0.5396707093976557, + "grad_norm": 6.37113094329834, + "learning_rate": 9.012389622835239e-05, + "loss": 0.8235, + "step": 7965 + }, + { + "epoch": 0.5397384646656278, + "grad_norm": 7.00465202331543, + "learning_rate": 9.012252720925457e-05, + "loss": 0.9816, + "step": 7966 + }, + { + "epoch": 0.5398062199335998, + "grad_norm": 6.948329925537109, + "learning_rate": 9.012115819015676e-05, + "loss": 0.721, + "step": 7967 + }, + { + "epoch": 0.5398739752015719, + "grad_norm": 7.836682319641113, + "learning_rate": 9.011978917105894e-05, + "loss": 0.8239, + "step": 7968 + }, + { + "epoch": 0.539941730469544, + "grad_norm": 7.849715232849121, + "learning_rate": 9.011842015196112e-05, + "loss": 0.9537, + "step": 7969 + }, + { + "epoch": 0.5400094857375161, + "grad_norm": 6.9721221923828125, + "learning_rate": 9.01170511328633e-05, + "loss": 0.8685, + "step": 7970 + }, + { + "epoch": 0.5400772410054882, + "grad_norm": 6.895398139953613, + "learning_rate": 9.011568211376548e-05, + "loss": 0.7817, + "step": 7971 + }, + { + "epoch": 0.5401449962734602, + "grad_norm": 6.040884017944336, + "learning_rate": 9.011431309466768e-05, + "loss": 0.7531, + "step": 7972 + }, + { + "epoch": 0.5402127515414323, + "grad_norm": 8.733409881591797, + "learning_rate": 9.011294407556986e-05, + "loss": 1.1737, + "step": 7973 + }, + { + "epoch": 0.5402805068094044, + "grad_norm": 7.4223761558532715, + "learning_rate": 9.011157505647204e-05, + "loss": 0.7954, + "step": 7974 + }, + { + "epoch": 0.5403482620773765, + "grad_norm": 6.471921443939209, + "learning_rate": 9.011020603737422e-05, + "loss": 0.8795, + "step": 7975 + }, + { + "epoch": 0.5404160173453486, + "grad_norm": 7.537938594818115, + "learning_rate": 9.01088370182764e-05, + "loss": 0.888, + "step": 7976 + }, + { + "epoch": 0.5404837726133207, + "grad_norm": 7.756222724914551, + "learning_rate": 9.010746799917859e-05, + "loss": 1.0905, + "step": 7977 + }, + { + "epoch": 0.5405515278812928, + "grad_norm": 6.659689426422119, + "learning_rate": 9.010609898008077e-05, + "loss": 0.7667, + "step": 7978 + }, + { + "epoch": 0.5406192831492649, + "grad_norm": 9.589966773986816, + "learning_rate": 9.010472996098295e-05, + "loss": 0.9853, + "step": 7979 + }, + { + "epoch": 0.540687038417237, + "grad_norm": 7.351951599121094, + "learning_rate": 9.010336094188515e-05, + "loss": 0.7348, + "step": 7980 + }, + { + "epoch": 0.5407547936852091, + "grad_norm": 7.74218225479126, + "learning_rate": 9.010199192278733e-05, + "loss": 1.1712, + "step": 7981 + }, + { + "epoch": 0.5408225489531812, + "grad_norm": 6.658298015594482, + "learning_rate": 9.01006229036895e-05, + "loss": 0.8334, + "step": 7982 + }, + { + "epoch": 0.5408903042211531, + "grad_norm": 6.736339092254639, + "learning_rate": 9.00992538845917e-05, + "loss": 0.9479, + "step": 7983 + }, + { + "epoch": 0.5409580594891252, + "grad_norm": 6.63740873336792, + "learning_rate": 9.009788486549388e-05, + "loss": 0.9086, + "step": 7984 + }, + { + "epoch": 0.5410258147570973, + "grad_norm": 6.095336437225342, + "learning_rate": 9.009651584639606e-05, + "loss": 0.8407, + "step": 7985 + }, + { + "epoch": 0.5410935700250694, + "grad_norm": 5.067421913146973, + "learning_rate": 9.009514682729826e-05, + "loss": 0.8147, + "step": 7986 + }, + { + "epoch": 0.5411613252930415, + "grad_norm": 5.876379013061523, + "learning_rate": 9.009377780820044e-05, + "loss": 0.8639, + "step": 7987 + }, + { + "epoch": 0.5412290805610136, + "grad_norm": 6.525248050689697, + "learning_rate": 9.009240878910262e-05, + "loss": 0.7483, + "step": 7988 + }, + { + "epoch": 0.5412968358289857, + "grad_norm": 7.615539073944092, + "learning_rate": 9.00910397700048e-05, + "loss": 0.8863, + "step": 7989 + }, + { + "epoch": 0.5413645910969578, + "grad_norm": 7.767553329467773, + "learning_rate": 9.008967075090699e-05, + "loss": 0.8501, + "step": 7990 + }, + { + "epoch": 0.5414323463649299, + "grad_norm": 7.159237384796143, + "learning_rate": 9.008830173180917e-05, + "loss": 0.8759, + "step": 7991 + }, + { + "epoch": 0.541500101632902, + "grad_norm": 6.989110469818115, + "learning_rate": 9.008693271271135e-05, + "loss": 0.9121, + "step": 7992 + }, + { + "epoch": 0.5415678569008741, + "grad_norm": 6.767205238342285, + "learning_rate": 9.008556369361353e-05, + "loss": 0.6788, + "step": 7993 + }, + { + "epoch": 0.5416356121688461, + "grad_norm": 5.92645788192749, + "learning_rate": 9.008419467451571e-05, + "loss": 0.6165, + "step": 7994 + }, + { + "epoch": 0.5417033674368182, + "grad_norm": 5.536478042602539, + "learning_rate": 9.00828256554179e-05, + "loss": 0.6475, + "step": 7995 + }, + { + "epoch": 0.5417711227047903, + "grad_norm": 6.186464786529541, + "learning_rate": 9.008145663632009e-05, + "loss": 0.6399, + "step": 7996 + }, + { + "epoch": 0.5418388779727624, + "grad_norm": 6.986947536468506, + "learning_rate": 9.008008761722227e-05, + "loss": 0.8797, + "step": 7997 + }, + { + "epoch": 0.5419066332407345, + "grad_norm": 7.718267917633057, + "learning_rate": 9.007871859812445e-05, + "loss": 0.8077, + "step": 7998 + }, + { + "epoch": 0.5419743885087066, + "grad_norm": 6.4064483642578125, + "learning_rate": 9.007734957902663e-05, + "loss": 0.7077, + "step": 7999 + }, + { + "epoch": 0.5420421437766786, + "grad_norm": 7.1472320556640625, + "learning_rate": 9.007598055992882e-05, + "loss": 0.8007, + "step": 8000 + }, + { + "epoch": 0.5421098990446507, + "grad_norm": 7.518289566040039, + "learning_rate": 9.0074611540831e-05, + "loss": 0.8132, + "step": 8001 + }, + { + "epoch": 0.5421776543126228, + "grad_norm": 6.509357929229736, + "learning_rate": 9.007324252173318e-05, + "loss": 0.8157, + "step": 8002 + }, + { + "epoch": 0.5422454095805949, + "grad_norm": 6.129663467407227, + "learning_rate": 9.007187350263536e-05, + "loss": 1.0118, + "step": 8003 + }, + { + "epoch": 0.542313164848567, + "grad_norm": 7.274816513061523, + "learning_rate": 9.007050448353756e-05, + "loss": 1.0167, + "step": 8004 + }, + { + "epoch": 0.542380920116539, + "grad_norm": 6.708817005157471, + "learning_rate": 9.006913546443974e-05, + "loss": 0.8317, + "step": 8005 + }, + { + "epoch": 0.5424486753845111, + "grad_norm": 6.974154949188232, + "learning_rate": 9.006776644534192e-05, + "loss": 0.6758, + "step": 8006 + }, + { + "epoch": 0.5425164306524832, + "grad_norm": 6.915993690490723, + "learning_rate": 9.00663974262441e-05, + "loss": 0.8113, + "step": 8007 + }, + { + "epoch": 0.5425841859204553, + "grad_norm": 7.1193623542785645, + "learning_rate": 9.006502840714628e-05, + "loss": 0.9428, + "step": 8008 + }, + { + "epoch": 0.5426519411884274, + "grad_norm": 6.0862298011779785, + "learning_rate": 9.006365938804847e-05, + "loss": 0.8424, + "step": 8009 + }, + { + "epoch": 0.5427196964563995, + "grad_norm": 7.726379871368408, + "learning_rate": 9.006229036895065e-05, + "loss": 1.0722, + "step": 8010 + }, + { + "epoch": 0.5427874517243716, + "grad_norm": 7.901097774505615, + "learning_rate": 9.006092134985283e-05, + "loss": 0.9023, + "step": 8011 + }, + { + "epoch": 0.5428552069923437, + "grad_norm": 6.264953136444092, + "learning_rate": 9.005955233075501e-05, + "loss": 0.967, + "step": 8012 + }, + { + "epoch": 0.5429229622603158, + "grad_norm": 7.827919960021973, + "learning_rate": 9.00581833116572e-05, + "loss": 0.8026, + "step": 8013 + }, + { + "epoch": 0.5429907175282879, + "grad_norm": 5.923975944519043, + "learning_rate": 9.005681429255939e-05, + "loss": 0.8619, + "step": 8014 + }, + { + "epoch": 0.54305847279626, + "grad_norm": 8.078742027282715, + "learning_rate": 9.005544527346157e-05, + "loss": 1.0838, + "step": 8015 + }, + { + "epoch": 0.5431262280642319, + "grad_norm": 8.241996765136719, + "learning_rate": 9.005407625436375e-05, + "loss": 0.8315, + "step": 8016 + }, + { + "epoch": 0.543193983332204, + "grad_norm": 8.346336364746094, + "learning_rate": 9.005270723526593e-05, + "loss": 0.809, + "step": 8017 + }, + { + "epoch": 0.5432617386001761, + "grad_norm": 8.163125991821289, + "learning_rate": 9.005133821616812e-05, + "loss": 0.9445, + "step": 8018 + }, + { + "epoch": 0.5433294938681482, + "grad_norm": 6.224919319152832, + "learning_rate": 9.00499691970703e-05, + "loss": 0.8583, + "step": 8019 + }, + { + "epoch": 0.5433972491361203, + "grad_norm": 6.901029586791992, + "learning_rate": 9.004860017797248e-05, + "loss": 0.8486, + "step": 8020 + }, + { + "epoch": 0.5434650044040924, + "grad_norm": 5.692861080169678, + "learning_rate": 9.004723115887466e-05, + "loss": 0.7233, + "step": 8021 + }, + { + "epoch": 0.5435327596720645, + "grad_norm": 7.3206939697265625, + "learning_rate": 9.004586213977684e-05, + "loss": 0.9268, + "step": 8022 + }, + { + "epoch": 0.5436005149400366, + "grad_norm": 5.781581401824951, + "learning_rate": 9.004449312067904e-05, + "loss": 0.8536, + "step": 8023 + }, + { + "epoch": 0.5436682702080087, + "grad_norm": 6.006667137145996, + "learning_rate": 9.004312410158122e-05, + "loss": 0.6872, + "step": 8024 + }, + { + "epoch": 0.5437360254759808, + "grad_norm": 6.082067012786865, + "learning_rate": 9.00417550824834e-05, + "loss": 1.0156, + "step": 8025 + }, + { + "epoch": 0.5438037807439529, + "grad_norm": 9.1718111038208, + "learning_rate": 9.004038606338559e-05, + "loss": 1.1392, + "step": 8026 + }, + { + "epoch": 0.543871536011925, + "grad_norm": 8.803008079528809, + "learning_rate": 9.003901704428777e-05, + "loss": 1.1443, + "step": 8027 + }, + { + "epoch": 0.543939291279897, + "grad_norm": 6.125607013702393, + "learning_rate": 9.003764802518995e-05, + "loss": 0.8334, + "step": 8028 + }, + { + "epoch": 0.5440070465478691, + "grad_norm": 5.974092960357666, + "learning_rate": 9.003627900609215e-05, + "loss": 0.7561, + "step": 8029 + }, + { + "epoch": 0.5440748018158412, + "grad_norm": 6.347378730773926, + "learning_rate": 9.003490998699433e-05, + "loss": 0.7946, + "step": 8030 + }, + { + "epoch": 0.5441425570838133, + "grad_norm": 6.224796295166016, + "learning_rate": 9.00335409678965e-05, + "loss": 0.7501, + "step": 8031 + }, + { + "epoch": 0.5442103123517853, + "grad_norm": 9.026642799377441, + "learning_rate": 9.00321719487987e-05, + "loss": 0.8056, + "step": 8032 + }, + { + "epoch": 0.5442780676197574, + "grad_norm": 6.823796272277832, + "learning_rate": 9.003080292970088e-05, + "loss": 0.7378, + "step": 8033 + }, + { + "epoch": 0.5443458228877295, + "grad_norm": 6.37065315246582, + "learning_rate": 9.002943391060306e-05, + "loss": 1.0857, + "step": 8034 + }, + { + "epoch": 0.5444135781557016, + "grad_norm": 6.7236151695251465, + "learning_rate": 9.002806489150524e-05, + "loss": 0.7145, + "step": 8035 + }, + { + "epoch": 0.5444813334236737, + "grad_norm": 8.093012809753418, + "learning_rate": 9.002669587240743e-05, + "loss": 1.1783, + "step": 8036 + }, + { + "epoch": 0.5445490886916458, + "grad_norm": 6.60814094543457, + "learning_rate": 9.002532685330962e-05, + "loss": 0.7929, + "step": 8037 + }, + { + "epoch": 0.5446168439596178, + "grad_norm": 7.279522895812988, + "learning_rate": 9.00239578342118e-05, + "loss": 0.6758, + "step": 8038 + }, + { + "epoch": 0.5446845992275899, + "grad_norm": 5.578268051147461, + "learning_rate": 9.002258881511398e-05, + "loss": 0.7601, + "step": 8039 + }, + { + "epoch": 0.544752354495562, + "grad_norm": 4.871660232543945, + "learning_rate": 9.002121979601616e-05, + "loss": 0.635, + "step": 8040 + }, + { + "epoch": 0.5448201097635341, + "grad_norm": 6.011699199676514, + "learning_rate": 9.001985077691835e-05, + "loss": 0.8716, + "step": 8041 + }, + { + "epoch": 0.5448878650315062, + "grad_norm": 5.491373062133789, + "learning_rate": 9.001848175782053e-05, + "loss": 0.9069, + "step": 8042 + }, + { + "epoch": 0.5449556202994783, + "grad_norm": 7.352384567260742, + "learning_rate": 9.001711273872271e-05, + "loss": 0.9996, + "step": 8043 + }, + { + "epoch": 0.5450233755674504, + "grad_norm": 5.829630374908447, + "learning_rate": 9.001574371962489e-05, + "loss": 0.9563, + "step": 8044 + }, + { + "epoch": 0.5450911308354225, + "grad_norm": 6.887197017669678, + "learning_rate": 9.001437470052708e-05, + "loss": 1.0043, + "step": 8045 + }, + { + "epoch": 0.5451588861033946, + "grad_norm": 7.507176399230957, + "learning_rate": 9.001300568142927e-05, + "loss": 1.1256, + "step": 8046 + }, + { + "epoch": 0.5452266413713667, + "grad_norm": 5.741685390472412, + "learning_rate": 9.001163666233145e-05, + "loss": 0.7904, + "step": 8047 + }, + { + "epoch": 0.5452943966393388, + "grad_norm": 6.044425964355469, + "learning_rate": 9.001026764323363e-05, + "loss": 0.8707, + "step": 8048 + }, + { + "epoch": 0.5453621519073107, + "grad_norm": 7.574776649475098, + "learning_rate": 9.00088986241358e-05, + "loss": 1.1286, + "step": 8049 + }, + { + "epoch": 0.5454299071752828, + "grad_norm": 6.032942771911621, + "learning_rate": 9.0007529605038e-05, + "loss": 0.7711, + "step": 8050 + }, + { + "epoch": 0.5454976624432549, + "grad_norm": 9.648497581481934, + "learning_rate": 9.000616058594018e-05, + "loss": 0.8222, + "step": 8051 + }, + { + "epoch": 0.545565417711227, + "grad_norm": 6.6689839363098145, + "learning_rate": 9.000479156684236e-05, + "loss": 0.7225, + "step": 8052 + }, + { + "epoch": 0.5456331729791991, + "grad_norm": 4.215627193450928, + "learning_rate": 9.000342254774454e-05, + "loss": 0.6903, + "step": 8053 + }, + { + "epoch": 0.5457009282471712, + "grad_norm": 6.513290882110596, + "learning_rate": 9.000205352864672e-05, + "loss": 0.9443, + "step": 8054 + }, + { + "epoch": 0.5457686835151433, + "grad_norm": 5.759246826171875, + "learning_rate": 9.000068450954892e-05, + "loss": 0.7979, + "step": 8055 + }, + { + "epoch": 0.5458364387831154, + "grad_norm": 5.937248229980469, + "learning_rate": 8.99993154904511e-05, + "loss": 0.8381, + "step": 8056 + }, + { + "epoch": 0.5459041940510875, + "grad_norm": 8.01611328125, + "learning_rate": 8.999794647135328e-05, + "loss": 0.9463, + "step": 8057 + }, + { + "epoch": 0.5459719493190596, + "grad_norm": 7.086623191833496, + "learning_rate": 8.999657745225546e-05, + "loss": 0.711, + "step": 8058 + }, + { + "epoch": 0.5460397045870317, + "grad_norm": 8.19179916381836, + "learning_rate": 8.999520843315765e-05, + "loss": 0.7565, + "step": 8059 + }, + { + "epoch": 0.5461074598550038, + "grad_norm": 7.0273518562316895, + "learning_rate": 8.999383941405983e-05, + "loss": 0.9285, + "step": 8060 + }, + { + "epoch": 0.5461752151229758, + "grad_norm": 7.5481743812561035, + "learning_rate": 8.999247039496201e-05, + "loss": 0.9347, + "step": 8061 + }, + { + "epoch": 0.5462429703909479, + "grad_norm": 7.882439613342285, + "learning_rate": 8.999110137586419e-05, + "loss": 0.8584, + "step": 8062 + }, + { + "epoch": 0.54631072565892, + "grad_norm": 7.260676860809326, + "learning_rate": 8.998973235676637e-05, + "loss": 0.7111, + "step": 8063 + }, + { + "epoch": 0.5463784809268921, + "grad_norm": 6.826397895812988, + "learning_rate": 8.998836333766857e-05, + "loss": 0.6173, + "step": 8064 + }, + { + "epoch": 0.5464462361948641, + "grad_norm": 7.557692527770996, + "learning_rate": 8.998699431857075e-05, + "loss": 0.9096, + "step": 8065 + }, + { + "epoch": 0.5465139914628362, + "grad_norm": 5.558192253112793, + "learning_rate": 8.998562529947293e-05, + "loss": 0.8556, + "step": 8066 + }, + { + "epoch": 0.5465817467308083, + "grad_norm": 7.186037540435791, + "learning_rate": 8.99842562803751e-05, + "loss": 1.0854, + "step": 8067 + }, + { + "epoch": 0.5466495019987804, + "grad_norm": 6.516674518585205, + "learning_rate": 8.99828872612773e-05, + "loss": 0.7311, + "step": 8068 + }, + { + "epoch": 0.5467172572667525, + "grad_norm": 7.974714279174805, + "learning_rate": 8.998151824217948e-05, + "loss": 0.9699, + "step": 8069 + }, + { + "epoch": 0.5467850125347246, + "grad_norm": 6.709014892578125, + "learning_rate": 8.998014922308166e-05, + "loss": 0.8903, + "step": 8070 + }, + { + "epoch": 0.5468527678026966, + "grad_norm": 5.7826361656188965, + "learning_rate": 8.997878020398384e-05, + "loss": 0.7266, + "step": 8071 + }, + { + "epoch": 0.5469205230706687, + "grad_norm": 6.743555068969727, + "learning_rate": 8.997741118488604e-05, + "loss": 0.9665, + "step": 8072 + }, + { + "epoch": 0.5469882783386408, + "grad_norm": 6.872563362121582, + "learning_rate": 8.997604216578822e-05, + "loss": 0.8867, + "step": 8073 + }, + { + "epoch": 0.5470560336066129, + "grad_norm": 7.1092658042907715, + "learning_rate": 8.99746731466904e-05, + "loss": 1.1457, + "step": 8074 + }, + { + "epoch": 0.547123788874585, + "grad_norm": 5.996377944946289, + "learning_rate": 8.997330412759259e-05, + "loss": 0.784, + "step": 8075 + }, + { + "epoch": 0.5471915441425571, + "grad_norm": 6.767289638519287, + "learning_rate": 8.997193510849477e-05, + "loss": 0.9997, + "step": 8076 + }, + { + "epoch": 0.5472592994105292, + "grad_norm": 8.10904598236084, + "learning_rate": 8.997056608939695e-05, + "loss": 0.9479, + "step": 8077 + }, + { + "epoch": 0.5473270546785013, + "grad_norm": 7.407421588897705, + "learning_rate": 8.996919707029914e-05, + "loss": 0.6885, + "step": 8078 + }, + { + "epoch": 0.5473948099464734, + "grad_norm": 9.024150848388672, + "learning_rate": 8.996782805120132e-05, + "loss": 0.8968, + "step": 8079 + }, + { + "epoch": 0.5474625652144455, + "grad_norm": 5.622673511505127, + "learning_rate": 8.99664590321035e-05, + "loss": 0.6612, + "step": 8080 + }, + { + "epoch": 0.5475303204824175, + "grad_norm": 6.339783191680908, + "learning_rate": 8.996509001300569e-05, + "loss": 1.0033, + "step": 8081 + }, + { + "epoch": 0.5475980757503895, + "grad_norm": 6.384222984313965, + "learning_rate": 8.996372099390788e-05, + "loss": 0.9403, + "step": 8082 + }, + { + "epoch": 0.5476658310183616, + "grad_norm": 4.613900661468506, + "learning_rate": 8.996235197481006e-05, + "loss": 0.6112, + "step": 8083 + }, + { + "epoch": 0.5477335862863337, + "grad_norm": 8.393567085266113, + "learning_rate": 8.996098295571224e-05, + "loss": 1.0299, + "step": 8084 + }, + { + "epoch": 0.5478013415543058, + "grad_norm": 6.199436664581299, + "learning_rate": 8.995961393661442e-05, + "loss": 0.7737, + "step": 8085 + }, + { + "epoch": 0.5478690968222779, + "grad_norm": 8.227348327636719, + "learning_rate": 8.99582449175166e-05, + "loss": 0.9148, + "step": 8086 + }, + { + "epoch": 0.54793685209025, + "grad_norm": 4.897773265838623, + "learning_rate": 8.99568758984188e-05, + "loss": 0.7832, + "step": 8087 + }, + { + "epoch": 0.5480046073582221, + "grad_norm": 5.995389938354492, + "learning_rate": 8.995550687932098e-05, + "loss": 0.7117, + "step": 8088 + }, + { + "epoch": 0.5480723626261942, + "grad_norm": 6.70106840133667, + "learning_rate": 8.995413786022316e-05, + "loss": 0.6605, + "step": 8089 + }, + { + "epoch": 0.5481401178941663, + "grad_norm": 6.50885534286499, + "learning_rate": 8.995276884112534e-05, + "loss": 0.8888, + "step": 8090 + }, + { + "epoch": 0.5482078731621384, + "grad_norm": 6.068159580230713, + "learning_rate": 8.995139982202753e-05, + "loss": 0.6167, + "step": 8091 + }, + { + "epoch": 0.5482756284301105, + "grad_norm": 7.8484578132629395, + "learning_rate": 8.995003080292971e-05, + "loss": 1.0058, + "step": 8092 + }, + { + "epoch": 0.5483433836980826, + "grad_norm": 8.712899208068848, + "learning_rate": 8.994866178383189e-05, + "loss": 1.0509, + "step": 8093 + }, + { + "epoch": 0.5484111389660546, + "grad_norm": 5.627629280090332, + "learning_rate": 8.994729276473407e-05, + "loss": 0.6542, + "step": 8094 + }, + { + "epoch": 0.5484788942340267, + "grad_norm": 9.192571640014648, + "learning_rate": 8.994592374563625e-05, + "loss": 1.072, + "step": 8095 + }, + { + "epoch": 0.5485466495019988, + "grad_norm": 7.169675350189209, + "learning_rate": 8.994455472653844e-05, + "loss": 0.8983, + "step": 8096 + }, + { + "epoch": 0.5486144047699709, + "grad_norm": 5.486343860626221, + "learning_rate": 8.994318570744063e-05, + "loss": 0.8792, + "step": 8097 + }, + { + "epoch": 0.5486821600379429, + "grad_norm": 7.902504920959473, + "learning_rate": 8.99418166883428e-05, + "loss": 0.7665, + "step": 8098 + }, + { + "epoch": 0.548749915305915, + "grad_norm": 6.8819122314453125, + "learning_rate": 8.994044766924499e-05, + "loss": 0.7169, + "step": 8099 + }, + { + "epoch": 0.5488176705738871, + "grad_norm": 8.217923164367676, + "learning_rate": 8.993907865014718e-05, + "loss": 0.7619, + "step": 8100 + }, + { + "epoch": 0.5488854258418592, + "grad_norm": 6.089666366577148, + "learning_rate": 8.993770963104936e-05, + "loss": 0.6823, + "step": 8101 + }, + { + "epoch": 0.5489531811098313, + "grad_norm": 5.951727390289307, + "learning_rate": 8.993634061195154e-05, + "loss": 0.9311, + "step": 8102 + }, + { + "epoch": 0.5490209363778034, + "grad_norm": 5.070009231567383, + "learning_rate": 8.993497159285372e-05, + "loss": 0.7265, + "step": 8103 + }, + { + "epoch": 0.5490886916457755, + "grad_norm": 6.631993293762207, + "learning_rate": 8.99336025737559e-05, + "loss": 0.8403, + "step": 8104 + }, + { + "epoch": 0.5491564469137475, + "grad_norm": 6.938666820526123, + "learning_rate": 8.99322335546581e-05, + "loss": 1.0324, + "step": 8105 + }, + { + "epoch": 0.5492242021817196, + "grad_norm": 6.868338584899902, + "learning_rate": 8.993086453556028e-05, + "loss": 0.8591, + "step": 8106 + }, + { + "epoch": 0.5492919574496917, + "grad_norm": 6.081043720245361, + "learning_rate": 8.992949551646246e-05, + "loss": 0.9657, + "step": 8107 + }, + { + "epoch": 0.5493597127176638, + "grad_norm": 6.885746955871582, + "learning_rate": 8.992812649736464e-05, + "loss": 0.7396, + "step": 8108 + }, + { + "epoch": 0.5494274679856359, + "grad_norm": 7.478306770324707, + "learning_rate": 8.992675747826682e-05, + "loss": 0.8213, + "step": 8109 + }, + { + "epoch": 0.549495223253608, + "grad_norm": 6.534050941467285, + "learning_rate": 8.992538845916901e-05, + "loss": 0.928, + "step": 8110 + }, + { + "epoch": 0.5495629785215801, + "grad_norm": 6.312312602996826, + "learning_rate": 8.992401944007119e-05, + "loss": 0.8283, + "step": 8111 + }, + { + "epoch": 0.5496307337895522, + "grad_norm": 6.73023796081543, + "learning_rate": 8.992265042097337e-05, + "loss": 0.9155, + "step": 8112 + }, + { + "epoch": 0.5496984890575243, + "grad_norm": 6.13808012008667, + "learning_rate": 8.992128140187555e-05, + "loss": 0.9905, + "step": 8113 + }, + { + "epoch": 0.5497662443254963, + "grad_norm": 5.021573066711426, + "learning_rate": 8.991991238277775e-05, + "loss": 0.6841, + "step": 8114 + }, + { + "epoch": 0.5498339995934683, + "grad_norm": 6.291537284851074, + "learning_rate": 8.991854336367993e-05, + "loss": 1.1789, + "step": 8115 + }, + { + "epoch": 0.5499017548614404, + "grad_norm": 5.784443378448486, + "learning_rate": 8.99171743445821e-05, + "loss": 0.7338, + "step": 8116 + }, + { + "epoch": 0.5499695101294125, + "grad_norm": 5.844326496124268, + "learning_rate": 8.991580532548429e-05, + "loss": 0.7649, + "step": 8117 + }, + { + "epoch": 0.5500372653973846, + "grad_norm": 6.961515426635742, + "learning_rate": 8.991443630638648e-05, + "loss": 0.8765, + "step": 8118 + }, + { + "epoch": 0.5501050206653567, + "grad_norm": 6.371798515319824, + "learning_rate": 8.991306728728866e-05, + "loss": 0.7896, + "step": 8119 + }, + { + "epoch": 0.5501727759333288, + "grad_norm": 8.220330238342285, + "learning_rate": 8.991169826819084e-05, + "loss": 0.9147, + "step": 8120 + }, + { + "epoch": 0.5502405312013009, + "grad_norm": 15.172338485717773, + "learning_rate": 8.991032924909303e-05, + "loss": 0.798, + "step": 8121 + }, + { + "epoch": 0.550308286469273, + "grad_norm": 7.026042938232422, + "learning_rate": 8.990896022999522e-05, + "loss": 0.9536, + "step": 8122 + }, + { + "epoch": 0.5503760417372451, + "grad_norm": 7.6364336013793945, + "learning_rate": 8.99075912108974e-05, + "loss": 0.8633, + "step": 8123 + }, + { + "epoch": 0.5504437970052172, + "grad_norm": 7.09596586227417, + "learning_rate": 8.990622219179959e-05, + "loss": 0.7266, + "step": 8124 + }, + { + "epoch": 0.5505115522731893, + "grad_norm": 6.502638816833496, + "learning_rate": 8.990485317270177e-05, + "loss": 1.0156, + "step": 8125 + }, + { + "epoch": 0.5505793075411614, + "grad_norm": 6.83535623550415, + "learning_rate": 8.990348415360395e-05, + "loss": 0.7604, + "step": 8126 + }, + { + "epoch": 0.5506470628091334, + "grad_norm": 8.162100791931152, + "learning_rate": 8.990211513450613e-05, + "loss": 0.9476, + "step": 8127 + }, + { + "epoch": 0.5507148180771055, + "grad_norm": 7.7978410720825195, + "learning_rate": 8.990074611540832e-05, + "loss": 0.9341, + "step": 8128 + }, + { + "epoch": 0.5507825733450776, + "grad_norm": 7.659994602203369, + "learning_rate": 8.98993770963105e-05, + "loss": 0.9403, + "step": 8129 + }, + { + "epoch": 0.5508503286130496, + "grad_norm": 7.724880218505859, + "learning_rate": 8.989800807721268e-05, + "loss": 0.8563, + "step": 8130 + }, + { + "epoch": 0.5509180838810217, + "grad_norm": 8.337576866149902, + "learning_rate": 8.989663905811487e-05, + "loss": 0.874, + "step": 8131 + }, + { + "epoch": 0.5509858391489938, + "grad_norm": 6.336655616760254, + "learning_rate": 8.989527003901705e-05, + "loss": 0.6825, + "step": 8132 + }, + { + "epoch": 0.5510535944169659, + "grad_norm": 8.086697578430176, + "learning_rate": 8.989390101991924e-05, + "loss": 1.1419, + "step": 8133 + }, + { + "epoch": 0.551121349684938, + "grad_norm": 6.572722434997559, + "learning_rate": 8.989253200082142e-05, + "loss": 0.9213, + "step": 8134 + }, + { + "epoch": 0.5511891049529101, + "grad_norm": 6.7894487380981445, + "learning_rate": 8.98911629817236e-05, + "loss": 1.0189, + "step": 8135 + }, + { + "epoch": 0.5512568602208822, + "grad_norm": 8.123970985412598, + "learning_rate": 8.988979396262578e-05, + "loss": 1.1131, + "step": 8136 + }, + { + "epoch": 0.5513246154888543, + "grad_norm": 7.39744758605957, + "learning_rate": 8.988842494352797e-05, + "loss": 0.7286, + "step": 8137 + }, + { + "epoch": 0.5513923707568263, + "grad_norm": 6.353941440582275, + "learning_rate": 8.988705592443015e-05, + "loss": 0.7356, + "step": 8138 + }, + { + "epoch": 0.5514601260247984, + "grad_norm": 10.126983642578125, + "learning_rate": 8.988568690533234e-05, + "loss": 0.8082, + "step": 8139 + }, + { + "epoch": 0.5515278812927705, + "grad_norm": 7.518803596496582, + "learning_rate": 8.988431788623452e-05, + "loss": 0.801, + "step": 8140 + }, + { + "epoch": 0.5515956365607426, + "grad_norm": 6.769824981689453, + "learning_rate": 8.98829488671367e-05, + "loss": 0.7756, + "step": 8141 + }, + { + "epoch": 0.5516633918287147, + "grad_norm": 6.841986656188965, + "learning_rate": 8.988157984803889e-05, + "loss": 0.6801, + "step": 8142 + }, + { + "epoch": 0.5517311470966868, + "grad_norm": 6.92106294631958, + "learning_rate": 8.988021082894107e-05, + "loss": 0.9085, + "step": 8143 + }, + { + "epoch": 0.5517989023646589, + "grad_norm": 5.728677272796631, + "learning_rate": 8.987884180984325e-05, + "loss": 0.8921, + "step": 8144 + }, + { + "epoch": 0.551866657632631, + "grad_norm": 6.752305507659912, + "learning_rate": 8.987747279074543e-05, + "loss": 0.7462, + "step": 8145 + }, + { + "epoch": 0.551934412900603, + "grad_norm": 7.200686454772949, + "learning_rate": 8.987610377164762e-05, + "loss": 1.3344, + "step": 8146 + }, + { + "epoch": 0.5520021681685751, + "grad_norm": 5.663136959075928, + "learning_rate": 8.98747347525498e-05, + "loss": 0.8558, + "step": 8147 + }, + { + "epoch": 0.5520699234365471, + "grad_norm": 5.89145565032959, + "learning_rate": 8.987336573345199e-05, + "loss": 0.8393, + "step": 8148 + }, + { + "epoch": 0.5521376787045192, + "grad_norm": 6.481657028198242, + "learning_rate": 8.987199671435417e-05, + "loss": 1.0019, + "step": 8149 + }, + { + "epoch": 0.5522054339724913, + "grad_norm": 6.341575622558594, + "learning_rate": 8.987062769525635e-05, + "loss": 0.84, + "step": 8150 + }, + { + "epoch": 0.5522731892404634, + "grad_norm": 8.580132484436035, + "learning_rate": 8.986925867615854e-05, + "loss": 0.9604, + "step": 8151 + }, + { + "epoch": 0.5523409445084355, + "grad_norm": 6.118538856506348, + "learning_rate": 8.986788965706072e-05, + "loss": 0.8404, + "step": 8152 + }, + { + "epoch": 0.5524086997764076, + "grad_norm": 6.263945579528809, + "learning_rate": 8.98665206379629e-05, + "loss": 0.93, + "step": 8153 + }, + { + "epoch": 0.5524764550443797, + "grad_norm": 6.55873441696167, + "learning_rate": 8.986515161886508e-05, + "loss": 0.8502, + "step": 8154 + }, + { + "epoch": 0.5525442103123518, + "grad_norm": 6.372352600097656, + "learning_rate": 8.986378259976726e-05, + "loss": 1.0641, + "step": 8155 + }, + { + "epoch": 0.5526119655803239, + "grad_norm": 7.0687665939331055, + "learning_rate": 8.986241358066946e-05, + "loss": 0.8543, + "step": 8156 + }, + { + "epoch": 0.552679720848296, + "grad_norm": 8.265472412109375, + "learning_rate": 8.986104456157164e-05, + "loss": 0.9062, + "step": 8157 + }, + { + "epoch": 0.5527474761162681, + "grad_norm": 6.169217586517334, + "learning_rate": 8.985967554247382e-05, + "loss": 0.7173, + "step": 8158 + }, + { + "epoch": 0.5528152313842402, + "grad_norm": 5.441207408905029, + "learning_rate": 8.9858306523376e-05, + "loss": 0.6463, + "step": 8159 + }, + { + "epoch": 0.5528829866522122, + "grad_norm": 7.118704795837402, + "learning_rate": 8.985693750427819e-05, + "loss": 0.8944, + "step": 8160 + }, + { + "epoch": 0.5529507419201843, + "grad_norm": 6.042327404022217, + "learning_rate": 8.985556848518037e-05, + "loss": 1.081, + "step": 8161 + }, + { + "epoch": 0.5530184971881564, + "grad_norm": 7.080348014831543, + "learning_rate": 8.985419946608255e-05, + "loss": 0.6969, + "step": 8162 + }, + { + "epoch": 0.5530862524561284, + "grad_norm": 6.585824489593506, + "learning_rate": 8.985283044698473e-05, + "loss": 0.817, + "step": 8163 + }, + { + "epoch": 0.5531540077241005, + "grad_norm": 7.307035446166992, + "learning_rate": 8.985146142788692e-05, + "loss": 0.9677, + "step": 8164 + }, + { + "epoch": 0.5532217629920726, + "grad_norm": 7.577461242675781, + "learning_rate": 8.98500924087891e-05, + "loss": 0.9558, + "step": 8165 + }, + { + "epoch": 0.5532895182600447, + "grad_norm": 8.000713348388672, + "learning_rate": 8.984872338969129e-05, + "loss": 0.8819, + "step": 8166 + }, + { + "epoch": 0.5533572735280168, + "grad_norm": 8.770408630371094, + "learning_rate": 8.984735437059348e-05, + "loss": 1.0097, + "step": 8167 + }, + { + "epoch": 0.5534250287959889, + "grad_norm": 6.20897102355957, + "learning_rate": 8.984598535149566e-05, + "loss": 0.936, + "step": 8168 + }, + { + "epoch": 0.553492784063961, + "grad_norm": 6.364378929138184, + "learning_rate": 8.984461633239784e-05, + "loss": 0.8158, + "step": 8169 + }, + { + "epoch": 0.553560539331933, + "grad_norm": 6.148647308349609, + "learning_rate": 8.984324731330003e-05, + "loss": 0.8356, + "step": 8170 + }, + { + "epoch": 0.5536282945999051, + "grad_norm": 6.537686347961426, + "learning_rate": 8.984187829420221e-05, + "loss": 0.9588, + "step": 8171 + }, + { + "epoch": 0.5536960498678772, + "grad_norm": 5.661696910858154, + "learning_rate": 8.98405092751044e-05, + "loss": 0.8554, + "step": 8172 + }, + { + "epoch": 0.5537638051358493, + "grad_norm": 5.869355201721191, + "learning_rate": 8.983914025600658e-05, + "loss": 0.7676, + "step": 8173 + }, + { + "epoch": 0.5538315604038214, + "grad_norm": 5.434372425079346, + "learning_rate": 8.983777123690877e-05, + "loss": 0.7861, + "step": 8174 + }, + { + "epoch": 0.5538993156717935, + "grad_norm": 7.260756015777588, + "learning_rate": 8.983640221781095e-05, + "loss": 0.793, + "step": 8175 + }, + { + "epoch": 0.5539670709397656, + "grad_norm": 7.556422710418701, + "learning_rate": 8.983503319871313e-05, + "loss": 0.9004, + "step": 8176 + }, + { + "epoch": 0.5540348262077377, + "grad_norm": 5.610649585723877, + "learning_rate": 8.983366417961531e-05, + "loss": 0.6254, + "step": 8177 + }, + { + "epoch": 0.5541025814757098, + "grad_norm": 8.736811637878418, + "learning_rate": 8.98322951605175e-05, + "loss": 1.028, + "step": 8178 + }, + { + "epoch": 0.5541703367436818, + "grad_norm": 6.036482334136963, + "learning_rate": 8.983092614141968e-05, + "loss": 0.805, + "step": 8179 + }, + { + "epoch": 0.5542380920116539, + "grad_norm": 7.7951250076293945, + "learning_rate": 8.982955712232186e-05, + "loss": 0.8463, + "step": 8180 + }, + { + "epoch": 0.554305847279626, + "grad_norm": 5.3911871910095215, + "learning_rate": 8.982818810322404e-05, + "loss": 0.6961, + "step": 8181 + }, + { + "epoch": 0.554373602547598, + "grad_norm": 7.813619613647461, + "learning_rate": 8.982681908412623e-05, + "loss": 0.7671, + "step": 8182 + }, + { + "epoch": 0.5544413578155701, + "grad_norm": 6.328850269317627, + "learning_rate": 8.982545006502842e-05, + "loss": 0.7286, + "step": 8183 + }, + { + "epoch": 0.5545091130835422, + "grad_norm": 7.214839935302734, + "learning_rate": 8.98240810459306e-05, + "loss": 0.8523, + "step": 8184 + }, + { + "epoch": 0.5545768683515143, + "grad_norm": 5.996763229370117, + "learning_rate": 8.982271202683278e-05, + "loss": 0.8575, + "step": 8185 + }, + { + "epoch": 0.5546446236194864, + "grad_norm": 8.032330513000488, + "learning_rate": 8.982134300773496e-05, + "loss": 0.9224, + "step": 8186 + }, + { + "epoch": 0.5547123788874585, + "grad_norm": 7.919235706329346, + "learning_rate": 8.981997398863714e-05, + "loss": 0.9237, + "step": 8187 + }, + { + "epoch": 0.5547801341554306, + "grad_norm": 7.324405670166016, + "learning_rate": 8.981860496953933e-05, + "loss": 0.904, + "step": 8188 + }, + { + "epoch": 0.5548478894234027, + "grad_norm": 7.116741180419922, + "learning_rate": 8.981723595044151e-05, + "loss": 1.078, + "step": 8189 + }, + { + "epoch": 0.5549156446913748, + "grad_norm": 6.27305269241333, + "learning_rate": 8.98158669313437e-05, + "loss": 0.7974, + "step": 8190 + }, + { + "epoch": 0.5549833999593469, + "grad_norm": 6.298983097076416, + "learning_rate": 8.981449791224588e-05, + "loss": 0.8742, + "step": 8191 + }, + { + "epoch": 0.555051155227319, + "grad_norm": 5.965509414672852, + "learning_rate": 8.981312889314807e-05, + "loss": 0.7601, + "step": 8192 + }, + { + "epoch": 0.555118910495291, + "grad_norm": 6.754343509674072, + "learning_rate": 8.981175987405025e-05, + "loss": 1.0106, + "step": 8193 + }, + { + "epoch": 0.5551866657632631, + "grad_norm": 4.9216790199279785, + "learning_rate": 8.981039085495243e-05, + "loss": 0.7036, + "step": 8194 + }, + { + "epoch": 0.5552544210312351, + "grad_norm": 5.694727897644043, + "learning_rate": 8.980902183585461e-05, + "loss": 0.6663, + "step": 8195 + }, + { + "epoch": 0.5553221762992072, + "grad_norm": 7.885236740112305, + "learning_rate": 8.980765281675679e-05, + "loss": 0.9234, + "step": 8196 + }, + { + "epoch": 0.5553899315671793, + "grad_norm": 9.071615219116211, + "learning_rate": 8.980628379765898e-05, + "loss": 0.8357, + "step": 8197 + }, + { + "epoch": 0.5554576868351514, + "grad_norm": 5.474040508270264, + "learning_rate": 8.980491477856116e-05, + "loss": 0.6571, + "step": 8198 + }, + { + "epoch": 0.5555254421031235, + "grad_norm": 6.519590854644775, + "learning_rate": 8.980354575946335e-05, + "loss": 0.7547, + "step": 8199 + }, + { + "epoch": 0.5555931973710956, + "grad_norm": 6.605935096740723, + "learning_rate": 8.980217674036553e-05, + "loss": 0.8487, + "step": 8200 + }, + { + "epoch": 0.5556609526390677, + "grad_norm": 7.937171459197998, + "learning_rate": 8.980080772126772e-05, + "loss": 1.0935, + "step": 8201 + }, + { + "epoch": 0.5557287079070398, + "grad_norm": 8.487578392028809, + "learning_rate": 8.97994387021699e-05, + "loss": 1.2286, + "step": 8202 + }, + { + "epoch": 0.5557964631750119, + "grad_norm": 5.452174186706543, + "learning_rate": 8.979806968307208e-05, + "loss": 0.8182, + "step": 8203 + }, + { + "epoch": 0.555864218442984, + "grad_norm": 7.257054805755615, + "learning_rate": 8.979670066397426e-05, + "loss": 1.0002, + "step": 8204 + }, + { + "epoch": 0.555931973710956, + "grad_norm": 6.426529407501221, + "learning_rate": 8.979533164487644e-05, + "loss": 0.8226, + "step": 8205 + }, + { + "epoch": 0.5559997289789281, + "grad_norm": 7.804161548614502, + "learning_rate": 8.979396262577863e-05, + "loss": 1.0396, + "step": 8206 + }, + { + "epoch": 0.5560674842469002, + "grad_norm": 5.7507004737854, + "learning_rate": 8.979259360668082e-05, + "loss": 0.856, + "step": 8207 + }, + { + "epoch": 0.5561352395148723, + "grad_norm": 5.974365234375, + "learning_rate": 8.9791224587583e-05, + "loss": 0.7983, + "step": 8208 + }, + { + "epoch": 0.5562029947828444, + "grad_norm": 8.598740577697754, + "learning_rate": 8.978985556848518e-05, + "loss": 0.9326, + "step": 8209 + }, + { + "epoch": 0.5562707500508165, + "grad_norm": 6.554419994354248, + "learning_rate": 8.978848654938736e-05, + "loss": 0.8367, + "step": 8210 + }, + { + "epoch": 0.5563385053187886, + "grad_norm": 5.485220432281494, + "learning_rate": 8.978711753028955e-05, + "loss": 0.8231, + "step": 8211 + }, + { + "epoch": 0.5564062605867606, + "grad_norm": 7.286705493927002, + "learning_rate": 8.978574851119173e-05, + "loss": 1.0673, + "step": 8212 + }, + { + "epoch": 0.5564740158547327, + "grad_norm": 7.718635559082031, + "learning_rate": 8.978437949209391e-05, + "loss": 1.1467, + "step": 8213 + }, + { + "epoch": 0.5565417711227048, + "grad_norm": 7.21420431137085, + "learning_rate": 8.97830104729961e-05, + "loss": 0.7332, + "step": 8214 + }, + { + "epoch": 0.5566095263906768, + "grad_norm": 7.05917501449585, + "learning_rate": 8.978164145389828e-05, + "loss": 0.8301, + "step": 8215 + }, + { + "epoch": 0.5566772816586489, + "grad_norm": 7.492615699768066, + "learning_rate": 8.978027243480047e-05, + "loss": 0.7282, + "step": 8216 + }, + { + "epoch": 0.556745036926621, + "grad_norm": 5.647882461547852, + "learning_rate": 8.977890341570266e-05, + "loss": 0.9054, + "step": 8217 + }, + { + "epoch": 0.5568127921945931, + "grad_norm": 7.022364616394043, + "learning_rate": 8.977753439660484e-05, + "loss": 0.7759, + "step": 8218 + }, + { + "epoch": 0.5568805474625652, + "grad_norm": 7.175726890563965, + "learning_rate": 8.977616537750702e-05, + "loss": 0.8165, + "step": 8219 + }, + { + "epoch": 0.5569483027305373, + "grad_norm": 5.680481433868408, + "learning_rate": 8.977479635840921e-05, + "loss": 0.8801, + "step": 8220 + }, + { + "epoch": 0.5570160579985094, + "grad_norm": 6.144750118255615, + "learning_rate": 8.97734273393114e-05, + "loss": 0.8273, + "step": 8221 + }, + { + "epoch": 0.5570838132664815, + "grad_norm": 6.248444080352783, + "learning_rate": 8.977205832021357e-05, + "loss": 1.1512, + "step": 8222 + }, + { + "epoch": 0.5571515685344536, + "grad_norm": 5.904661655426025, + "learning_rate": 8.977068930111575e-05, + "loss": 0.8319, + "step": 8223 + }, + { + "epoch": 0.5572193238024257, + "grad_norm": 5.726762771606445, + "learning_rate": 8.976932028201795e-05, + "loss": 0.7412, + "step": 8224 + }, + { + "epoch": 0.5572870790703978, + "grad_norm": 6.143310070037842, + "learning_rate": 8.976795126292013e-05, + "loss": 1.0178, + "step": 8225 + }, + { + "epoch": 0.5573548343383699, + "grad_norm": 5.4348907470703125, + "learning_rate": 8.976658224382231e-05, + "loss": 0.7297, + "step": 8226 + }, + { + "epoch": 0.557422589606342, + "grad_norm": 6.981692314147949, + "learning_rate": 8.976521322472449e-05, + "loss": 0.8807, + "step": 8227 + }, + { + "epoch": 0.5574903448743139, + "grad_norm": 7.862269878387451, + "learning_rate": 8.976384420562667e-05, + "loss": 1.0944, + "step": 8228 + }, + { + "epoch": 0.557558100142286, + "grad_norm": 6.2533345222473145, + "learning_rate": 8.976247518652886e-05, + "loss": 0.8815, + "step": 8229 + }, + { + "epoch": 0.5576258554102581, + "grad_norm": 5.445539951324463, + "learning_rate": 8.976110616743104e-05, + "loss": 0.7853, + "step": 8230 + }, + { + "epoch": 0.5576936106782302, + "grad_norm": 5.220200538635254, + "learning_rate": 8.975973714833322e-05, + "loss": 0.6072, + "step": 8231 + }, + { + "epoch": 0.5577613659462023, + "grad_norm": 7.320289611816406, + "learning_rate": 8.97583681292354e-05, + "loss": 0.7023, + "step": 8232 + }, + { + "epoch": 0.5578291212141744, + "grad_norm": 6.331526756286621, + "learning_rate": 8.97569991101376e-05, + "loss": 0.8804, + "step": 8233 + }, + { + "epoch": 0.5578968764821465, + "grad_norm": 7.493820667266846, + "learning_rate": 8.975563009103978e-05, + "loss": 0.8167, + "step": 8234 + }, + { + "epoch": 0.5579646317501186, + "grad_norm": 5.947346210479736, + "learning_rate": 8.975426107194196e-05, + "loss": 0.6117, + "step": 8235 + }, + { + "epoch": 0.5580323870180907, + "grad_norm": 6.589901447296143, + "learning_rate": 8.975289205284414e-05, + "loss": 0.8721, + "step": 8236 + }, + { + "epoch": 0.5581001422860628, + "grad_norm": 5.9715423583984375, + "learning_rate": 8.975152303374632e-05, + "loss": 0.8644, + "step": 8237 + }, + { + "epoch": 0.5581678975540348, + "grad_norm": 8.454349517822266, + "learning_rate": 8.975015401464851e-05, + "loss": 0.89, + "step": 8238 + }, + { + "epoch": 0.5582356528220069, + "grad_norm": 6.455380916595459, + "learning_rate": 8.97487849955507e-05, + "loss": 0.8405, + "step": 8239 + }, + { + "epoch": 0.558303408089979, + "grad_norm": 6.558000564575195, + "learning_rate": 8.974741597645287e-05, + "loss": 0.8561, + "step": 8240 + }, + { + "epoch": 0.5583711633579511, + "grad_norm": 6.80634880065918, + "learning_rate": 8.974604695735506e-05, + "loss": 0.8508, + "step": 8241 + }, + { + "epoch": 0.5584389186259232, + "grad_norm": 5.507918357849121, + "learning_rate": 8.974467793825724e-05, + "loss": 0.9459, + "step": 8242 + }, + { + "epoch": 0.5585066738938953, + "grad_norm": 6.2580037117004395, + "learning_rate": 8.974330891915943e-05, + "loss": 0.919, + "step": 8243 + }, + { + "epoch": 0.5585744291618673, + "grad_norm": 7.050917148590088, + "learning_rate": 8.974193990006161e-05, + "loss": 0.6281, + "step": 8244 + }, + { + "epoch": 0.5586421844298394, + "grad_norm": 6.926290512084961, + "learning_rate": 8.974057088096379e-05, + "loss": 0.9378, + "step": 8245 + }, + { + "epoch": 0.5587099396978115, + "grad_norm": 7.242706298828125, + "learning_rate": 8.973920186186597e-05, + "loss": 0.9724, + "step": 8246 + }, + { + "epoch": 0.5587776949657836, + "grad_norm": 6.239006519317627, + "learning_rate": 8.973783284276816e-05, + "loss": 1.1008, + "step": 8247 + }, + { + "epoch": 0.5588454502337556, + "grad_norm": 5.4303998947143555, + "learning_rate": 8.973646382367034e-05, + "loss": 0.8486, + "step": 8248 + }, + { + "epoch": 0.5589132055017277, + "grad_norm": 7.883191108703613, + "learning_rate": 8.973509480457252e-05, + "loss": 1.1543, + "step": 8249 + }, + { + "epoch": 0.5589809607696998, + "grad_norm": 6.10230016708374, + "learning_rate": 8.97337257854747e-05, + "loss": 0.8662, + "step": 8250 + }, + { + "epoch": 0.5590487160376719, + "grad_norm": 7.152645587921143, + "learning_rate": 8.973235676637689e-05, + "loss": 0.7332, + "step": 8251 + }, + { + "epoch": 0.559116471305644, + "grad_norm": 5.797628879547119, + "learning_rate": 8.973098774727908e-05, + "loss": 0.7454, + "step": 8252 + }, + { + "epoch": 0.5591842265736161, + "grad_norm": 6.342854976654053, + "learning_rate": 8.972961872818126e-05, + "loss": 0.6993, + "step": 8253 + }, + { + "epoch": 0.5592519818415882, + "grad_norm": 8.492813110351562, + "learning_rate": 8.972824970908344e-05, + "loss": 0.8986, + "step": 8254 + }, + { + "epoch": 0.5593197371095603, + "grad_norm": 5.740788459777832, + "learning_rate": 8.972688068998562e-05, + "loss": 0.8702, + "step": 8255 + }, + { + "epoch": 0.5593874923775324, + "grad_norm": 5.833809852600098, + "learning_rate": 8.972551167088781e-05, + "loss": 0.9314, + "step": 8256 + }, + { + "epoch": 0.5594552476455045, + "grad_norm": 7.672467231750488, + "learning_rate": 8.972414265179e-05, + "loss": 0.7654, + "step": 8257 + }, + { + "epoch": 0.5595230029134766, + "grad_norm": 6.214809417724609, + "learning_rate": 8.972277363269218e-05, + "loss": 0.8391, + "step": 8258 + }, + { + "epoch": 0.5595907581814487, + "grad_norm": 6.621498107910156, + "learning_rate": 8.972140461359436e-05, + "loss": 0.8521, + "step": 8259 + }, + { + "epoch": 0.5596585134494207, + "grad_norm": 6.759031295776367, + "learning_rate": 8.972003559449655e-05, + "loss": 1.0709, + "step": 8260 + }, + { + "epoch": 0.5597262687173927, + "grad_norm": 6.506051540374756, + "learning_rate": 8.971866657539873e-05, + "loss": 1.1408, + "step": 8261 + }, + { + "epoch": 0.5597940239853648, + "grad_norm": 5.346510410308838, + "learning_rate": 8.971729755630091e-05, + "loss": 0.7054, + "step": 8262 + }, + { + "epoch": 0.5598617792533369, + "grad_norm": 7.418613910675049, + "learning_rate": 8.97159285372031e-05, + "loss": 0.976, + "step": 8263 + }, + { + "epoch": 0.559929534521309, + "grad_norm": 5.501760482788086, + "learning_rate": 8.971455951810528e-05, + "loss": 0.7454, + "step": 8264 + }, + { + "epoch": 0.5599972897892811, + "grad_norm": 6.592947959899902, + "learning_rate": 8.971319049900746e-05, + "loss": 0.6618, + "step": 8265 + }, + { + "epoch": 0.5600650450572532, + "grad_norm": 7.502736568450928, + "learning_rate": 8.971182147990966e-05, + "loss": 0.8033, + "step": 8266 + }, + { + "epoch": 0.5601328003252253, + "grad_norm": 7.374648094177246, + "learning_rate": 8.971045246081184e-05, + "loss": 0.8319, + "step": 8267 + }, + { + "epoch": 0.5602005555931974, + "grad_norm": 7.346577167510986, + "learning_rate": 8.970908344171402e-05, + "loss": 0.9729, + "step": 8268 + }, + { + "epoch": 0.5602683108611695, + "grad_norm": 5.724850654602051, + "learning_rate": 8.97077144226162e-05, + "loss": 0.8124, + "step": 8269 + }, + { + "epoch": 0.5603360661291416, + "grad_norm": 7.397567272186279, + "learning_rate": 8.97063454035184e-05, + "loss": 1.2057, + "step": 8270 + }, + { + "epoch": 0.5604038213971136, + "grad_norm": 8.235245704650879, + "learning_rate": 8.970497638442057e-05, + "loss": 0.7024, + "step": 8271 + }, + { + "epoch": 0.5604715766650857, + "grad_norm": 6.514690399169922, + "learning_rate": 8.970360736532275e-05, + "loss": 0.7149, + "step": 8272 + }, + { + "epoch": 0.5605393319330578, + "grad_norm": 7.889196872711182, + "learning_rate": 8.970223834622493e-05, + "loss": 0.664, + "step": 8273 + }, + { + "epoch": 0.5606070872010299, + "grad_norm": 6.1832475662231445, + "learning_rate": 8.970086932712711e-05, + "loss": 0.8841, + "step": 8274 + }, + { + "epoch": 0.560674842469002, + "grad_norm": 5.73254919052124, + "learning_rate": 8.969950030802931e-05, + "loss": 0.8285, + "step": 8275 + }, + { + "epoch": 0.5607425977369741, + "grad_norm": 7.358850955963135, + "learning_rate": 8.969813128893149e-05, + "loss": 0.8508, + "step": 8276 + }, + { + "epoch": 0.5608103530049461, + "grad_norm": 7.491328716278076, + "learning_rate": 8.969676226983367e-05, + "loss": 1.0391, + "step": 8277 + }, + { + "epoch": 0.5608781082729182, + "grad_norm": 6.1154375076293945, + "learning_rate": 8.969539325073585e-05, + "loss": 0.8254, + "step": 8278 + }, + { + "epoch": 0.5609458635408903, + "grad_norm": 7.245144367218018, + "learning_rate": 8.969402423163804e-05, + "loss": 0.9224, + "step": 8279 + }, + { + "epoch": 0.5610136188088624, + "grad_norm": 7.001935958862305, + "learning_rate": 8.969265521254022e-05, + "loss": 0.6674, + "step": 8280 + }, + { + "epoch": 0.5610813740768344, + "grad_norm": 7.420647144317627, + "learning_rate": 8.96912861934424e-05, + "loss": 0.9413, + "step": 8281 + }, + { + "epoch": 0.5611491293448065, + "grad_norm": 8.350207328796387, + "learning_rate": 8.968991717434458e-05, + "loss": 1.0078, + "step": 8282 + }, + { + "epoch": 0.5612168846127786, + "grad_norm": 7.219613075256348, + "learning_rate": 8.968854815524676e-05, + "loss": 0.9071, + "step": 8283 + }, + { + "epoch": 0.5612846398807507, + "grad_norm": 6.428452014923096, + "learning_rate": 8.968717913614896e-05, + "loss": 1.0902, + "step": 8284 + }, + { + "epoch": 0.5613523951487228, + "grad_norm": 6.86848258972168, + "learning_rate": 8.968581011705114e-05, + "loss": 0.967, + "step": 8285 + }, + { + "epoch": 0.5614201504166949, + "grad_norm": 6.413376808166504, + "learning_rate": 8.968444109795332e-05, + "loss": 0.7595, + "step": 8286 + }, + { + "epoch": 0.561487905684667, + "grad_norm": 7.2994866371154785, + "learning_rate": 8.96830720788555e-05, + "loss": 1.176, + "step": 8287 + }, + { + "epoch": 0.5615556609526391, + "grad_norm": 6.439824104309082, + "learning_rate": 8.968170305975768e-05, + "loss": 0.8067, + "step": 8288 + }, + { + "epoch": 0.5616234162206112, + "grad_norm": 7.186101913452148, + "learning_rate": 8.968033404065987e-05, + "loss": 0.6857, + "step": 8289 + }, + { + "epoch": 0.5616911714885833, + "grad_norm": 7.133284091949463, + "learning_rate": 8.967896502156205e-05, + "loss": 0.9404, + "step": 8290 + }, + { + "epoch": 0.5617589267565554, + "grad_norm": 5.822772979736328, + "learning_rate": 8.967759600246423e-05, + "loss": 0.8007, + "step": 8291 + }, + { + "epoch": 0.5618266820245275, + "grad_norm": 6.281779766082764, + "learning_rate": 8.967622698336642e-05, + "loss": 0.8698, + "step": 8292 + }, + { + "epoch": 0.5618944372924994, + "grad_norm": 6.691267490386963, + "learning_rate": 8.967485796426861e-05, + "loss": 0.7307, + "step": 8293 + }, + { + "epoch": 0.5619621925604715, + "grad_norm": 8.061805725097656, + "learning_rate": 8.967348894517079e-05, + "loss": 0.9223, + "step": 8294 + }, + { + "epoch": 0.5620299478284436, + "grad_norm": 6.480556488037109, + "learning_rate": 8.967211992607297e-05, + "loss": 0.6982, + "step": 8295 + }, + { + "epoch": 0.5620977030964157, + "grad_norm": 6.673006534576416, + "learning_rate": 8.967075090697515e-05, + "loss": 0.9059, + "step": 8296 + }, + { + "epoch": 0.5621654583643878, + "grad_norm": 5.882975101470947, + "learning_rate": 8.966938188787733e-05, + "loss": 0.7252, + "step": 8297 + }, + { + "epoch": 0.5622332136323599, + "grad_norm": 7.131857872009277, + "learning_rate": 8.966801286877952e-05, + "loss": 0.8641, + "step": 8298 + }, + { + "epoch": 0.562300968900332, + "grad_norm": 6.564770698547363, + "learning_rate": 8.96666438496817e-05, + "loss": 0.8122, + "step": 8299 + }, + { + "epoch": 0.5623687241683041, + "grad_norm": 5.747824192047119, + "learning_rate": 8.966527483058388e-05, + "loss": 0.8257, + "step": 8300 + }, + { + "epoch": 0.5624364794362762, + "grad_norm": 5.892056465148926, + "learning_rate": 8.966390581148607e-05, + "loss": 0.5754, + "step": 8301 + }, + { + "epoch": 0.5625042347042483, + "grad_norm": 7.563606262207031, + "learning_rate": 8.966253679238826e-05, + "loss": 0.7932, + "step": 8302 + }, + { + "epoch": 0.5625719899722204, + "grad_norm": 6.839925289154053, + "learning_rate": 8.966116777329044e-05, + "loss": 0.8489, + "step": 8303 + }, + { + "epoch": 0.5626397452401924, + "grad_norm": 7.593649387359619, + "learning_rate": 8.965979875419262e-05, + "loss": 0.7083, + "step": 8304 + }, + { + "epoch": 0.5627075005081645, + "grad_norm": 6.513439178466797, + "learning_rate": 8.96584297350948e-05, + "loss": 0.8906, + "step": 8305 + }, + { + "epoch": 0.5627752557761366, + "grad_norm": 7.1831889152526855, + "learning_rate": 8.9657060715997e-05, + "loss": 1.1598, + "step": 8306 + }, + { + "epoch": 0.5628430110441087, + "grad_norm": 5.392392158508301, + "learning_rate": 8.965569169689917e-05, + "loss": 0.7419, + "step": 8307 + }, + { + "epoch": 0.5629107663120808, + "grad_norm": 5.79964017868042, + "learning_rate": 8.965432267780135e-05, + "loss": 0.5992, + "step": 8308 + }, + { + "epoch": 0.5629785215800529, + "grad_norm": 6.701503276824951, + "learning_rate": 8.965295365870355e-05, + "loss": 0.9259, + "step": 8309 + }, + { + "epoch": 0.5630462768480249, + "grad_norm": 7.373070240020752, + "learning_rate": 8.965158463960573e-05, + "loss": 0.7898, + "step": 8310 + }, + { + "epoch": 0.563114032115997, + "grad_norm": 5.849572658538818, + "learning_rate": 8.965021562050791e-05, + "loss": 0.6339, + "step": 8311 + }, + { + "epoch": 0.5631817873839691, + "grad_norm": 7.531026840209961, + "learning_rate": 8.96488466014101e-05, + "loss": 0.9883, + "step": 8312 + }, + { + "epoch": 0.5632495426519412, + "grad_norm": 6.910517692565918, + "learning_rate": 8.964747758231228e-05, + "loss": 0.8582, + "step": 8313 + }, + { + "epoch": 0.5633172979199133, + "grad_norm": 7.760013103485107, + "learning_rate": 8.964610856321446e-05, + "loss": 0.8696, + "step": 8314 + }, + { + "epoch": 0.5633850531878853, + "grad_norm": 5.592878341674805, + "learning_rate": 8.964473954411664e-05, + "loss": 0.7325, + "step": 8315 + }, + { + "epoch": 0.5634528084558574, + "grad_norm": 6.491446018218994, + "learning_rate": 8.964337052501884e-05, + "loss": 0.7607, + "step": 8316 + }, + { + "epoch": 0.5635205637238295, + "grad_norm": 5.725716590881348, + "learning_rate": 8.964200150592102e-05, + "loss": 0.928, + "step": 8317 + }, + { + "epoch": 0.5635883189918016, + "grad_norm": 6.139345169067383, + "learning_rate": 8.96406324868232e-05, + "loss": 0.9763, + "step": 8318 + }, + { + "epoch": 0.5636560742597737, + "grad_norm": 7.040091514587402, + "learning_rate": 8.963926346772538e-05, + "loss": 0.7395, + "step": 8319 + }, + { + "epoch": 0.5637238295277458, + "grad_norm": 5.59568977355957, + "learning_rate": 8.963789444862756e-05, + "loss": 0.7304, + "step": 8320 + }, + { + "epoch": 0.5637915847957179, + "grad_norm": 5.827798366546631, + "learning_rate": 8.963652542952975e-05, + "loss": 0.7063, + "step": 8321 + }, + { + "epoch": 0.56385934006369, + "grad_norm": 7.218785285949707, + "learning_rate": 8.963515641043193e-05, + "loss": 0.8324, + "step": 8322 + }, + { + "epoch": 0.5639270953316621, + "grad_norm": 6.695993423461914, + "learning_rate": 8.963378739133411e-05, + "loss": 0.7374, + "step": 8323 + }, + { + "epoch": 0.5639948505996342, + "grad_norm": 7.842550754547119, + "learning_rate": 8.96324183722363e-05, + "loss": 1.1097, + "step": 8324 + }, + { + "epoch": 0.5640626058676063, + "grad_norm": 10.275467872619629, + "learning_rate": 8.963104935313849e-05, + "loss": 0.974, + "step": 8325 + }, + { + "epoch": 0.5641303611355782, + "grad_norm": 5.597927093505859, + "learning_rate": 8.962968033404067e-05, + "loss": 0.8399, + "step": 8326 + }, + { + "epoch": 0.5641981164035503, + "grad_norm": 8.49342155456543, + "learning_rate": 8.962831131494285e-05, + "loss": 0.8277, + "step": 8327 + }, + { + "epoch": 0.5642658716715224, + "grad_norm": 5.775453567504883, + "learning_rate": 8.962694229584503e-05, + "loss": 0.7712, + "step": 8328 + }, + { + "epoch": 0.5643336269394945, + "grad_norm": 6.063767910003662, + "learning_rate": 8.962557327674721e-05, + "loss": 0.6816, + "step": 8329 + }, + { + "epoch": 0.5644013822074666, + "grad_norm": 6.82351541519165, + "learning_rate": 8.96242042576494e-05, + "loss": 0.7759, + "step": 8330 + }, + { + "epoch": 0.5644691374754387, + "grad_norm": 7.734725475311279, + "learning_rate": 8.962283523855158e-05, + "loss": 0.856, + "step": 8331 + }, + { + "epoch": 0.5645368927434108, + "grad_norm": 9.442476272583008, + "learning_rate": 8.962146621945376e-05, + "loss": 0.9022, + "step": 8332 + }, + { + "epoch": 0.5646046480113829, + "grad_norm": 4.6692352294921875, + "learning_rate": 8.962009720035594e-05, + "loss": 0.689, + "step": 8333 + }, + { + "epoch": 0.564672403279355, + "grad_norm": 7.143623352050781, + "learning_rate": 8.961872818125814e-05, + "loss": 0.7562, + "step": 8334 + }, + { + "epoch": 0.5647401585473271, + "grad_norm": 6.053460597991943, + "learning_rate": 8.961735916216032e-05, + "loss": 0.6182, + "step": 8335 + }, + { + "epoch": 0.5648079138152992, + "grad_norm": 6.414344310760498, + "learning_rate": 8.96159901430625e-05, + "loss": 1.0768, + "step": 8336 + }, + { + "epoch": 0.5648756690832712, + "grad_norm": 5.544490814208984, + "learning_rate": 8.961462112396468e-05, + "loss": 0.722, + "step": 8337 + }, + { + "epoch": 0.5649434243512433, + "grad_norm": 7.075659275054932, + "learning_rate": 8.961325210486686e-05, + "loss": 1.0114, + "step": 8338 + }, + { + "epoch": 0.5650111796192154, + "grad_norm": 6.1844024658203125, + "learning_rate": 8.961188308576905e-05, + "loss": 0.7534, + "step": 8339 + }, + { + "epoch": 0.5650789348871875, + "grad_norm": 7.791917324066162, + "learning_rate": 8.961051406667123e-05, + "loss": 0.9825, + "step": 8340 + }, + { + "epoch": 0.5651466901551596, + "grad_norm": 6.58551025390625, + "learning_rate": 8.960914504757341e-05, + "loss": 0.7715, + "step": 8341 + }, + { + "epoch": 0.5652144454231316, + "grad_norm": 6.225470066070557, + "learning_rate": 8.96077760284756e-05, + "loss": 0.7685, + "step": 8342 + }, + { + "epoch": 0.5652822006911037, + "grad_norm": 10.40294361114502, + "learning_rate": 8.960640700937778e-05, + "loss": 0.9553, + "step": 8343 + }, + { + "epoch": 0.5653499559590758, + "grad_norm": 5.97885274887085, + "learning_rate": 8.960503799027997e-05, + "loss": 0.7282, + "step": 8344 + }, + { + "epoch": 0.5654177112270479, + "grad_norm": 7.777407646179199, + "learning_rate": 8.960366897118215e-05, + "loss": 0.859, + "step": 8345 + }, + { + "epoch": 0.56548546649502, + "grad_norm": 6.5945892333984375, + "learning_rate": 8.960229995208433e-05, + "loss": 1.0263, + "step": 8346 + }, + { + "epoch": 0.565553221762992, + "grad_norm": 5.678567409515381, + "learning_rate": 8.960093093298651e-05, + "loss": 0.6956, + "step": 8347 + }, + { + "epoch": 0.5656209770309641, + "grad_norm": 6.3759846687316895, + "learning_rate": 8.95995619138887e-05, + "loss": 0.8164, + "step": 8348 + }, + { + "epoch": 0.5656887322989362, + "grad_norm": 6.627008438110352, + "learning_rate": 8.959819289479088e-05, + "loss": 0.9505, + "step": 8349 + }, + { + "epoch": 0.5657564875669083, + "grad_norm": 8.078243255615234, + "learning_rate": 8.959682387569306e-05, + "loss": 0.9061, + "step": 8350 + }, + { + "epoch": 0.5658242428348804, + "grad_norm": 5.019404888153076, + "learning_rate": 8.959545485659524e-05, + "loss": 0.7382, + "step": 8351 + }, + { + "epoch": 0.5658919981028525, + "grad_norm": 8.406132698059082, + "learning_rate": 8.959408583749744e-05, + "loss": 1.0107, + "step": 8352 + }, + { + "epoch": 0.5659597533708246, + "grad_norm": 6.881223678588867, + "learning_rate": 8.959271681839962e-05, + "loss": 0.7417, + "step": 8353 + }, + { + "epoch": 0.5660275086387967, + "grad_norm": 5.801870822906494, + "learning_rate": 8.95913477993018e-05, + "loss": 0.7736, + "step": 8354 + }, + { + "epoch": 0.5660952639067688, + "grad_norm": 5.533797740936279, + "learning_rate": 8.9589978780204e-05, + "loss": 0.9576, + "step": 8355 + }, + { + "epoch": 0.5661630191747409, + "grad_norm": 5.76112174987793, + "learning_rate": 8.958860976110617e-05, + "loss": 0.8774, + "step": 8356 + }, + { + "epoch": 0.566230774442713, + "grad_norm": 6.424200534820557, + "learning_rate": 8.958724074200835e-05, + "loss": 0.9104, + "step": 8357 + }, + { + "epoch": 0.566298529710685, + "grad_norm": 7.279561519622803, + "learning_rate": 8.958587172291055e-05, + "loss": 0.8566, + "step": 8358 + }, + { + "epoch": 0.566366284978657, + "grad_norm": 7.023104667663574, + "learning_rate": 8.958450270381273e-05, + "loss": 0.785, + "step": 8359 + }, + { + "epoch": 0.5664340402466291, + "grad_norm": 5.087419033050537, + "learning_rate": 8.958313368471491e-05, + "loss": 0.7379, + "step": 8360 + }, + { + "epoch": 0.5665017955146012, + "grad_norm": 6.156972885131836, + "learning_rate": 8.958176466561709e-05, + "loss": 0.9361, + "step": 8361 + }, + { + "epoch": 0.5665695507825733, + "grad_norm": 6.432037830352783, + "learning_rate": 8.958039564651928e-05, + "loss": 0.804, + "step": 8362 + }, + { + "epoch": 0.5666373060505454, + "grad_norm": 6.592261791229248, + "learning_rate": 8.957902662742146e-05, + "loss": 0.8668, + "step": 8363 + }, + { + "epoch": 0.5667050613185175, + "grad_norm": 6.8097944259643555, + "learning_rate": 8.957765760832364e-05, + "loss": 0.8852, + "step": 8364 + }, + { + "epoch": 0.5667728165864896, + "grad_norm": 6.4310078620910645, + "learning_rate": 8.957628858922582e-05, + "loss": 0.9113, + "step": 8365 + }, + { + "epoch": 0.5668405718544617, + "grad_norm": 4.992232322692871, + "learning_rate": 8.957491957012802e-05, + "loss": 0.7373, + "step": 8366 + }, + { + "epoch": 0.5669083271224338, + "grad_norm": 6.666651725769043, + "learning_rate": 8.95735505510302e-05, + "loss": 0.9169, + "step": 8367 + }, + { + "epoch": 0.5669760823904059, + "grad_norm": 6.658245086669922, + "learning_rate": 8.957218153193238e-05, + "loss": 0.7536, + "step": 8368 + }, + { + "epoch": 0.567043837658378, + "grad_norm": 7.354119300842285, + "learning_rate": 8.957081251283456e-05, + "loss": 0.9655, + "step": 8369 + }, + { + "epoch": 0.56711159292635, + "grad_norm": 7.857820510864258, + "learning_rate": 8.956944349373674e-05, + "loss": 1.0672, + "step": 8370 + }, + { + "epoch": 0.5671793481943221, + "grad_norm": 9.102689743041992, + "learning_rate": 8.956807447463893e-05, + "loss": 1.1671, + "step": 8371 + }, + { + "epoch": 0.5672471034622942, + "grad_norm": 7.795739650726318, + "learning_rate": 8.956670545554111e-05, + "loss": 0.8685, + "step": 8372 + }, + { + "epoch": 0.5673148587302663, + "grad_norm": 9.154378890991211, + "learning_rate": 8.95653364364433e-05, + "loss": 0.9891, + "step": 8373 + }, + { + "epoch": 0.5673826139982384, + "grad_norm": 5.6434645652771, + "learning_rate": 8.956396741734547e-05, + "loss": 0.7481, + "step": 8374 + }, + { + "epoch": 0.5674503692662104, + "grad_norm": 4.90270471572876, + "learning_rate": 8.956259839824765e-05, + "loss": 0.5697, + "step": 8375 + }, + { + "epoch": 0.5675181245341825, + "grad_norm": 6.498236656188965, + "learning_rate": 8.956122937914985e-05, + "loss": 0.8037, + "step": 8376 + }, + { + "epoch": 0.5675858798021546, + "grad_norm": 6.723139762878418, + "learning_rate": 8.955986036005203e-05, + "loss": 0.7044, + "step": 8377 + }, + { + "epoch": 0.5676536350701267, + "grad_norm": 6.958408832550049, + "learning_rate": 8.955849134095421e-05, + "loss": 1.2551, + "step": 8378 + }, + { + "epoch": 0.5677213903380988, + "grad_norm": 5.749763488769531, + "learning_rate": 8.955712232185639e-05, + "loss": 0.6988, + "step": 8379 + }, + { + "epoch": 0.5677891456060709, + "grad_norm": 8.176393508911133, + "learning_rate": 8.955575330275858e-05, + "loss": 0.9854, + "step": 8380 + }, + { + "epoch": 0.567856900874043, + "grad_norm": 6.546833038330078, + "learning_rate": 8.955438428366076e-05, + "loss": 0.6731, + "step": 8381 + }, + { + "epoch": 0.567924656142015, + "grad_norm": 5.418013095855713, + "learning_rate": 8.955301526456294e-05, + "loss": 0.6531, + "step": 8382 + }, + { + "epoch": 0.5679924114099871, + "grad_norm": 5.538142681121826, + "learning_rate": 8.955164624546512e-05, + "loss": 0.6456, + "step": 8383 + }, + { + "epoch": 0.5680601666779592, + "grad_norm": 7.09976053237915, + "learning_rate": 8.95502772263673e-05, + "loss": 0.9614, + "step": 8384 + }, + { + "epoch": 0.5681279219459313, + "grad_norm": 6.105874061584473, + "learning_rate": 8.95489082072695e-05, + "loss": 0.6241, + "step": 8385 + }, + { + "epoch": 0.5681956772139034, + "grad_norm": 6.9980998039245605, + "learning_rate": 8.954753918817168e-05, + "loss": 0.9669, + "step": 8386 + }, + { + "epoch": 0.5682634324818755, + "grad_norm": 6.8302764892578125, + "learning_rate": 8.954617016907386e-05, + "loss": 0.6397, + "step": 8387 + }, + { + "epoch": 0.5683311877498476, + "grad_norm": 7.777739524841309, + "learning_rate": 8.954480114997604e-05, + "loss": 1.0691, + "step": 8388 + }, + { + "epoch": 0.5683989430178197, + "grad_norm": 7.322915554046631, + "learning_rate": 8.954343213087823e-05, + "loss": 0.9179, + "step": 8389 + }, + { + "epoch": 0.5684666982857918, + "grad_norm": 8.225046157836914, + "learning_rate": 8.954206311178041e-05, + "loss": 1.1727, + "step": 8390 + }, + { + "epoch": 0.5685344535537638, + "grad_norm": 5.573197364807129, + "learning_rate": 8.95406940926826e-05, + "loss": 0.9111, + "step": 8391 + }, + { + "epoch": 0.5686022088217358, + "grad_norm": 5.810042381286621, + "learning_rate": 8.953932507358477e-05, + "loss": 0.7115, + "step": 8392 + }, + { + "epoch": 0.5686699640897079, + "grad_norm": 7.172820091247559, + "learning_rate": 8.953795605448695e-05, + "loss": 1.0404, + "step": 8393 + }, + { + "epoch": 0.56873771935768, + "grad_norm": 8.901079177856445, + "learning_rate": 8.953658703538915e-05, + "loss": 1.164, + "step": 8394 + }, + { + "epoch": 0.5688054746256521, + "grad_norm": 5.692160129547119, + "learning_rate": 8.953521801629133e-05, + "loss": 0.7094, + "step": 8395 + }, + { + "epoch": 0.5688732298936242, + "grad_norm": 5.794686317443848, + "learning_rate": 8.953384899719351e-05, + "loss": 0.7625, + "step": 8396 + }, + { + "epoch": 0.5689409851615963, + "grad_norm": 6.330542087554932, + "learning_rate": 8.953247997809569e-05, + "loss": 0.8826, + "step": 8397 + }, + { + "epoch": 0.5690087404295684, + "grad_norm": 6.6873698234558105, + "learning_rate": 8.953111095899788e-05, + "loss": 0.8109, + "step": 8398 + }, + { + "epoch": 0.5690764956975405, + "grad_norm": 5.331307411193848, + "learning_rate": 8.952974193990006e-05, + "loss": 0.6919, + "step": 8399 + }, + { + "epoch": 0.5691442509655126, + "grad_norm": 6.942624568939209, + "learning_rate": 8.952837292080224e-05, + "loss": 0.9336, + "step": 8400 + }, + { + "epoch": 0.5692120062334847, + "grad_norm": 5.526673793792725, + "learning_rate": 8.952700390170444e-05, + "loss": 0.6552, + "step": 8401 + }, + { + "epoch": 0.5692797615014568, + "grad_norm": 6.674069404602051, + "learning_rate": 8.952563488260662e-05, + "loss": 0.7824, + "step": 8402 + }, + { + "epoch": 0.5693475167694289, + "grad_norm": 7.103878498077393, + "learning_rate": 8.95242658635088e-05, + "loss": 0.7983, + "step": 8403 + }, + { + "epoch": 0.5694152720374009, + "grad_norm": 5.963317394256592, + "learning_rate": 8.952289684441099e-05, + "loss": 0.7804, + "step": 8404 + }, + { + "epoch": 0.569483027305373, + "grad_norm": 5.7701215744018555, + "learning_rate": 8.952152782531317e-05, + "loss": 0.6684, + "step": 8405 + }, + { + "epoch": 0.5695507825733451, + "grad_norm": 6.809589862823486, + "learning_rate": 8.952015880621535e-05, + "loss": 0.8358, + "step": 8406 + }, + { + "epoch": 0.5696185378413171, + "grad_norm": 5.772964954376221, + "learning_rate": 8.951878978711753e-05, + "loss": 0.8283, + "step": 8407 + }, + { + "epoch": 0.5696862931092892, + "grad_norm": 5.902827262878418, + "learning_rate": 8.951742076801973e-05, + "loss": 0.9207, + "step": 8408 + }, + { + "epoch": 0.5697540483772613, + "grad_norm": 6.312051296234131, + "learning_rate": 8.951605174892191e-05, + "loss": 0.8424, + "step": 8409 + }, + { + "epoch": 0.5698218036452334, + "grad_norm": 6.687371730804443, + "learning_rate": 8.951468272982409e-05, + "loss": 0.9563, + "step": 8410 + }, + { + "epoch": 0.5698895589132055, + "grad_norm": 5.801156520843506, + "learning_rate": 8.951331371072627e-05, + "loss": 0.898, + "step": 8411 + }, + { + "epoch": 0.5699573141811776, + "grad_norm": 5.955209255218506, + "learning_rate": 8.951194469162846e-05, + "loss": 0.9062, + "step": 8412 + }, + { + "epoch": 0.5700250694491497, + "grad_norm": 6.845731258392334, + "learning_rate": 8.951057567253064e-05, + "loss": 0.7345, + "step": 8413 + }, + { + "epoch": 0.5700928247171217, + "grad_norm": 7.945831298828125, + "learning_rate": 8.950920665343282e-05, + "loss": 1.1156, + "step": 8414 + }, + { + "epoch": 0.5701605799850938, + "grad_norm": 6.423880100250244, + "learning_rate": 8.9507837634335e-05, + "loss": 0.886, + "step": 8415 + }, + { + "epoch": 0.5702283352530659, + "grad_norm": 6.089376449584961, + "learning_rate": 8.950646861523718e-05, + "loss": 0.7217, + "step": 8416 + }, + { + "epoch": 0.570296090521038, + "grad_norm": 7.061302661895752, + "learning_rate": 8.950509959613938e-05, + "loss": 1.0641, + "step": 8417 + }, + { + "epoch": 0.5703638457890101, + "grad_norm": 7.304849147796631, + "learning_rate": 8.950373057704156e-05, + "loss": 0.8099, + "step": 8418 + }, + { + "epoch": 0.5704316010569822, + "grad_norm": 7.116214752197266, + "learning_rate": 8.950236155794374e-05, + "loss": 0.7974, + "step": 8419 + }, + { + "epoch": 0.5704993563249543, + "grad_norm": 6.840792655944824, + "learning_rate": 8.950099253884592e-05, + "loss": 0.7558, + "step": 8420 + }, + { + "epoch": 0.5705671115929264, + "grad_norm": 5.458795547485352, + "learning_rate": 8.94996235197481e-05, + "loss": 0.7173, + "step": 8421 + }, + { + "epoch": 0.5706348668608985, + "grad_norm": 7.809031963348389, + "learning_rate": 8.949825450065029e-05, + "loss": 0.7751, + "step": 8422 + }, + { + "epoch": 0.5707026221288706, + "grad_norm": 7.147863388061523, + "learning_rate": 8.949688548155247e-05, + "loss": 0.7824, + "step": 8423 + }, + { + "epoch": 0.5707703773968426, + "grad_norm": 7.230483055114746, + "learning_rate": 8.949551646245465e-05, + "loss": 1.0771, + "step": 8424 + }, + { + "epoch": 0.5708381326648146, + "grad_norm": 7.998863697052002, + "learning_rate": 8.949414744335683e-05, + "loss": 0.8166, + "step": 8425 + }, + { + "epoch": 0.5709058879327867, + "grad_norm": 6.230087757110596, + "learning_rate": 8.949277842425903e-05, + "loss": 0.8565, + "step": 8426 + }, + { + "epoch": 0.5709736432007588, + "grad_norm": 5.557027816772461, + "learning_rate": 8.949140940516121e-05, + "loss": 0.9794, + "step": 8427 + }, + { + "epoch": 0.5710413984687309, + "grad_norm": 6.321481704711914, + "learning_rate": 8.949004038606339e-05, + "loss": 0.8254, + "step": 8428 + }, + { + "epoch": 0.571109153736703, + "grad_norm": 4.4708428382873535, + "learning_rate": 8.948867136696557e-05, + "loss": 0.8051, + "step": 8429 + }, + { + "epoch": 0.5711769090046751, + "grad_norm": 6.521273612976074, + "learning_rate": 8.948730234786775e-05, + "loss": 0.6903, + "step": 8430 + }, + { + "epoch": 0.5712446642726472, + "grad_norm": 7.250330448150635, + "learning_rate": 8.948593332876994e-05, + "loss": 1.0477, + "step": 8431 + }, + { + "epoch": 0.5713124195406193, + "grad_norm": 6.6420207023620605, + "learning_rate": 8.948456430967212e-05, + "loss": 0.7598, + "step": 8432 + }, + { + "epoch": 0.5713801748085914, + "grad_norm": 5.907214164733887, + "learning_rate": 8.94831952905743e-05, + "loss": 0.8957, + "step": 8433 + }, + { + "epoch": 0.5714479300765635, + "grad_norm": 7.927393913269043, + "learning_rate": 8.948182627147648e-05, + "loss": 0.814, + "step": 8434 + }, + { + "epoch": 0.5715156853445356, + "grad_norm": 7.30061674118042, + "learning_rate": 8.948045725237868e-05, + "loss": 1.0433, + "step": 8435 + }, + { + "epoch": 0.5715834406125077, + "grad_norm": 6.409942150115967, + "learning_rate": 8.947908823328086e-05, + "loss": 1.1921, + "step": 8436 + }, + { + "epoch": 0.5716511958804797, + "grad_norm": 6.589274883270264, + "learning_rate": 8.947771921418304e-05, + "loss": 0.9055, + "step": 8437 + }, + { + "epoch": 0.5717189511484518, + "grad_norm": 6.010043144226074, + "learning_rate": 8.947635019508522e-05, + "loss": 0.8581, + "step": 8438 + }, + { + "epoch": 0.5717867064164239, + "grad_norm": 5.75832462310791, + "learning_rate": 8.94749811759874e-05, + "loss": 0.7936, + "step": 8439 + }, + { + "epoch": 0.5718544616843959, + "grad_norm": 5.600092887878418, + "learning_rate": 8.94736121568896e-05, + "loss": 0.6994, + "step": 8440 + }, + { + "epoch": 0.571922216952368, + "grad_norm": 5.774161338806152, + "learning_rate": 8.947224313779177e-05, + "loss": 0.8647, + "step": 8441 + }, + { + "epoch": 0.5719899722203401, + "grad_norm": 6.7266950607299805, + "learning_rate": 8.947087411869395e-05, + "loss": 1.0754, + "step": 8442 + }, + { + "epoch": 0.5720577274883122, + "grad_norm": 7.049251556396484, + "learning_rate": 8.946950509959613e-05, + "loss": 0.7269, + "step": 8443 + }, + { + "epoch": 0.5721254827562843, + "grad_norm": 7.4380879402160645, + "learning_rate": 8.946813608049833e-05, + "loss": 0.8803, + "step": 8444 + }, + { + "epoch": 0.5721932380242564, + "grad_norm": 6.558119297027588, + "learning_rate": 8.946676706140051e-05, + "loss": 0.774, + "step": 8445 + }, + { + "epoch": 0.5722609932922285, + "grad_norm": 6.586884021759033, + "learning_rate": 8.946539804230269e-05, + "loss": 0.8607, + "step": 8446 + }, + { + "epoch": 0.5723287485602006, + "grad_norm": 7.708719253540039, + "learning_rate": 8.946402902320487e-05, + "loss": 0.9618, + "step": 8447 + }, + { + "epoch": 0.5723965038281726, + "grad_norm": 7.740994930267334, + "learning_rate": 8.946266000410706e-05, + "loss": 0.6727, + "step": 8448 + }, + { + "epoch": 0.5724642590961447, + "grad_norm": 6.2427287101745605, + "learning_rate": 8.946129098500924e-05, + "loss": 0.8622, + "step": 8449 + }, + { + "epoch": 0.5725320143641168, + "grad_norm": 8.826958656311035, + "learning_rate": 8.945992196591142e-05, + "loss": 0.8872, + "step": 8450 + }, + { + "epoch": 0.5725997696320889, + "grad_norm": 7.137535572052002, + "learning_rate": 8.945855294681362e-05, + "loss": 0.7906, + "step": 8451 + }, + { + "epoch": 0.572667524900061, + "grad_norm": 6.379401206970215, + "learning_rate": 8.94571839277158e-05, + "loss": 0.9356, + "step": 8452 + }, + { + "epoch": 0.5727352801680331, + "grad_norm": 6.800835609436035, + "learning_rate": 8.945581490861798e-05, + "loss": 0.7702, + "step": 8453 + }, + { + "epoch": 0.5728030354360052, + "grad_norm": 6.04707670211792, + "learning_rate": 8.945444588952017e-05, + "loss": 0.7877, + "step": 8454 + }, + { + "epoch": 0.5728707907039773, + "grad_norm": 6.483813285827637, + "learning_rate": 8.945307687042235e-05, + "loss": 1.0264, + "step": 8455 + }, + { + "epoch": 0.5729385459719493, + "grad_norm": 7.314120292663574, + "learning_rate": 8.945170785132453e-05, + "loss": 1.2052, + "step": 8456 + }, + { + "epoch": 0.5730063012399214, + "grad_norm": 8.42261791229248, + "learning_rate": 8.945033883222671e-05, + "loss": 0.963, + "step": 8457 + }, + { + "epoch": 0.5730740565078934, + "grad_norm": 5.949848175048828, + "learning_rate": 8.944896981312891e-05, + "loss": 0.7765, + "step": 8458 + }, + { + "epoch": 0.5731418117758655, + "grad_norm": 8.215453147888184, + "learning_rate": 8.944760079403109e-05, + "loss": 0.9526, + "step": 8459 + }, + { + "epoch": 0.5732095670438376, + "grad_norm": 7.214929580688477, + "learning_rate": 8.944623177493327e-05, + "loss": 0.9869, + "step": 8460 + }, + { + "epoch": 0.5732773223118097, + "grad_norm": 7.484433174133301, + "learning_rate": 8.944486275583545e-05, + "loss": 0.9432, + "step": 8461 + }, + { + "epoch": 0.5733450775797818, + "grad_norm": 6.356048107147217, + "learning_rate": 8.944349373673763e-05, + "loss": 0.8457, + "step": 8462 + }, + { + "epoch": 0.5734128328477539, + "grad_norm": 6.3201117515563965, + "learning_rate": 8.944212471763982e-05, + "loss": 0.8232, + "step": 8463 + }, + { + "epoch": 0.573480588115726, + "grad_norm": 7.538288593292236, + "learning_rate": 8.9440755698542e-05, + "loss": 0.655, + "step": 8464 + }, + { + "epoch": 0.5735483433836981, + "grad_norm": 6.674449920654297, + "learning_rate": 8.943938667944418e-05, + "loss": 0.7804, + "step": 8465 + }, + { + "epoch": 0.5736160986516702, + "grad_norm": 6.578172206878662, + "learning_rate": 8.943801766034636e-05, + "loss": 0.7174, + "step": 8466 + }, + { + "epoch": 0.5736838539196423, + "grad_norm": 5.858245372772217, + "learning_rate": 8.943664864124856e-05, + "loss": 0.7236, + "step": 8467 + }, + { + "epoch": 0.5737516091876144, + "grad_norm": 5.510379791259766, + "learning_rate": 8.943527962215074e-05, + "loss": 0.7767, + "step": 8468 + }, + { + "epoch": 0.5738193644555865, + "grad_norm": 6.925917148590088, + "learning_rate": 8.943391060305292e-05, + "loss": 1.0637, + "step": 8469 + }, + { + "epoch": 0.5738871197235585, + "grad_norm": 5.841418266296387, + "learning_rate": 8.94325415839551e-05, + "loss": 0.7703, + "step": 8470 + }, + { + "epoch": 0.5739548749915306, + "grad_norm": 5.818962097167969, + "learning_rate": 8.943117256485728e-05, + "loss": 0.8358, + "step": 8471 + }, + { + "epoch": 0.5740226302595027, + "grad_norm": 4.567956924438477, + "learning_rate": 8.942980354575947e-05, + "loss": 0.7853, + "step": 8472 + }, + { + "epoch": 0.5740903855274747, + "grad_norm": 7.374995231628418, + "learning_rate": 8.942843452666165e-05, + "loss": 0.7623, + "step": 8473 + }, + { + "epoch": 0.5741581407954468, + "grad_norm": 6.090158939361572, + "learning_rate": 8.942706550756383e-05, + "loss": 0.617, + "step": 8474 + }, + { + "epoch": 0.5742258960634189, + "grad_norm": 7.125964641571045, + "learning_rate": 8.942569648846601e-05, + "loss": 0.9063, + "step": 8475 + }, + { + "epoch": 0.574293651331391, + "grad_norm": 6.784455299377441, + "learning_rate": 8.94243274693682e-05, + "loss": 0.8758, + "step": 8476 + }, + { + "epoch": 0.5743614065993631, + "grad_norm": 6.138723373413086, + "learning_rate": 8.942295845027039e-05, + "loss": 0.9269, + "step": 8477 + }, + { + "epoch": 0.5744291618673352, + "grad_norm": 7.604799747467041, + "learning_rate": 8.942158943117257e-05, + "loss": 0.9721, + "step": 8478 + }, + { + "epoch": 0.5744969171353073, + "grad_norm": 7.989227771759033, + "learning_rate": 8.942022041207475e-05, + "loss": 0.9722, + "step": 8479 + }, + { + "epoch": 0.5745646724032794, + "grad_norm": 5.422422885894775, + "learning_rate": 8.941885139297693e-05, + "loss": 0.6328, + "step": 8480 + }, + { + "epoch": 0.5746324276712514, + "grad_norm": 5.384946346282959, + "learning_rate": 8.941748237387912e-05, + "loss": 0.677, + "step": 8481 + }, + { + "epoch": 0.5747001829392235, + "grad_norm": 6.695030212402344, + "learning_rate": 8.94161133547813e-05, + "loss": 0.8051, + "step": 8482 + }, + { + "epoch": 0.5747679382071956, + "grad_norm": 6.037063121795654, + "learning_rate": 8.941474433568348e-05, + "loss": 0.7381, + "step": 8483 + }, + { + "epoch": 0.5748356934751677, + "grad_norm": 5.64218807220459, + "learning_rate": 8.941337531658566e-05, + "loss": 0.943, + "step": 8484 + }, + { + "epoch": 0.5749034487431398, + "grad_norm": 7.324346542358398, + "learning_rate": 8.941200629748784e-05, + "loss": 0.8886, + "step": 8485 + }, + { + "epoch": 0.5749712040111119, + "grad_norm": 7.247791767120361, + "learning_rate": 8.941063727839004e-05, + "loss": 0.7939, + "step": 8486 + }, + { + "epoch": 0.575038959279084, + "grad_norm": 6.780027866363525, + "learning_rate": 8.940926825929222e-05, + "loss": 0.9357, + "step": 8487 + }, + { + "epoch": 0.5751067145470561, + "grad_norm": 6.983544826507568, + "learning_rate": 8.94078992401944e-05, + "loss": 0.73, + "step": 8488 + }, + { + "epoch": 0.5751744698150281, + "grad_norm": 5.811152458190918, + "learning_rate": 8.940653022109658e-05, + "loss": 0.875, + "step": 8489 + }, + { + "epoch": 0.5752422250830002, + "grad_norm": 5.372483253479004, + "learning_rate": 8.940516120199877e-05, + "loss": 0.8868, + "step": 8490 + }, + { + "epoch": 0.5753099803509722, + "grad_norm": 7.48350191116333, + "learning_rate": 8.940379218290095e-05, + "loss": 1.0441, + "step": 8491 + }, + { + "epoch": 0.5753777356189443, + "grad_norm": 4.902670383453369, + "learning_rate": 8.940242316380313e-05, + "loss": 0.6726, + "step": 8492 + }, + { + "epoch": 0.5754454908869164, + "grad_norm": 5.900030136108398, + "learning_rate": 8.940105414470531e-05, + "loss": 0.799, + "step": 8493 + }, + { + "epoch": 0.5755132461548885, + "grad_norm": 7.37599515914917, + "learning_rate": 8.939968512560751e-05, + "loss": 1.0577, + "step": 8494 + }, + { + "epoch": 0.5755810014228606, + "grad_norm": 5.871433734893799, + "learning_rate": 8.939831610650969e-05, + "loss": 0.9335, + "step": 8495 + }, + { + "epoch": 0.5756487566908327, + "grad_norm": 7.249385833740234, + "learning_rate": 8.939694708741187e-05, + "loss": 0.8244, + "step": 8496 + }, + { + "epoch": 0.5757165119588048, + "grad_norm": 7.102729320526123, + "learning_rate": 8.939557806831406e-05, + "loss": 0.7431, + "step": 8497 + }, + { + "epoch": 0.5757842672267769, + "grad_norm": 6.558697700500488, + "learning_rate": 8.939420904921624e-05, + "loss": 0.7765, + "step": 8498 + }, + { + "epoch": 0.575852022494749, + "grad_norm": 6.82787561416626, + "learning_rate": 8.939284003011842e-05, + "loss": 0.9449, + "step": 8499 + }, + { + "epoch": 0.5759197777627211, + "grad_norm": 7.055886268615723, + "learning_rate": 8.939147101102062e-05, + "loss": 0.8397, + "step": 8500 + }, + { + "epoch": 0.5759875330306932, + "grad_norm": 7.750473976135254, + "learning_rate": 8.93901019919228e-05, + "loss": 0.8001, + "step": 8501 + }, + { + "epoch": 0.5760552882986653, + "grad_norm": 9.581280708312988, + "learning_rate": 8.938873297282498e-05, + "loss": 0.7048, + "step": 8502 + }, + { + "epoch": 0.5761230435666373, + "grad_norm": 6.592195510864258, + "learning_rate": 8.938736395372716e-05, + "loss": 0.9294, + "step": 8503 + }, + { + "epoch": 0.5761907988346094, + "grad_norm": 5.0412797927856445, + "learning_rate": 8.938599493462935e-05, + "loss": 0.6723, + "step": 8504 + }, + { + "epoch": 0.5762585541025814, + "grad_norm": 6.099496364593506, + "learning_rate": 8.938462591553153e-05, + "loss": 0.8038, + "step": 8505 + }, + { + "epoch": 0.5763263093705535, + "grad_norm": 6.402875900268555, + "learning_rate": 8.938325689643371e-05, + "loss": 0.7956, + "step": 8506 + }, + { + "epoch": 0.5763940646385256, + "grad_norm": 6.937565326690674, + "learning_rate": 8.938188787733589e-05, + "loss": 0.9354, + "step": 8507 + }, + { + "epoch": 0.5764618199064977, + "grad_norm": 7.218740940093994, + "learning_rate": 8.938051885823807e-05, + "loss": 0.8022, + "step": 8508 + }, + { + "epoch": 0.5765295751744698, + "grad_norm": 7.368991851806641, + "learning_rate": 8.937914983914027e-05, + "loss": 1.0043, + "step": 8509 + }, + { + "epoch": 0.5765973304424419, + "grad_norm": 5.916724681854248, + "learning_rate": 8.937778082004245e-05, + "loss": 0.7391, + "step": 8510 + }, + { + "epoch": 0.576665085710414, + "grad_norm": 6.9001007080078125, + "learning_rate": 8.937641180094463e-05, + "loss": 0.9798, + "step": 8511 + }, + { + "epoch": 0.5767328409783861, + "grad_norm": 6.652282238006592, + "learning_rate": 8.937504278184681e-05, + "loss": 0.9313, + "step": 8512 + }, + { + "epoch": 0.5768005962463582, + "grad_norm": 7.920009613037109, + "learning_rate": 8.9373673762749e-05, + "loss": 0.8018, + "step": 8513 + }, + { + "epoch": 0.5768683515143302, + "grad_norm": 7.468683242797852, + "learning_rate": 8.937230474365118e-05, + "loss": 0.8666, + "step": 8514 + }, + { + "epoch": 0.5769361067823023, + "grad_norm": 5.6963653564453125, + "learning_rate": 8.937093572455336e-05, + "loss": 0.8645, + "step": 8515 + }, + { + "epoch": 0.5770038620502744, + "grad_norm": 9.493772506713867, + "learning_rate": 8.936956670545554e-05, + "loss": 0.7124, + "step": 8516 + }, + { + "epoch": 0.5770716173182465, + "grad_norm": 6.918614864349365, + "learning_rate": 8.936819768635772e-05, + "loss": 0.7712, + "step": 8517 + }, + { + "epoch": 0.5771393725862186, + "grad_norm": 5.827054977416992, + "learning_rate": 8.936682866725992e-05, + "loss": 0.7484, + "step": 8518 + }, + { + "epoch": 0.5772071278541907, + "grad_norm": 4.4998626708984375, + "learning_rate": 8.93654596481621e-05, + "loss": 0.7199, + "step": 8519 + }, + { + "epoch": 0.5772748831221628, + "grad_norm": 5.738202095031738, + "learning_rate": 8.936409062906428e-05, + "loss": 0.9027, + "step": 8520 + }, + { + "epoch": 0.5773426383901349, + "grad_norm": 6.771662712097168, + "learning_rate": 8.936272160996646e-05, + "loss": 0.7006, + "step": 8521 + }, + { + "epoch": 0.5774103936581069, + "grad_norm": 6.145763397216797, + "learning_rate": 8.936135259086865e-05, + "loss": 0.927, + "step": 8522 + }, + { + "epoch": 0.577478148926079, + "grad_norm": 6.981403827667236, + "learning_rate": 8.935998357177083e-05, + "loss": 0.8556, + "step": 8523 + }, + { + "epoch": 0.577545904194051, + "grad_norm": 6.167423725128174, + "learning_rate": 8.935861455267301e-05, + "loss": 0.643, + "step": 8524 + }, + { + "epoch": 0.5776136594620231, + "grad_norm": 6.135037899017334, + "learning_rate": 8.93572455335752e-05, + "loss": 0.8866, + "step": 8525 + }, + { + "epoch": 0.5776814147299952, + "grad_norm": 5.348565101623535, + "learning_rate": 8.935587651447737e-05, + "loss": 0.7559, + "step": 8526 + }, + { + "epoch": 0.5777491699979673, + "grad_norm": 5.95448637008667, + "learning_rate": 8.935450749537957e-05, + "loss": 0.6934, + "step": 8527 + }, + { + "epoch": 0.5778169252659394, + "grad_norm": 6.3629961013793945, + "learning_rate": 8.935313847628175e-05, + "loss": 0.7634, + "step": 8528 + }, + { + "epoch": 0.5778846805339115, + "grad_norm": 6.271450996398926, + "learning_rate": 8.935176945718393e-05, + "loss": 0.8269, + "step": 8529 + }, + { + "epoch": 0.5779524358018836, + "grad_norm": 5.41464900970459, + "learning_rate": 8.935040043808611e-05, + "loss": 0.5833, + "step": 8530 + }, + { + "epoch": 0.5780201910698557, + "grad_norm": 10.096611022949219, + "learning_rate": 8.934903141898829e-05, + "loss": 0.7474, + "step": 8531 + }, + { + "epoch": 0.5780879463378278, + "grad_norm": 7.8773393630981445, + "learning_rate": 8.934766239989048e-05, + "loss": 0.894, + "step": 8532 + }, + { + "epoch": 0.5781557016057999, + "grad_norm": 6.695224761962891, + "learning_rate": 8.934629338079266e-05, + "loss": 0.8072, + "step": 8533 + }, + { + "epoch": 0.578223456873772, + "grad_norm": 6.079283237457275, + "learning_rate": 8.934492436169484e-05, + "loss": 0.6978, + "step": 8534 + }, + { + "epoch": 0.5782912121417441, + "grad_norm": 6.053205490112305, + "learning_rate": 8.934355534259702e-05, + "loss": 0.8264, + "step": 8535 + }, + { + "epoch": 0.5783589674097162, + "grad_norm": 6.483332633972168, + "learning_rate": 8.934218632349922e-05, + "loss": 0.9318, + "step": 8536 + }, + { + "epoch": 0.5784267226776882, + "grad_norm": 7.496954441070557, + "learning_rate": 8.93408173044014e-05, + "loss": 0.69, + "step": 8537 + }, + { + "epoch": 0.5784944779456602, + "grad_norm": 6.839014530181885, + "learning_rate": 8.933944828530358e-05, + "loss": 0.8462, + "step": 8538 + }, + { + "epoch": 0.5785622332136323, + "grad_norm": 7.050039291381836, + "learning_rate": 8.933807926620576e-05, + "loss": 1.0005, + "step": 8539 + }, + { + "epoch": 0.5786299884816044, + "grad_norm": 6.284921169281006, + "learning_rate": 8.933671024710795e-05, + "loss": 0.6977, + "step": 8540 + }, + { + "epoch": 0.5786977437495765, + "grad_norm": 6.051140308380127, + "learning_rate": 8.933534122801013e-05, + "loss": 0.9931, + "step": 8541 + }, + { + "epoch": 0.5787654990175486, + "grad_norm": 6.233377933502197, + "learning_rate": 8.933397220891231e-05, + "loss": 0.6643, + "step": 8542 + }, + { + "epoch": 0.5788332542855207, + "grad_norm": 5.1774678230285645, + "learning_rate": 8.933260318981451e-05, + "loss": 0.8994, + "step": 8543 + }, + { + "epoch": 0.5789010095534928, + "grad_norm": 5.453426837921143, + "learning_rate": 8.933123417071669e-05, + "loss": 0.9404, + "step": 8544 + }, + { + "epoch": 0.5789687648214649, + "grad_norm": 6.1316609382629395, + "learning_rate": 8.932986515161887e-05, + "loss": 0.8306, + "step": 8545 + }, + { + "epoch": 0.579036520089437, + "grad_norm": 7.448431968688965, + "learning_rate": 8.932849613252106e-05, + "loss": 0.8259, + "step": 8546 + }, + { + "epoch": 0.579104275357409, + "grad_norm": 8.094486236572266, + "learning_rate": 8.932712711342324e-05, + "loss": 1.011, + "step": 8547 + }, + { + "epoch": 0.5791720306253811, + "grad_norm": 4.893822193145752, + "learning_rate": 8.932575809432542e-05, + "loss": 0.8489, + "step": 8548 + }, + { + "epoch": 0.5792397858933532, + "grad_norm": 5.577503204345703, + "learning_rate": 8.93243890752276e-05, + "loss": 0.8933, + "step": 8549 + }, + { + "epoch": 0.5793075411613253, + "grad_norm": 5.874213218688965, + "learning_rate": 8.93230200561298e-05, + "loss": 1.0447, + "step": 8550 + }, + { + "epoch": 0.5793752964292974, + "grad_norm": 7.764857769012451, + "learning_rate": 8.932165103703198e-05, + "loss": 1.0503, + "step": 8551 + }, + { + "epoch": 0.5794430516972695, + "grad_norm": 6.291406631469727, + "learning_rate": 8.932028201793416e-05, + "loss": 0.7368, + "step": 8552 + }, + { + "epoch": 0.5795108069652416, + "grad_norm": 6.035033226013184, + "learning_rate": 8.931891299883634e-05, + "loss": 0.7683, + "step": 8553 + }, + { + "epoch": 0.5795785622332136, + "grad_norm": 4.60941743850708, + "learning_rate": 8.931754397973853e-05, + "loss": 0.8216, + "step": 8554 + }, + { + "epoch": 0.5796463175011857, + "grad_norm": 5.524953365325928, + "learning_rate": 8.931617496064071e-05, + "loss": 0.8958, + "step": 8555 + }, + { + "epoch": 0.5797140727691578, + "grad_norm": 5.544130802154541, + "learning_rate": 8.931480594154289e-05, + "loss": 0.7621, + "step": 8556 + }, + { + "epoch": 0.5797818280371299, + "grad_norm": 5.973321437835693, + "learning_rate": 8.931343692244507e-05, + "loss": 0.8805, + "step": 8557 + }, + { + "epoch": 0.579849583305102, + "grad_norm": 5.566673755645752, + "learning_rate": 8.931206790334725e-05, + "loss": 0.7309, + "step": 8558 + }, + { + "epoch": 0.579917338573074, + "grad_norm": 5.564174652099609, + "learning_rate": 8.931069888424945e-05, + "loss": 0.7333, + "step": 8559 + }, + { + "epoch": 0.5799850938410461, + "grad_norm": 5.655849933624268, + "learning_rate": 8.930932986515163e-05, + "loss": 0.7378, + "step": 8560 + }, + { + "epoch": 0.5800528491090182, + "grad_norm": 6.19892692565918, + "learning_rate": 8.930796084605381e-05, + "loss": 0.5912, + "step": 8561 + }, + { + "epoch": 0.5801206043769903, + "grad_norm": 6.6580095291137695, + "learning_rate": 8.930659182695599e-05, + "loss": 0.7054, + "step": 8562 + }, + { + "epoch": 0.5801883596449624, + "grad_norm": 5.171754837036133, + "learning_rate": 8.930522280785817e-05, + "loss": 0.7478, + "step": 8563 + }, + { + "epoch": 0.5802561149129345, + "grad_norm": 6.493400573730469, + "learning_rate": 8.930385378876036e-05, + "loss": 0.7751, + "step": 8564 + }, + { + "epoch": 0.5803238701809066, + "grad_norm": 7.6895833015441895, + "learning_rate": 8.930248476966254e-05, + "loss": 0.776, + "step": 8565 + }, + { + "epoch": 0.5803916254488787, + "grad_norm": 6.17990255355835, + "learning_rate": 8.930111575056472e-05, + "loss": 0.6127, + "step": 8566 + }, + { + "epoch": 0.5804593807168508, + "grad_norm": 7.613852024078369, + "learning_rate": 8.92997467314669e-05, + "loss": 0.901, + "step": 8567 + }, + { + "epoch": 0.5805271359848229, + "grad_norm": 6.891161918640137, + "learning_rate": 8.92983777123691e-05, + "loss": 0.8535, + "step": 8568 + }, + { + "epoch": 0.580594891252795, + "grad_norm": 8.584257125854492, + "learning_rate": 8.929700869327128e-05, + "loss": 0.8043, + "step": 8569 + }, + { + "epoch": 0.580662646520767, + "grad_norm": 6.883901119232178, + "learning_rate": 8.929563967417346e-05, + "loss": 1.0738, + "step": 8570 + }, + { + "epoch": 0.580730401788739, + "grad_norm": 6.844675064086914, + "learning_rate": 8.929427065507564e-05, + "loss": 0.868, + "step": 8571 + }, + { + "epoch": 0.5807981570567111, + "grad_norm": 8.33270263671875, + "learning_rate": 8.929290163597782e-05, + "loss": 1.2373, + "step": 8572 + }, + { + "epoch": 0.5808659123246832, + "grad_norm": 7.0670037269592285, + "learning_rate": 8.929153261688001e-05, + "loss": 0.875, + "step": 8573 + }, + { + "epoch": 0.5809336675926553, + "grad_norm": 6.652496337890625, + "learning_rate": 8.929016359778219e-05, + "loss": 0.9693, + "step": 8574 + }, + { + "epoch": 0.5810014228606274, + "grad_norm": 6.9150390625, + "learning_rate": 8.928879457868437e-05, + "loss": 1.0438, + "step": 8575 + }, + { + "epoch": 0.5810691781285995, + "grad_norm": 7.5465192794799805, + "learning_rate": 8.928742555958655e-05, + "loss": 0.7079, + "step": 8576 + }, + { + "epoch": 0.5811369333965716, + "grad_norm": 5.88304328918457, + "learning_rate": 8.928605654048875e-05, + "loss": 0.8251, + "step": 8577 + }, + { + "epoch": 0.5812046886645437, + "grad_norm": 6.625080108642578, + "learning_rate": 8.928468752139093e-05, + "loss": 0.833, + "step": 8578 + }, + { + "epoch": 0.5812724439325158, + "grad_norm": 5.93492317199707, + "learning_rate": 8.928331850229311e-05, + "loss": 0.8211, + "step": 8579 + }, + { + "epoch": 0.5813401992004879, + "grad_norm": 5.411419868469238, + "learning_rate": 8.928194948319529e-05, + "loss": 0.8365, + "step": 8580 + }, + { + "epoch": 0.5814079544684599, + "grad_norm": 6.126975059509277, + "learning_rate": 8.928058046409747e-05, + "loss": 0.8646, + "step": 8581 + }, + { + "epoch": 0.581475709736432, + "grad_norm": 6.076915264129639, + "learning_rate": 8.927921144499966e-05, + "loss": 0.8304, + "step": 8582 + }, + { + "epoch": 0.5815434650044041, + "grad_norm": 7.348392009735107, + "learning_rate": 8.927784242590184e-05, + "loss": 0.8467, + "step": 8583 + }, + { + "epoch": 0.5816112202723762, + "grad_norm": 5.092489242553711, + "learning_rate": 8.927647340680402e-05, + "loss": 0.7371, + "step": 8584 + }, + { + "epoch": 0.5816789755403483, + "grad_norm": 5.680901527404785, + "learning_rate": 8.92751043877062e-05, + "loss": 0.7388, + "step": 8585 + }, + { + "epoch": 0.5817467308083204, + "grad_norm": 7.501118183135986, + "learning_rate": 8.92737353686084e-05, + "loss": 0.6264, + "step": 8586 + }, + { + "epoch": 0.5818144860762924, + "grad_norm": 6.8912529945373535, + "learning_rate": 8.927236634951058e-05, + "loss": 0.8985, + "step": 8587 + }, + { + "epoch": 0.5818822413442645, + "grad_norm": 6.703160285949707, + "learning_rate": 8.927099733041276e-05, + "loss": 1.1611, + "step": 8588 + }, + { + "epoch": 0.5819499966122366, + "grad_norm": 5.655977725982666, + "learning_rate": 8.926962831131495e-05, + "loss": 0.7641, + "step": 8589 + }, + { + "epoch": 0.5820177518802087, + "grad_norm": 7.246835708618164, + "learning_rate": 8.926825929221713e-05, + "loss": 0.97, + "step": 8590 + }, + { + "epoch": 0.5820855071481807, + "grad_norm": 7.098568916320801, + "learning_rate": 8.926689027311931e-05, + "loss": 0.9531, + "step": 8591 + }, + { + "epoch": 0.5821532624161528, + "grad_norm": 5.854083061218262, + "learning_rate": 8.92655212540215e-05, + "loss": 0.5604, + "step": 8592 + }, + { + "epoch": 0.5822210176841249, + "grad_norm": 9.047096252441406, + "learning_rate": 8.926415223492369e-05, + "loss": 0.9453, + "step": 8593 + }, + { + "epoch": 0.582288772952097, + "grad_norm": 6.015660762786865, + "learning_rate": 8.926278321582587e-05, + "loss": 0.8029, + "step": 8594 + }, + { + "epoch": 0.5823565282200691, + "grad_norm": 5.913661479949951, + "learning_rate": 8.926141419672805e-05, + "loss": 0.6898, + "step": 8595 + }, + { + "epoch": 0.5824242834880412, + "grad_norm": 5.473297595977783, + "learning_rate": 8.926004517763024e-05, + "loss": 0.9829, + "step": 8596 + }, + { + "epoch": 0.5824920387560133, + "grad_norm": 5.53269624710083, + "learning_rate": 8.925867615853242e-05, + "loss": 0.7763, + "step": 8597 + }, + { + "epoch": 0.5825597940239854, + "grad_norm": 6.618366241455078, + "learning_rate": 8.92573071394346e-05, + "loss": 1.1884, + "step": 8598 + }, + { + "epoch": 0.5826275492919575, + "grad_norm": 6.628344535827637, + "learning_rate": 8.925593812033678e-05, + "loss": 0.7971, + "step": 8599 + }, + { + "epoch": 0.5826953045599296, + "grad_norm": 6.690967082977295, + "learning_rate": 8.925456910123898e-05, + "loss": 0.837, + "step": 8600 + }, + { + "epoch": 0.5827630598279017, + "grad_norm": 6.788214683532715, + "learning_rate": 8.925320008214116e-05, + "loss": 0.6239, + "step": 8601 + }, + { + "epoch": 0.5828308150958738, + "grad_norm": 6.47343111038208, + "learning_rate": 8.925183106304334e-05, + "loss": 0.8656, + "step": 8602 + }, + { + "epoch": 0.5828985703638457, + "grad_norm": 7.423914909362793, + "learning_rate": 8.925046204394552e-05, + "loss": 0.9741, + "step": 8603 + }, + { + "epoch": 0.5829663256318178, + "grad_norm": 6.18986701965332, + "learning_rate": 8.92490930248477e-05, + "loss": 0.8261, + "step": 8604 + }, + { + "epoch": 0.5830340808997899, + "grad_norm": 6.904397010803223, + "learning_rate": 8.924772400574989e-05, + "loss": 0.6875, + "step": 8605 + }, + { + "epoch": 0.583101836167762, + "grad_norm": 6.054782867431641, + "learning_rate": 8.924635498665207e-05, + "loss": 0.7726, + "step": 8606 + }, + { + "epoch": 0.5831695914357341, + "grad_norm": 7.563791751861572, + "learning_rate": 8.924498596755425e-05, + "loss": 1.038, + "step": 8607 + }, + { + "epoch": 0.5832373467037062, + "grad_norm": 7.547990322113037, + "learning_rate": 8.924361694845643e-05, + "loss": 0.8108, + "step": 8608 + }, + { + "epoch": 0.5833051019716783, + "grad_norm": 6.109129905700684, + "learning_rate": 8.924224792935861e-05, + "loss": 0.9512, + "step": 8609 + }, + { + "epoch": 0.5833728572396504, + "grad_norm": 6.7443528175354, + "learning_rate": 8.924087891026081e-05, + "loss": 1.0178, + "step": 8610 + }, + { + "epoch": 0.5834406125076225, + "grad_norm": 4.522619247436523, + "learning_rate": 8.923950989116299e-05, + "loss": 0.7338, + "step": 8611 + }, + { + "epoch": 0.5835083677755946, + "grad_norm": 6.25988245010376, + "learning_rate": 8.923814087206517e-05, + "loss": 0.9412, + "step": 8612 + }, + { + "epoch": 0.5835761230435667, + "grad_norm": 8.428157806396484, + "learning_rate": 8.923677185296735e-05, + "loss": 0.9129, + "step": 8613 + }, + { + "epoch": 0.5836438783115387, + "grad_norm": 6.854711055755615, + "learning_rate": 8.923540283386954e-05, + "loss": 0.9614, + "step": 8614 + }, + { + "epoch": 0.5837116335795108, + "grad_norm": 5.909132957458496, + "learning_rate": 8.923403381477172e-05, + "loss": 1.0066, + "step": 8615 + }, + { + "epoch": 0.5837793888474829, + "grad_norm": 6.080239295959473, + "learning_rate": 8.92326647956739e-05, + "loss": 0.8778, + "step": 8616 + }, + { + "epoch": 0.583847144115455, + "grad_norm": 5.880582809448242, + "learning_rate": 8.923129577657608e-05, + "loss": 0.674, + "step": 8617 + }, + { + "epoch": 0.5839148993834271, + "grad_norm": 5.32850980758667, + "learning_rate": 8.922992675747826e-05, + "loss": 0.7366, + "step": 8618 + }, + { + "epoch": 0.5839826546513991, + "grad_norm": 5.818620204925537, + "learning_rate": 8.922855773838046e-05, + "loss": 0.8723, + "step": 8619 + }, + { + "epoch": 0.5840504099193712, + "grad_norm": 7.105060577392578, + "learning_rate": 8.922718871928264e-05, + "loss": 0.9449, + "step": 8620 + }, + { + "epoch": 0.5841181651873433, + "grad_norm": 7.047363758087158, + "learning_rate": 8.922581970018482e-05, + "loss": 1.0505, + "step": 8621 + }, + { + "epoch": 0.5841859204553154, + "grad_norm": 7.657389163970947, + "learning_rate": 8.9224450681087e-05, + "loss": 1.0046, + "step": 8622 + }, + { + "epoch": 0.5842536757232875, + "grad_norm": 6.699533462524414, + "learning_rate": 8.922308166198919e-05, + "loss": 0.9382, + "step": 8623 + }, + { + "epoch": 0.5843214309912595, + "grad_norm": 7.720858573913574, + "learning_rate": 8.922171264289137e-05, + "loss": 0.732, + "step": 8624 + }, + { + "epoch": 0.5843891862592316, + "grad_norm": 5.816531658172607, + "learning_rate": 8.922034362379355e-05, + "loss": 0.7825, + "step": 8625 + }, + { + "epoch": 0.5844569415272037, + "grad_norm": 5.647473335266113, + "learning_rate": 8.921897460469573e-05, + "loss": 0.8234, + "step": 8626 + }, + { + "epoch": 0.5845246967951758, + "grad_norm": 6.133992671966553, + "learning_rate": 8.921760558559791e-05, + "loss": 0.6883, + "step": 8627 + }, + { + "epoch": 0.5845924520631479, + "grad_norm": 6.076829433441162, + "learning_rate": 8.921623656650011e-05, + "loss": 0.6453, + "step": 8628 + }, + { + "epoch": 0.58466020733112, + "grad_norm": 6.3974385261535645, + "learning_rate": 8.921486754740229e-05, + "loss": 0.7473, + "step": 8629 + }, + { + "epoch": 0.5847279625990921, + "grad_norm": 8.298005104064941, + "learning_rate": 8.921349852830447e-05, + "loss": 0.563, + "step": 8630 + }, + { + "epoch": 0.5847957178670642, + "grad_norm": 6.789668083190918, + "learning_rate": 8.921212950920665e-05, + "loss": 0.8418, + "step": 8631 + }, + { + "epoch": 0.5848634731350363, + "grad_norm": 5.68512487411499, + "learning_rate": 8.921076049010884e-05, + "loss": 0.7862, + "step": 8632 + }, + { + "epoch": 0.5849312284030084, + "grad_norm": 7.587048053741455, + "learning_rate": 8.920939147101102e-05, + "loss": 0.6798, + "step": 8633 + }, + { + "epoch": 0.5849989836709805, + "grad_norm": 6.540048122406006, + "learning_rate": 8.92080224519132e-05, + "loss": 0.9243, + "step": 8634 + }, + { + "epoch": 0.5850667389389526, + "grad_norm": 6.688183784484863, + "learning_rate": 8.92066534328154e-05, + "loss": 0.7537, + "step": 8635 + }, + { + "epoch": 0.5851344942069245, + "grad_norm": 6.010653972625732, + "learning_rate": 8.920528441371758e-05, + "loss": 0.962, + "step": 8636 + }, + { + "epoch": 0.5852022494748966, + "grad_norm": 6.197324752807617, + "learning_rate": 8.920391539461976e-05, + "loss": 0.8345, + "step": 8637 + }, + { + "epoch": 0.5852700047428687, + "grad_norm": 6.193318843841553, + "learning_rate": 8.920254637552195e-05, + "loss": 0.77, + "step": 8638 + }, + { + "epoch": 0.5853377600108408, + "grad_norm": 7.584078788757324, + "learning_rate": 8.920117735642413e-05, + "loss": 0.9704, + "step": 8639 + }, + { + "epoch": 0.5854055152788129, + "grad_norm": 9.150726318359375, + "learning_rate": 8.919980833732631e-05, + "loss": 0.9343, + "step": 8640 + }, + { + "epoch": 0.585473270546785, + "grad_norm": 5.945910930633545, + "learning_rate": 8.919843931822849e-05, + "loss": 0.8009, + "step": 8641 + }, + { + "epoch": 0.5855410258147571, + "grad_norm": 6.68170690536499, + "learning_rate": 8.919707029913069e-05, + "loss": 0.7701, + "step": 8642 + }, + { + "epoch": 0.5856087810827292, + "grad_norm": 5.520997047424316, + "learning_rate": 8.919570128003287e-05, + "loss": 0.8759, + "step": 8643 + }, + { + "epoch": 0.5856765363507013, + "grad_norm": 6.285208702087402, + "learning_rate": 8.919433226093505e-05, + "loss": 0.9845, + "step": 8644 + }, + { + "epoch": 0.5857442916186734, + "grad_norm": 5.63783597946167, + "learning_rate": 8.919296324183723e-05, + "loss": 0.7603, + "step": 8645 + }, + { + "epoch": 0.5858120468866455, + "grad_norm": 6.416496753692627, + "learning_rate": 8.919159422273942e-05, + "loss": 0.6493, + "step": 8646 + }, + { + "epoch": 0.5858798021546175, + "grad_norm": 6.023496627807617, + "learning_rate": 8.91902252036416e-05, + "loss": 0.7045, + "step": 8647 + }, + { + "epoch": 0.5859475574225896, + "grad_norm": 6.045313835144043, + "learning_rate": 8.918885618454378e-05, + "loss": 0.8136, + "step": 8648 + }, + { + "epoch": 0.5860153126905617, + "grad_norm": 5.985274314880371, + "learning_rate": 8.918748716544596e-05, + "loss": 0.6254, + "step": 8649 + }, + { + "epoch": 0.5860830679585338, + "grad_norm": 6.4496965408325195, + "learning_rate": 8.918611814634814e-05, + "loss": 0.8043, + "step": 8650 + }, + { + "epoch": 0.5861508232265059, + "grad_norm": 6.74000883102417, + "learning_rate": 8.918474912725034e-05, + "loss": 0.7568, + "step": 8651 + }, + { + "epoch": 0.5862185784944779, + "grad_norm": 6.625565052032471, + "learning_rate": 8.918338010815252e-05, + "loss": 0.8329, + "step": 8652 + }, + { + "epoch": 0.58628633376245, + "grad_norm": 8.42115592956543, + "learning_rate": 8.91820110890547e-05, + "loss": 0.8818, + "step": 8653 + }, + { + "epoch": 0.5863540890304221, + "grad_norm": 5.852290153503418, + "learning_rate": 8.918064206995688e-05, + "loss": 0.7521, + "step": 8654 + }, + { + "epoch": 0.5864218442983942, + "grad_norm": 5.760341644287109, + "learning_rate": 8.917927305085907e-05, + "loss": 0.7142, + "step": 8655 + }, + { + "epoch": 0.5864895995663663, + "grad_norm": 6.094460964202881, + "learning_rate": 8.917790403176125e-05, + "loss": 0.7863, + "step": 8656 + }, + { + "epoch": 0.5865573548343384, + "grad_norm": 7.665438652038574, + "learning_rate": 8.917653501266343e-05, + "loss": 0.7998, + "step": 8657 + }, + { + "epoch": 0.5866251101023104, + "grad_norm": 8.002902030944824, + "learning_rate": 8.917516599356561e-05, + "loss": 0.6918, + "step": 8658 + }, + { + "epoch": 0.5866928653702825, + "grad_norm": 8.93215274810791, + "learning_rate": 8.917379697446779e-05, + "loss": 1.044, + "step": 8659 + }, + { + "epoch": 0.5867606206382546, + "grad_norm": 6.267491340637207, + "learning_rate": 8.917242795536999e-05, + "loss": 0.9676, + "step": 8660 + }, + { + "epoch": 0.5868283759062267, + "grad_norm": 8.059505462646484, + "learning_rate": 8.917105893627217e-05, + "loss": 0.6688, + "step": 8661 + }, + { + "epoch": 0.5868961311741988, + "grad_norm": 9.9888277053833, + "learning_rate": 8.916968991717435e-05, + "loss": 1.0911, + "step": 8662 + }, + { + "epoch": 0.5869638864421709, + "grad_norm": 5.3233232498168945, + "learning_rate": 8.916832089807653e-05, + "loss": 0.6354, + "step": 8663 + }, + { + "epoch": 0.587031641710143, + "grad_norm": 6.694457054138184, + "learning_rate": 8.916695187897871e-05, + "loss": 0.8184, + "step": 8664 + }, + { + "epoch": 0.5870993969781151, + "grad_norm": 6.7131547927856445, + "learning_rate": 8.91655828598809e-05, + "loss": 0.9447, + "step": 8665 + }, + { + "epoch": 0.5871671522460872, + "grad_norm": 6.680534362792969, + "learning_rate": 8.916421384078308e-05, + "loss": 0.7022, + "step": 8666 + }, + { + "epoch": 0.5872349075140593, + "grad_norm": 5.335456371307373, + "learning_rate": 8.916284482168526e-05, + "loss": 0.806, + "step": 8667 + }, + { + "epoch": 0.5873026627820312, + "grad_norm": 7.5574235916137695, + "learning_rate": 8.916147580258744e-05, + "loss": 0.7042, + "step": 8668 + }, + { + "epoch": 0.5873704180500033, + "grad_norm": 6.056331634521484, + "learning_rate": 8.916010678348964e-05, + "loss": 0.7419, + "step": 8669 + }, + { + "epoch": 0.5874381733179754, + "grad_norm": 6.732639312744141, + "learning_rate": 8.915873776439182e-05, + "loss": 0.7788, + "step": 8670 + }, + { + "epoch": 0.5875059285859475, + "grad_norm": 5.853601932525635, + "learning_rate": 8.9157368745294e-05, + "loss": 0.7423, + "step": 8671 + }, + { + "epoch": 0.5875736838539196, + "grad_norm": 5.647787570953369, + "learning_rate": 8.915599972619618e-05, + "loss": 0.6611, + "step": 8672 + }, + { + "epoch": 0.5876414391218917, + "grad_norm": 5.323878288269043, + "learning_rate": 8.915463070709836e-05, + "loss": 0.7538, + "step": 8673 + }, + { + "epoch": 0.5877091943898638, + "grad_norm": 7.562190055847168, + "learning_rate": 8.915326168800055e-05, + "loss": 0.865, + "step": 8674 + }, + { + "epoch": 0.5877769496578359, + "grad_norm": 5.60343599319458, + "learning_rate": 8.915189266890273e-05, + "loss": 0.8023, + "step": 8675 + }, + { + "epoch": 0.587844704925808, + "grad_norm": 6.189206123352051, + "learning_rate": 8.915052364980491e-05, + "loss": 0.8698, + "step": 8676 + }, + { + "epoch": 0.5879124601937801, + "grad_norm": 6.227395534515381, + "learning_rate": 8.914915463070709e-05, + "loss": 0.9052, + "step": 8677 + }, + { + "epoch": 0.5879802154617522, + "grad_norm": 5.004229545593262, + "learning_rate": 8.914778561160929e-05, + "loss": 0.6775, + "step": 8678 + }, + { + "epoch": 0.5880479707297243, + "grad_norm": 8.418085098266602, + "learning_rate": 8.914641659251147e-05, + "loss": 1.1209, + "step": 8679 + }, + { + "epoch": 0.5881157259976963, + "grad_norm": 7.0277204513549805, + "learning_rate": 8.914504757341365e-05, + "loss": 0.9533, + "step": 8680 + }, + { + "epoch": 0.5881834812656684, + "grad_norm": 6.450404644012451, + "learning_rate": 8.914367855431584e-05, + "loss": 0.9277, + "step": 8681 + }, + { + "epoch": 0.5882512365336405, + "grad_norm": 6.549577236175537, + "learning_rate": 8.914230953521802e-05, + "loss": 0.7418, + "step": 8682 + }, + { + "epoch": 0.5883189918016126, + "grad_norm": 7.030699729919434, + "learning_rate": 8.91409405161202e-05, + "loss": 0.8336, + "step": 8683 + }, + { + "epoch": 0.5883867470695847, + "grad_norm": 6.665583610534668, + "learning_rate": 8.91395714970224e-05, + "loss": 0.8458, + "step": 8684 + }, + { + "epoch": 0.5884545023375567, + "grad_norm": 7.172003269195557, + "learning_rate": 8.913820247792458e-05, + "loss": 0.9778, + "step": 8685 + }, + { + "epoch": 0.5885222576055288, + "grad_norm": 5.771976947784424, + "learning_rate": 8.913683345882676e-05, + "loss": 0.6404, + "step": 8686 + }, + { + "epoch": 0.5885900128735009, + "grad_norm": 6.679132461547852, + "learning_rate": 8.913546443972895e-05, + "loss": 0.8758, + "step": 8687 + }, + { + "epoch": 0.588657768141473, + "grad_norm": 6.341593265533447, + "learning_rate": 8.913409542063113e-05, + "loss": 0.9231, + "step": 8688 + }, + { + "epoch": 0.5887255234094451, + "grad_norm": 4.834532260894775, + "learning_rate": 8.913272640153331e-05, + "loss": 0.6709, + "step": 8689 + }, + { + "epoch": 0.5887932786774172, + "grad_norm": 6.240522861480713, + "learning_rate": 8.913135738243549e-05, + "loss": 0.923, + "step": 8690 + }, + { + "epoch": 0.5888610339453892, + "grad_norm": 6.935693264007568, + "learning_rate": 8.912998836333767e-05, + "loss": 0.7663, + "step": 8691 + }, + { + "epoch": 0.5889287892133613, + "grad_norm": 6.702020168304443, + "learning_rate": 8.912861934423987e-05, + "loss": 0.937, + "step": 8692 + }, + { + "epoch": 0.5889965444813334, + "grad_norm": 6.361667156219482, + "learning_rate": 8.912725032514205e-05, + "loss": 0.7574, + "step": 8693 + }, + { + "epoch": 0.5890642997493055, + "grad_norm": 7.599695682525635, + "learning_rate": 8.912588130604423e-05, + "loss": 0.8366, + "step": 8694 + }, + { + "epoch": 0.5891320550172776, + "grad_norm": 7.101802349090576, + "learning_rate": 8.912451228694641e-05, + "loss": 0.9409, + "step": 8695 + }, + { + "epoch": 0.5891998102852497, + "grad_norm": 7.674192905426025, + "learning_rate": 8.912314326784859e-05, + "loss": 1.0846, + "step": 8696 + }, + { + "epoch": 0.5892675655532218, + "grad_norm": 6.223476886749268, + "learning_rate": 8.912177424875078e-05, + "loss": 0.9597, + "step": 8697 + }, + { + "epoch": 0.5893353208211939, + "grad_norm": 5.847829341888428, + "learning_rate": 8.912040522965296e-05, + "loss": 0.8017, + "step": 8698 + }, + { + "epoch": 0.589403076089166, + "grad_norm": 6.459780216217041, + "learning_rate": 8.911903621055514e-05, + "loss": 0.877, + "step": 8699 + }, + { + "epoch": 0.5894708313571381, + "grad_norm": 5.9923481941223145, + "learning_rate": 8.911766719145732e-05, + "loss": 0.9193, + "step": 8700 + }, + { + "epoch": 0.58953858662511, + "grad_norm": 5.799968242645264, + "learning_rate": 8.911629817235952e-05, + "loss": 0.7449, + "step": 8701 + }, + { + "epoch": 0.5896063418930821, + "grad_norm": 6.3980393409729, + "learning_rate": 8.91149291532617e-05, + "loss": 0.9118, + "step": 8702 + }, + { + "epoch": 0.5896740971610542, + "grad_norm": 6.184665203094482, + "learning_rate": 8.911356013416388e-05, + "loss": 0.6857, + "step": 8703 + }, + { + "epoch": 0.5897418524290263, + "grad_norm": 7.494194030761719, + "learning_rate": 8.911219111506606e-05, + "loss": 1.0483, + "step": 8704 + }, + { + "epoch": 0.5898096076969984, + "grad_norm": 5.360753059387207, + "learning_rate": 8.911082209596824e-05, + "loss": 0.7064, + "step": 8705 + }, + { + "epoch": 0.5898773629649705, + "grad_norm": 6.393972873687744, + "learning_rate": 8.910945307687043e-05, + "loss": 0.9095, + "step": 8706 + }, + { + "epoch": 0.5899451182329426, + "grad_norm": 8.099264144897461, + "learning_rate": 8.910808405777261e-05, + "loss": 1.0004, + "step": 8707 + }, + { + "epoch": 0.5900128735009147, + "grad_norm": 5.654821872711182, + "learning_rate": 8.910671503867479e-05, + "loss": 0.753, + "step": 8708 + }, + { + "epoch": 0.5900806287688868, + "grad_norm": 7.046943187713623, + "learning_rate": 8.910534601957697e-05, + "loss": 1.012, + "step": 8709 + }, + { + "epoch": 0.5901483840368589, + "grad_norm": 6.534369945526123, + "learning_rate": 8.910397700047917e-05, + "loss": 0.8296, + "step": 8710 + }, + { + "epoch": 0.590216139304831, + "grad_norm": 5.612612724304199, + "learning_rate": 8.910260798138135e-05, + "loss": 0.7516, + "step": 8711 + }, + { + "epoch": 0.5902838945728031, + "grad_norm": 5.311154365539551, + "learning_rate": 8.910123896228353e-05, + "loss": 0.7594, + "step": 8712 + }, + { + "epoch": 0.5903516498407752, + "grad_norm": 6.404855251312256, + "learning_rate": 8.909986994318571e-05, + "loss": 0.7252, + "step": 8713 + }, + { + "epoch": 0.5904194051087472, + "grad_norm": 7.446944236755371, + "learning_rate": 8.909850092408789e-05, + "loss": 0.9048, + "step": 8714 + }, + { + "epoch": 0.5904871603767193, + "grad_norm": 5.755975246429443, + "learning_rate": 8.909713190499008e-05, + "loss": 0.6617, + "step": 8715 + }, + { + "epoch": 0.5905549156446914, + "grad_norm": 6.966080188751221, + "learning_rate": 8.909576288589226e-05, + "loss": 0.5794, + "step": 8716 + }, + { + "epoch": 0.5906226709126634, + "grad_norm": 6.13206672668457, + "learning_rate": 8.909439386679444e-05, + "loss": 0.8356, + "step": 8717 + }, + { + "epoch": 0.5906904261806355, + "grad_norm": 7.370462417602539, + "learning_rate": 8.909302484769662e-05, + "loss": 0.7215, + "step": 8718 + }, + { + "epoch": 0.5907581814486076, + "grad_norm": 7.727412223815918, + "learning_rate": 8.90916558285988e-05, + "loss": 0.7943, + "step": 8719 + }, + { + "epoch": 0.5908259367165797, + "grad_norm": 7.801811218261719, + "learning_rate": 8.9090286809501e-05, + "loss": 0.7986, + "step": 8720 + }, + { + "epoch": 0.5908936919845518, + "grad_norm": 8.468879699707031, + "learning_rate": 8.908891779040318e-05, + "loss": 0.9178, + "step": 8721 + }, + { + "epoch": 0.5909614472525239, + "grad_norm": 5.8401665687561035, + "learning_rate": 8.908754877130536e-05, + "loss": 0.9042, + "step": 8722 + }, + { + "epoch": 0.591029202520496, + "grad_norm": 6.750467777252197, + "learning_rate": 8.908617975220754e-05, + "loss": 0.7983, + "step": 8723 + }, + { + "epoch": 0.591096957788468, + "grad_norm": 5.489363193511963, + "learning_rate": 8.908481073310973e-05, + "loss": 0.6839, + "step": 8724 + }, + { + "epoch": 0.5911647130564401, + "grad_norm": 8.433201789855957, + "learning_rate": 8.908344171401191e-05, + "loss": 0.9413, + "step": 8725 + }, + { + "epoch": 0.5912324683244122, + "grad_norm": 6.133078575134277, + "learning_rate": 8.908207269491409e-05, + "loss": 1.0624, + "step": 8726 + }, + { + "epoch": 0.5913002235923843, + "grad_norm": 7.1547698974609375, + "learning_rate": 8.908070367581627e-05, + "loss": 0.7873, + "step": 8727 + }, + { + "epoch": 0.5913679788603564, + "grad_norm": 6.1702494621276855, + "learning_rate": 8.907933465671847e-05, + "loss": 0.737, + "step": 8728 + }, + { + "epoch": 0.5914357341283285, + "grad_norm": 8.64965534210205, + "learning_rate": 8.907796563762065e-05, + "loss": 0.7949, + "step": 8729 + }, + { + "epoch": 0.5915034893963006, + "grad_norm": 6.769810676574707, + "learning_rate": 8.907659661852283e-05, + "loss": 0.9665, + "step": 8730 + }, + { + "epoch": 0.5915712446642727, + "grad_norm": 6.564850807189941, + "learning_rate": 8.907522759942502e-05, + "loss": 0.658, + "step": 8731 + }, + { + "epoch": 0.5916389999322448, + "grad_norm": 5.231021404266357, + "learning_rate": 8.90738585803272e-05, + "loss": 0.9782, + "step": 8732 + }, + { + "epoch": 0.5917067552002169, + "grad_norm": 6.880924701690674, + "learning_rate": 8.907248956122938e-05, + "loss": 0.9236, + "step": 8733 + }, + { + "epoch": 0.5917745104681889, + "grad_norm": 7.0143585205078125, + "learning_rate": 8.907112054213158e-05, + "loss": 0.8532, + "step": 8734 + }, + { + "epoch": 0.5918422657361609, + "grad_norm": 6.451882839202881, + "learning_rate": 8.906975152303376e-05, + "loss": 0.7571, + "step": 8735 + }, + { + "epoch": 0.591910021004133, + "grad_norm": 7.905577182769775, + "learning_rate": 8.906838250393594e-05, + "loss": 0.7599, + "step": 8736 + }, + { + "epoch": 0.5919777762721051, + "grad_norm": 6.122454643249512, + "learning_rate": 8.906701348483812e-05, + "loss": 0.9153, + "step": 8737 + }, + { + "epoch": 0.5920455315400772, + "grad_norm": 7.811397552490234, + "learning_rate": 8.906564446574031e-05, + "loss": 0.906, + "step": 8738 + }, + { + "epoch": 0.5921132868080493, + "grad_norm": 5.695565223693848, + "learning_rate": 8.906427544664249e-05, + "loss": 0.7452, + "step": 8739 + }, + { + "epoch": 0.5921810420760214, + "grad_norm": 6.3696393966674805, + "learning_rate": 8.906290642754467e-05, + "loss": 0.8223, + "step": 8740 + }, + { + "epoch": 0.5922487973439935, + "grad_norm": 6.49605655670166, + "learning_rate": 8.906153740844685e-05, + "loss": 0.9092, + "step": 8741 + }, + { + "epoch": 0.5923165526119656, + "grad_norm": 5.290238380432129, + "learning_rate": 8.906016838934903e-05, + "loss": 0.7335, + "step": 8742 + }, + { + "epoch": 0.5923843078799377, + "grad_norm": 6.913309097290039, + "learning_rate": 8.905879937025123e-05, + "loss": 0.7756, + "step": 8743 + }, + { + "epoch": 0.5924520631479098, + "grad_norm": 5.938857555389404, + "learning_rate": 8.90574303511534e-05, + "loss": 0.758, + "step": 8744 + }, + { + "epoch": 0.5925198184158819, + "grad_norm": 7.276566982269287, + "learning_rate": 8.905606133205559e-05, + "loss": 0.8429, + "step": 8745 + }, + { + "epoch": 0.592587573683854, + "grad_norm": 7.6287522315979, + "learning_rate": 8.905469231295777e-05, + "loss": 1.0939, + "step": 8746 + }, + { + "epoch": 0.592655328951826, + "grad_norm": 7.293666362762451, + "learning_rate": 8.905332329385996e-05, + "loss": 0.8264, + "step": 8747 + }, + { + "epoch": 0.5927230842197981, + "grad_norm": 5.522965908050537, + "learning_rate": 8.905195427476214e-05, + "loss": 0.6578, + "step": 8748 + }, + { + "epoch": 0.5927908394877702, + "grad_norm": 6.8883466720581055, + "learning_rate": 8.905058525566432e-05, + "loss": 0.9412, + "step": 8749 + }, + { + "epoch": 0.5928585947557422, + "grad_norm": 6.682039737701416, + "learning_rate": 8.90492162365665e-05, + "loss": 0.8375, + "step": 8750 + }, + { + "epoch": 0.5929263500237143, + "grad_norm": 9.619691848754883, + "learning_rate": 8.904784721746868e-05, + "loss": 0.9461, + "step": 8751 + }, + { + "epoch": 0.5929941052916864, + "grad_norm": 6.880954265594482, + "learning_rate": 8.904647819837088e-05, + "loss": 0.895, + "step": 8752 + }, + { + "epoch": 0.5930618605596585, + "grad_norm": 5.3324761390686035, + "learning_rate": 8.904510917927306e-05, + "loss": 0.7613, + "step": 8753 + }, + { + "epoch": 0.5931296158276306, + "grad_norm": 7.377603054046631, + "learning_rate": 8.904374016017524e-05, + "loss": 0.8565, + "step": 8754 + }, + { + "epoch": 0.5931973710956027, + "grad_norm": 6.537837982177734, + "learning_rate": 8.904237114107742e-05, + "loss": 0.9888, + "step": 8755 + }, + { + "epoch": 0.5932651263635748, + "grad_norm": 6.159317493438721, + "learning_rate": 8.904100212197961e-05, + "loss": 1.0156, + "step": 8756 + }, + { + "epoch": 0.5933328816315468, + "grad_norm": 10.275593757629395, + "learning_rate": 8.903963310288179e-05, + "loss": 0.8343, + "step": 8757 + }, + { + "epoch": 0.5934006368995189, + "grad_norm": 6.211344242095947, + "learning_rate": 8.903826408378397e-05, + "loss": 0.8728, + "step": 8758 + }, + { + "epoch": 0.593468392167491, + "grad_norm": 8.628105163574219, + "learning_rate": 8.903689506468615e-05, + "loss": 0.8468, + "step": 8759 + }, + { + "epoch": 0.5935361474354631, + "grad_norm": 5.519963264465332, + "learning_rate": 8.903552604558833e-05, + "loss": 0.7188, + "step": 8760 + }, + { + "epoch": 0.5936039027034352, + "grad_norm": 6.6959991455078125, + "learning_rate": 8.903415702649053e-05, + "loss": 1.1405, + "step": 8761 + }, + { + "epoch": 0.5936716579714073, + "grad_norm": 5.843451499938965, + "learning_rate": 8.90327880073927e-05, + "loss": 0.7847, + "step": 8762 + }, + { + "epoch": 0.5937394132393794, + "grad_norm": 5.644291400909424, + "learning_rate": 8.903141898829489e-05, + "loss": 0.6311, + "step": 8763 + }, + { + "epoch": 0.5938071685073515, + "grad_norm": 6.7555952072143555, + "learning_rate": 8.903004996919707e-05, + "loss": 1.0387, + "step": 8764 + }, + { + "epoch": 0.5938749237753236, + "grad_norm": 6.006453990936279, + "learning_rate": 8.902868095009926e-05, + "loss": 0.6646, + "step": 8765 + }, + { + "epoch": 0.5939426790432956, + "grad_norm": 7.705087184906006, + "learning_rate": 8.902731193100144e-05, + "loss": 1.1749, + "step": 8766 + }, + { + "epoch": 0.5940104343112677, + "grad_norm": 6.157181739807129, + "learning_rate": 8.902594291190362e-05, + "loss": 0.979, + "step": 8767 + }, + { + "epoch": 0.5940781895792397, + "grad_norm": 6.080206394195557, + "learning_rate": 8.90245738928058e-05, + "loss": 0.9294, + "step": 8768 + }, + { + "epoch": 0.5941459448472118, + "grad_norm": 5.215951919555664, + "learning_rate": 8.902320487370798e-05, + "loss": 0.7154, + "step": 8769 + }, + { + "epoch": 0.5942137001151839, + "grad_norm": 6.383238792419434, + "learning_rate": 8.902183585461018e-05, + "loss": 0.9136, + "step": 8770 + }, + { + "epoch": 0.594281455383156, + "grad_norm": 5.297086238861084, + "learning_rate": 8.902046683551236e-05, + "loss": 0.7147, + "step": 8771 + }, + { + "epoch": 0.5943492106511281, + "grad_norm": 6.686932563781738, + "learning_rate": 8.901909781641454e-05, + "loss": 1.0018, + "step": 8772 + }, + { + "epoch": 0.5944169659191002, + "grad_norm": 5.988333702087402, + "learning_rate": 8.901772879731672e-05, + "loss": 0.7809, + "step": 8773 + }, + { + "epoch": 0.5944847211870723, + "grad_norm": 6.060636043548584, + "learning_rate": 8.901635977821891e-05, + "loss": 0.6055, + "step": 8774 + }, + { + "epoch": 0.5945524764550444, + "grad_norm": 6.24500036239624, + "learning_rate": 8.901499075912109e-05, + "loss": 0.8885, + "step": 8775 + }, + { + "epoch": 0.5946202317230165, + "grad_norm": 6.189664363861084, + "learning_rate": 8.901362174002327e-05, + "loss": 0.8793, + "step": 8776 + }, + { + "epoch": 0.5946879869909886, + "grad_norm": 5.860182762145996, + "learning_rate": 8.901225272092547e-05, + "loss": 0.9083, + "step": 8777 + }, + { + "epoch": 0.5947557422589607, + "grad_norm": 10.454379081726074, + "learning_rate": 8.901088370182765e-05, + "loss": 0.6481, + "step": 8778 + }, + { + "epoch": 0.5948234975269328, + "grad_norm": 6.585578918457031, + "learning_rate": 8.900951468272983e-05, + "loss": 1.0381, + "step": 8779 + }, + { + "epoch": 0.5948912527949048, + "grad_norm": 7.335190296173096, + "learning_rate": 8.900814566363202e-05, + "loss": 0.947, + "step": 8780 + }, + { + "epoch": 0.5949590080628769, + "grad_norm": 6.560307502746582, + "learning_rate": 8.90067766445342e-05, + "loss": 0.951, + "step": 8781 + }, + { + "epoch": 0.595026763330849, + "grad_norm": 5.926759243011475, + "learning_rate": 8.900540762543638e-05, + "loss": 0.9487, + "step": 8782 + }, + { + "epoch": 0.595094518598821, + "grad_norm": 5.300271034240723, + "learning_rate": 8.900403860633856e-05, + "loss": 0.7232, + "step": 8783 + }, + { + "epoch": 0.5951622738667931, + "grad_norm": 6.828729629516602, + "learning_rate": 8.900266958724076e-05, + "loss": 0.9124, + "step": 8784 + }, + { + "epoch": 0.5952300291347652, + "grad_norm": 5.51554012298584, + "learning_rate": 8.900130056814294e-05, + "loss": 0.7698, + "step": 8785 + }, + { + "epoch": 0.5952977844027373, + "grad_norm": 6.492809295654297, + "learning_rate": 8.899993154904512e-05, + "loss": 0.7388, + "step": 8786 + }, + { + "epoch": 0.5953655396707094, + "grad_norm": 6.4316792488098145, + "learning_rate": 8.89985625299473e-05, + "loss": 0.7843, + "step": 8787 + }, + { + "epoch": 0.5954332949386815, + "grad_norm": 5.711614608764648, + "learning_rate": 8.899719351084949e-05, + "loss": 0.8458, + "step": 8788 + }, + { + "epoch": 0.5955010502066536, + "grad_norm": 7.605274677276611, + "learning_rate": 8.899582449175167e-05, + "loss": 0.8907, + "step": 8789 + }, + { + "epoch": 0.5955688054746257, + "grad_norm": 6.734317302703857, + "learning_rate": 8.899445547265385e-05, + "loss": 0.8063, + "step": 8790 + }, + { + "epoch": 0.5956365607425977, + "grad_norm": 10.887665748596191, + "learning_rate": 8.899308645355603e-05, + "loss": 0.861, + "step": 8791 + }, + { + "epoch": 0.5957043160105698, + "grad_norm": 7.033245086669922, + "learning_rate": 8.899171743445821e-05, + "loss": 1.0345, + "step": 8792 + }, + { + "epoch": 0.5957720712785419, + "grad_norm": 5.743894577026367, + "learning_rate": 8.89903484153604e-05, + "loss": 0.7492, + "step": 8793 + }, + { + "epoch": 0.595839826546514, + "grad_norm": 5.664433479309082, + "learning_rate": 8.898897939626259e-05, + "loss": 0.9206, + "step": 8794 + }, + { + "epoch": 0.5959075818144861, + "grad_norm": 5.304537773132324, + "learning_rate": 8.898761037716477e-05, + "loss": 0.7669, + "step": 8795 + }, + { + "epoch": 0.5959753370824582, + "grad_norm": 6.449788570404053, + "learning_rate": 8.898624135806695e-05, + "loss": 0.9168, + "step": 8796 + }, + { + "epoch": 0.5960430923504303, + "grad_norm": 7.1110429763793945, + "learning_rate": 8.898487233896913e-05, + "loss": 0.8633, + "step": 8797 + }, + { + "epoch": 0.5961108476184024, + "grad_norm": 8.217866897583008, + "learning_rate": 8.898350331987132e-05, + "loss": 0.9472, + "step": 8798 + }, + { + "epoch": 0.5961786028863744, + "grad_norm": 6.968807697296143, + "learning_rate": 8.89821343007735e-05, + "loss": 0.7977, + "step": 8799 + }, + { + "epoch": 0.5962463581543465, + "grad_norm": 6.342806339263916, + "learning_rate": 8.898076528167568e-05, + "loss": 0.8461, + "step": 8800 + }, + { + "epoch": 0.5963141134223185, + "grad_norm": 6.918409824371338, + "learning_rate": 8.897939626257786e-05, + "loss": 0.9768, + "step": 8801 + }, + { + "epoch": 0.5963818686902906, + "grad_norm": 6.3519978523254395, + "learning_rate": 8.897802724348006e-05, + "loss": 0.794, + "step": 8802 + }, + { + "epoch": 0.5964496239582627, + "grad_norm": 9.268524169921875, + "learning_rate": 8.897665822438224e-05, + "loss": 1.0708, + "step": 8803 + }, + { + "epoch": 0.5965173792262348, + "grad_norm": 6.6414008140563965, + "learning_rate": 8.897528920528442e-05, + "loss": 0.9317, + "step": 8804 + }, + { + "epoch": 0.5965851344942069, + "grad_norm": 5.683966159820557, + "learning_rate": 8.89739201861866e-05, + "loss": 0.8018, + "step": 8805 + }, + { + "epoch": 0.596652889762179, + "grad_norm": 8.505208015441895, + "learning_rate": 8.897255116708878e-05, + "loss": 0.8207, + "step": 8806 + }, + { + "epoch": 0.5967206450301511, + "grad_norm": 6.509139060974121, + "learning_rate": 8.897118214799097e-05, + "loss": 0.8822, + "step": 8807 + }, + { + "epoch": 0.5967884002981232, + "grad_norm": 4.886582851409912, + "learning_rate": 8.896981312889315e-05, + "loss": 1.1176, + "step": 8808 + }, + { + "epoch": 0.5968561555660953, + "grad_norm": 6.053840637207031, + "learning_rate": 8.896844410979533e-05, + "loss": 0.6305, + "step": 8809 + }, + { + "epoch": 0.5969239108340674, + "grad_norm": 6.561148166656494, + "learning_rate": 8.896707509069751e-05, + "loss": 1.0339, + "step": 8810 + }, + { + "epoch": 0.5969916661020395, + "grad_norm": 7.02574348449707, + "learning_rate": 8.89657060715997e-05, + "loss": 0.8793, + "step": 8811 + }, + { + "epoch": 0.5970594213700116, + "grad_norm": 6.380439758300781, + "learning_rate": 8.896433705250189e-05, + "loss": 0.9474, + "step": 8812 + }, + { + "epoch": 0.5971271766379836, + "grad_norm": 6.902020454406738, + "learning_rate": 8.896296803340407e-05, + "loss": 0.8541, + "step": 8813 + }, + { + "epoch": 0.5971949319059557, + "grad_norm": 5.170351505279541, + "learning_rate": 8.896159901430625e-05, + "loss": 0.6224, + "step": 8814 + }, + { + "epoch": 0.5972626871739277, + "grad_norm": 6.399029731750488, + "learning_rate": 8.896022999520843e-05, + "loss": 0.7733, + "step": 8815 + }, + { + "epoch": 0.5973304424418998, + "grad_norm": 7.200798988342285, + "learning_rate": 8.895886097611062e-05, + "loss": 0.9942, + "step": 8816 + }, + { + "epoch": 0.5973981977098719, + "grad_norm": 5.057744979858398, + "learning_rate": 8.89574919570128e-05, + "loss": 0.8113, + "step": 8817 + }, + { + "epoch": 0.597465952977844, + "grad_norm": 6.267950534820557, + "learning_rate": 8.895612293791498e-05, + "loss": 1.1343, + "step": 8818 + }, + { + "epoch": 0.5975337082458161, + "grad_norm": 5.712194919586182, + "learning_rate": 8.895475391881716e-05, + "loss": 0.6812, + "step": 8819 + }, + { + "epoch": 0.5976014635137882, + "grad_norm": 6.567288875579834, + "learning_rate": 8.895338489971936e-05, + "loss": 0.8451, + "step": 8820 + }, + { + "epoch": 0.5976692187817603, + "grad_norm": 6.08546781539917, + "learning_rate": 8.895201588062154e-05, + "loss": 0.7353, + "step": 8821 + }, + { + "epoch": 0.5977369740497324, + "grad_norm": 6.859540939331055, + "learning_rate": 8.895064686152372e-05, + "loss": 0.6795, + "step": 8822 + }, + { + "epoch": 0.5978047293177045, + "grad_norm": 5.679804801940918, + "learning_rate": 8.894927784242591e-05, + "loss": 0.7286, + "step": 8823 + }, + { + "epoch": 0.5978724845856765, + "grad_norm": 7.603654384613037, + "learning_rate": 8.894790882332809e-05, + "loss": 0.87, + "step": 8824 + }, + { + "epoch": 0.5979402398536486, + "grad_norm": 6.440685272216797, + "learning_rate": 8.894653980423027e-05, + "loss": 1.043, + "step": 8825 + }, + { + "epoch": 0.5980079951216207, + "grad_norm": 6.420576572418213, + "learning_rate": 8.894517078513247e-05, + "loss": 0.6191, + "step": 8826 + }, + { + "epoch": 0.5980757503895928, + "grad_norm": 6.01546573638916, + "learning_rate": 8.894380176603465e-05, + "loss": 0.9581, + "step": 8827 + }, + { + "epoch": 0.5981435056575649, + "grad_norm": 6.9343767166137695, + "learning_rate": 8.894243274693683e-05, + "loss": 0.9531, + "step": 8828 + }, + { + "epoch": 0.598211260925537, + "grad_norm": 6.511411666870117, + "learning_rate": 8.8941063727839e-05, + "loss": 0.6982, + "step": 8829 + }, + { + "epoch": 0.5982790161935091, + "grad_norm": 5.96348762512207, + "learning_rate": 8.89396947087412e-05, + "loss": 0.6761, + "step": 8830 + }, + { + "epoch": 0.5983467714614811, + "grad_norm": 5.571112155914307, + "learning_rate": 8.893832568964338e-05, + "loss": 0.7856, + "step": 8831 + }, + { + "epoch": 0.5984145267294532, + "grad_norm": 7.437000751495361, + "learning_rate": 8.893695667054556e-05, + "loss": 0.8045, + "step": 8832 + }, + { + "epoch": 0.5984822819974253, + "grad_norm": 6.710272789001465, + "learning_rate": 8.893558765144774e-05, + "loss": 0.725, + "step": 8833 + }, + { + "epoch": 0.5985500372653973, + "grad_norm": 8.211858749389648, + "learning_rate": 8.893421863234994e-05, + "loss": 1.0648, + "step": 8834 + }, + { + "epoch": 0.5986177925333694, + "grad_norm": 6.0621018409729, + "learning_rate": 8.893284961325212e-05, + "loss": 0.8461, + "step": 8835 + }, + { + "epoch": 0.5986855478013415, + "grad_norm": 6.834799766540527, + "learning_rate": 8.89314805941543e-05, + "loss": 0.9162, + "step": 8836 + }, + { + "epoch": 0.5987533030693136, + "grad_norm": 7.666240215301514, + "learning_rate": 8.893011157505648e-05, + "loss": 0.8657, + "step": 8837 + }, + { + "epoch": 0.5988210583372857, + "grad_norm": 5.868284225463867, + "learning_rate": 8.892874255595866e-05, + "loss": 0.78, + "step": 8838 + }, + { + "epoch": 0.5988888136052578, + "grad_norm": 8.84867000579834, + "learning_rate": 8.892737353686085e-05, + "loss": 1.0317, + "step": 8839 + }, + { + "epoch": 0.5989565688732299, + "grad_norm": 7.292559623718262, + "learning_rate": 8.892600451776303e-05, + "loss": 0.9336, + "step": 8840 + }, + { + "epoch": 0.599024324141202, + "grad_norm": 8.978638648986816, + "learning_rate": 8.892463549866521e-05, + "loss": 0.7901, + "step": 8841 + }, + { + "epoch": 0.5990920794091741, + "grad_norm": 6.713160037994385, + "learning_rate": 8.892326647956739e-05, + "loss": 0.9241, + "step": 8842 + }, + { + "epoch": 0.5991598346771462, + "grad_norm": 7.525753021240234, + "learning_rate": 8.892189746046959e-05, + "loss": 1.0159, + "step": 8843 + }, + { + "epoch": 0.5992275899451183, + "grad_norm": 5.174670696258545, + "learning_rate": 8.892052844137177e-05, + "loss": 0.6338, + "step": 8844 + }, + { + "epoch": 0.5992953452130904, + "grad_norm": 6.392926216125488, + "learning_rate": 8.891915942227395e-05, + "loss": 0.9449, + "step": 8845 + }, + { + "epoch": 0.5993631004810624, + "grad_norm": 6.458075523376465, + "learning_rate": 8.891779040317613e-05, + "loss": 0.8109, + "step": 8846 + }, + { + "epoch": 0.5994308557490345, + "grad_norm": 6.092816352844238, + "learning_rate": 8.89164213840783e-05, + "loss": 0.6486, + "step": 8847 + }, + { + "epoch": 0.5994986110170065, + "grad_norm": 5.358661651611328, + "learning_rate": 8.89150523649805e-05, + "loss": 0.6597, + "step": 8848 + }, + { + "epoch": 0.5995663662849786, + "grad_norm": 7.168787956237793, + "learning_rate": 8.891368334588268e-05, + "loss": 0.9348, + "step": 8849 + }, + { + "epoch": 0.5996341215529507, + "grad_norm": 8.961309432983398, + "learning_rate": 8.891231432678486e-05, + "loss": 1.078, + "step": 8850 + }, + { + "epoch": 0.5997018768209228, + "grad_norm": 6.025523662567139, + "learning_rate": 8.891094530768704e-05, + "loss": 0.8771, + "step": 8851 + }, + { + "epoch": 0.5997696320888949, + "grad_norm": 6.479866981506348, + "learning_rate": 8.890957628858922e-05, + "loss": 0.6807, + "step": 8852 + }, + { + "epoch": 0.599837387356867, + "grad_norm": 7.786321640014648, + "learning_rate": 8.890820726949142e-05, + "loss": 1.0007, + "step": 8853 + }, + { + "epoch": 0.5999051426248391, + "grad_norm": 6.334638595581055, + "learning_rate": 8.89068382503936e-05, + "loss": 0.9056, + "step": 8854 + }, + { + "epoch": 0.5999728978928112, + "grad_norm": 6.959164619445801, + "learning_rate": 8.890546923129578e-05, + "loss": 0.9682, + "step": 8855 + }, + { + "epoch": 0.6000406531607833, + "grad_norm": 7.86105489730835, + "learning_rate": 8.890410021219796e-05, + "loss": 0.7696, + "step": 8856 + }, + { + "epoch": 0.6001084084287553, + "grad_norm": 9.848732948303223, + "learning_rate": 8.890273119310015e-05, + "loss": 0.8796, + "step": 8857 + }, + { + "epoch": 0.6001761636967274, + "grad_norm": 6.406124591827393, + "learning_rate": 8.890136217400233e-05, + "loss": 0.9671, + "step": 8858 + }, + { + "epoch": 0.6002439189646995, + "grad_norm": 6.441462993621826, + "learning_rate": 8.889999315490451e-05, + "loss": 0.851, + "step": 8859 + }, + { + "epoch": 0.6003116742326716, + "grad_norm": 7.711560249328613, + "learning_rate": 8.889862413580669e-05, + "loss": 1.0492, + "step": 8860 + }, + { + "epoch": 0.6003794295006437, + "grad_norm": 7.2431960105896, + "learning_rate": 8.889725511670887e-05, + "loss": 0.9887, + "step": 8861 + }, + { + "epoch": 0.6004471847686158, + "grad_norm": 7.861832618713379, + "learning_rate": 8.889588609761107e-05, + "loss": 1.0759, + "step": 8862 + }, + { + "epoch": 0.6005149400365879, + "grad_norm": 6.643199920654297, + "learning_rate": 8.889451707851325e-05, + "loss": 0.7989, + "step": 8863 + }, + { + "epoch": 0.6005826953045599, + "grad_norm": 5.903257369995117, + "learning_rate": 8.889314805941543e-05, + "loss": 0.6606, + "step": 8864 + }, + { + "epoch": 0.600650450572532, + "grad_norm": 6.016655445098877, + "learning_rate": 8.889177904031761e-05, + "loss": 0.6489, + "step": 8865 + }, + { + "epoch": 0.6007182058405041, + "grad_norm": 7.733530521392822, + "learning_rate": 8.88904100212198e-05, + "loss": 1.0653, + "step": 8866 + }, + { + "epoch": 0.6007859611084762, + "grad_norm": 6.951436996459961, + "learning_rate": 8.888904100212198e-05, + "loss": 0.821, + "step": 8867 + }, + { + "epoch": 0.6008537163764482, + "grad_norm": 5.853886604309082, + "learning_rate": 8.888767198302416e-05, + "loss": 0.7154, + "step": 8868 + }, + { + "epoch": 0.6009214716444203, + "grad_norm": 7.300787925720215, + "learning_rate": 8.888630296392636e-05, + "loss": 1.0176, + "step": 8869 + }, + { + "epoch": 0.6009892269123924, + "grad_norm": 6.771045684814453, + "learning_rate": 8.888493394482854e-05, + "loss": 0.7652, + "step": 8870 + }, + { + "epoch": 0.6010569821803645, + "grad_norm": 6.345951557159424, + "learning_rate": 8.888356492573072e-05, + "loss": 1.0231, + "step": 8871 + }, + { + "epoch": 0.6011247374483366, + "grad_norm": 6.479809284210205, + "learning_rate": 8.888219590663291e-05, + "loss": 0.7587, + "step": 8872 + }, + { + "epoch": 0.6011924927163087, + "grad_norm": 7.4716796875, + "learning_rate": 8.888082688753509e-05, + "loss": 0.8283, + "step": 8873 + }, + { + "epoch": 0.6012602479842808, + "grad_norm": 7.2003068923950195, + "learning_rate": 8.887945786843727e-05, + "loss": 0.8622, + "step": 8874 + }, + { + "epoch": 0.6013280032522529, + "grad_norm": 5.774078845977783, + "learning_rate": 8.887808884933945e-05, + "loss": 0.7783, + "step": 8875 + }, + { + "epoch": 0.601395758520225, + "grad_norm": 5.885690689086914, + "learning_rate": 8.887671983024164e-05, + "loss": 0.984, + "step": 8876 + }, + { + "epoch": 0.6014635137881971, + "grad_norm": 6.9094038009643555, + "learning_rate": 8.887535081114383e-05, + "loss": 0.8093, + "step": 8877 + }, + { + "epoch": 0.6015312690561692, + "grad_norm": 7.623313903808594, + "learning_rate": 8.8873981792046e-05, + "loss": 0.8604, + "step": 8878 + }, + { + "epoch": 0.6015990243241413, + "grad_norm": 4.915633201599121, + "learning_rate": 8.887261277294819e-05, + "loss": 0.6538, + "step": 8879 + }, + { + "epoch": 0.6016667795921132, + "grad_norm": 6.35188627243042, + "learning_rate": 8.887124375385038e-05, + "loss": 0.7546, + "step": 8880 + }, + { + "epoch": 0.6017345348600853, + "grad_norm": 6.262838840484619, + "learning_rate": 8.886987473475256e-05, + "loss": 0.7816, + "step": 8881 + }, + { + "epoch": 0.6018022901280574, + "grad_norm": 6.2654032707214355, + "learning_rate": 8.886850571565474e-05, + "loss": 1.0446, + "step": 8882 + }, + { + "epoch": 0.6018700453960295, + "grad_norm": 6.389410972595215, + "learning_rate": 8.886713669655692e-05, + "loss": 0.9104, + "step": 8883 + }, + { + "epoch": 0.6019378006640016, + "grad_norm": 6.192864894866943, + "learning_rate": 8.88657676774591e-05, + "loss": 0.8855, + "step": 8884 + }, + { + "epoch": 0.6020055559319737, + "grad_norm": 6.384714126586914, + "learning_rate": 8.88643986583613e-05, + "loss": 0.7535, + "step": 8885 + }, + { + "epoch": 0.6020733111999458, + "grad_norm": 6.433627605438232, + "learning_rate": 8.886302963926348e-05, + "loss": 0.6994, + "step": 8886 + }, + { + "epoch": 0.6021410664679179, + "grad_norm": 6.539730072021484, + "learning_rate": 8.886166062016566e-05, + "loss": 1.0176, + "step": 8887 + }, + { + "epoch": 0.60220882173589, + "grad_norm": 7.459704875946045, + "learning_rate": 8.886029160106784e-05, + "loss": 0.8599, + "step": 8888 + }, + { + "epoch": 0.6022765770038621, + "grad_norm": 7.512004375457764, + "learning_rate": 8.885892258197003e-05, + "loss": 0.7238, + "step": 8889 + }, + { + "epoch": 0.6023443322718341, + "grad_norm": 7.773438453674316, + "learning_rate": 8.885755356287221e-05, + "loss": 0.72, + "step": 8890 + }, + { + "epoch": 0.6024120875398062, + "grad_norm": 5.3314948081970215, + "learning_rate": 8.885618454377439e-05, + "loss": 0.9285, + "step": 8891 + }, + { + "epoch": 0.6024798428077783, + "grad_norm": 5.834819793701172, + "learning_rate": 8.885481552467657e-05, + "loss": 0.9032, + "step": 8892 + }, + { + "epoch": 0.6025475980757504, + "grad_norm": 6.342952251434326, + "learning_rate": 8.885344650557875e-05, + "loss": 0.8987, + "step": 8893 + }, + { + "epoch": 0.6026153533437225, + "grad_norm": 5.579010963439941, + "learning_rate": 8.885207748648095e-05, + "loss": 0.9275, + "step": 8894 + }, + { + "epoch": 0.6026831086116946, + "grad_norm": 5.604851722717285, + "learning_rate": 8.885070846738313e-05, + "loss": 0.8163, + "step": 8895 + }, + { + "epoch": 0.6027508638796667, + "grad_norm": 6.057446479797363, + "learning_rate": 8.88493394482853e-05, + "loss": 0.8466, + "step": 8896 + }, + { + "epoch": 0.6028186191476387, + "grad_norm": 8.311997413635254, + "learning_rate": 8.884797042918749e-05, + "loss": 0.8493, + "step": 8897 + }, + { + "epoch": 0.6028863744156108, + "grad_norm": 6.672399044036865, + "learning_rate": 8.884660141008968e-05, + "loss": 0.7638, + "step": 8898 + }, + { + "epoch": 0.6029541296835829, + "grad_norm": 7.031404495239258, + "learning_rate": 8.884523239099186e-05, + "loss": 0.7692, + "step": 8899 + }, + { + "epoch": 0.603021884951555, + "grad_norm": 6.296429634094238, + "learning_rate": 8.884386337189404e-05, + "loss": 0.9525, + "step": 8900 + }, + { + "epoch": 0.603089640219527, + "grad_norm": 6.698690891265869, + "learning_rate": 8.884249435279622e-05, + "loss": 1.0234, + "step": 8901 + }, + { + "epoch": 0.6031573954874991, + "grad_norm": 5.885977268218994, + "learning_rate": 8.88411253336984e-05, + "loss": 0.6533, + "step": 8902 + }, + { + "epoch": 0.6032251507554712, + "grad_norm": 7.2855072021484375, + "learning_rate": 8.88397563146006e-05, + "loss": 0.7969, + "step": 8903 + }, + { + "epoch": 0.6032929060234433, + "grad_norm": 5.964268207550049, + "learning_rate": 8.883838729550278e-05, + "loss": 0.9055, + "step": 8904 + }, + { + "epoch": 0.6033606612914154, + "grad_norm": 8.085535049438477, + "learning_rate": 8.883701827640496e-05, + "loss": 0.9988, + "step": 8905 + }, + { + "epoch": 0.6034284165593875, + "grad_norm": 6.891632080078125, + "learning_rate": 8.883564925730714e-05, + "loss": 1.0141, + "step": 8906 + }, + { + "epoch": 0.6034961718273596, + "grad_norm": 5.324792861938477, + "learning_rate": 8.883428023820932e-05, + "loss": 0.7212, + "step": 8907 + }, + { + "epoch": 0.6035639270953317, + "grad_norm": 5.735985279083252, + "learning_rate": 8.883291121911151e-05, + "loss": 0.9523, + "step": 8908 + }, + { + "epoch": 0.6036316823633038, + "grad_norm": 7.501296043395996, + "learning_rate": 8.883154220001369e-05, + "loss": 0.938, + "step": 8909 + }, + { + "epoch": 0.6036994376312759, + "grad_norm": 6.442415714263916, + "learning_rate": 8.883017318091587e-05, + "loss": 0.5799, + "step": 8910 + }, + { + "epoch": 0.603767192899248, + "grad_norm": 6.753159999847412, + "learning_rate": 8.882880416181805e-05, + "loss": 0.6011, + "step": 8911 + }, + { + "epoch": 0.60383494816722, + "grad_norm": 7.805209636688232, + "learning_rate": 8.882743514272025e-05, + "loss": 1.1798, + "step": 8912 + }, + { + "epoch": 0.603902703435192, + "grad_norm": 5.123414993286133, + "learning_rate": 8.882606612362243e-05, + "loss": 0.7891, + "step": 8913 + }, + { + "epoch": 0.6039704587031641, + "grad_norm": 6.106377601623535, + "learning_rate": 8.88246971045246e-05, + "loss": 0.8042, + "step": 8914 + }, + { + "epoch": 0.6040382139711362, + "grad_norm": 6.874667167663574, + "learning_rate": 8.88233280854268e-05, + "loss": 0.8167, + "step": 8915 + }, + { + "epoch": 0.6041059692391083, + "grad_norm": 5.306685447692871, + "learning_rate": 8.882195906632898e-05, + "loss": 0.6487, + "step": 8916 + }, + { + "epoch": 0.6041737245070804, + "grad_norm": 6.309480667114258, + "learning_rate": 8.882059004723116e-05, + "loss": 0.6866, + "step": 8917 + }, + { + "epoch": 0.6042414797750525, + "grad_norm": 7.042599201202393, + "learning_rate": 8.881922102813335e-05, + "loss": 0.8205, + "step": 8918 + }, + { + "epoch": 0.6043092350430246, + "grad_norm": 5.572051048278809, + "learning_rate": 8.881785200903554e-05, + "loss": 0.8188, + "step": 8919 + }, + { + "epoch": 0.6043769903109967, + "grad_norm": 5.881922245025635, + "learning_rate": 8.881648298993772e-05, + "loss": 0.8344, + "step": 8920 + }, + { + "epoch": 0.6044447455789688, + "grad_norm": 6.141275882720947, + "learning_rate": 8.881511397083991e-05, + "loss": 0.7084, + "step": 8921 + }, + { + "epoch": 0.6045125008469409, + "grad_norm": 6.847328186035156, + "learning_rate": 8.881374495174209e-05, + "loss": 0.9248, + "step": 8922 + }, + { + "epoch": 0.604580256114913, + "grad_norm": 8.392770767211914, + "learning_rate": 8.881237593264427e-05, + "loss": 1.0319, + "step": 8923 + }, + { + "epoch": 0.604648011382885, + "grad_norm": 6.491360664367676, + "learning_rate": 8.881100691354645e-05, + "loss": 0.8098, + "step": 8924 + }, + { + "epoch": 0.6047157666508571, + "grad_norm": 7.355408191680908, + "learning_rate": 8.880963789444863e-05, + "loss": 0.9513, + "step": 8925 + }, + { + "epoch": 0.6047835219188292, + "grad_norm": 5.667849063873291, + "learning_rate": 8.880826887535082e-05, + "loss": 0.6251, + "step": 8926 + }, + { + "epoch": 0.6048512771868013, + "grad_norm": 7.0746636390686035, + "learning_rate": 8.8806899856253e-05, + "loss": 0.9789, + "step": 8927 + }, + { + "epoch": 0.6049190324547734, + "grad_norm": 8.575697898864746, + "learning_rate": 8.880553083715519e-05, + "loss": 0.6239, + "step": 8928 + }, + { + "epoch": 0.6049867877227454, + "grad_norm": 5.967097759246826, + "learning_rate": 8.880416181805737e-05, + "loss": 0.7224, + "step": 8929 + }, + { + "epoch": 0.6050545429907175, + "grad_norm": 7.468807697296143, + "learning_rate": 8.880279279895955e-05, + "loss": 0.663, + "step": 8930 + }, + { + "epoch": 0.6051222982586896, + "grad_norm": 7.503567218780518, + "learning_rate": 8.880142377986174e-05, + "loss": 1.0067, + "step": 8931 + }, + { + "epoch": 0.6051900535266617, + "grad_norm": 5.722857475280762, + "learning_rate": 8.880005476076392e-05, + "loss": 0.6875, + "step": 8932 + }, + { + "epoch": 0.6052578087946338, + "grad_norm": 5.868055820465088, + "learning_rate": 8.87986857416661e-05, + "loss": 0.9231, + "step": 8933 + }, + { + "epoch": 0.6053255640626058, + "grad_norm": 6.367112636566162, + "learning_rate": 8.879731672256828e-05, + "loss": 0.7726, + "step": 8934 + }, + { + "epoch": 0.6053933193305779, + "grad_norm": 5.738692760467529, + "learning_rate": 8.879594770347047e-05, + "loss": 0.6841, + "step": 8935 + }, + { + "epoch": 0.60546107459855, + "grad_norm": 6.935656547546387, + "learning_rate": 8.879457868437266e-05, + "loss": 0.9394, + "step": 8936 + }, + { + "epoch": 0.6055288298665221, + "grad_norm": 5.599362373352051, + "learning_rate": 8.879320966527484e-05, + "loss": 0.741, + "step": 8937 + }, + { + "epoch": 0.6055965851344942, + "grad_norm": 6.967918395996094, + "learning_rate": 8.879184064617702e-05, + "loss": 0.7936, + "step": 8938 + }, + { + "epoch": 0.6056643404024663, + "grad_norm": 7.082143306732178, + "learning_rate": 8.87904716270792e-05, + "loss": 0.9448, + "step": 8939 + }, + { + "epoch": 0.6057320956704384, + "grad_norm": 7.133969783782959, + "learning_rate": 8.878910260798139e-05, + "loss": 1.0402, + "step": 8940 + }, + { + "epoch": 0.6057998509384105, + "grad_norm": 6.228809356689453, + "learning_rate": 8.878773358888357e-05, + "loss": 0.9436, + "step": 8941 + }, + { + "epoch": 0.6058676062063826, + "grad_norm": 7.93583345413208, + "learning_rate": 8.878636456978575e-05, + "loss": 0.8431, + "step": 8942 + }, + { + "epoch": 0.6059353614743547, + "grad_norm": 6.536636829376221, + "learning_rate": 8.878499555068793e-05, + "loss": 0.6499, + "step": 8943 + }, + { + "epoch": 0.6060031167423268, + "grad_norm": 6.2305145263671875, + "learning_rate": 8.878362653159012e-05, + "loss": 0.8509, + "step": 8944 + }, + { + "epoch": 0.6060708720102989, + "grad_norm": 6.194677829742432, + "learning_rate": 8.87822575124923e-05, + "loss": 0.8863, + "step": 8945 + }, + { + "epoch": 0.6061386272782708, + "grad_norm": 6.955784797668457, + "learning_rate": 8.878088849339449e-05, + "loss": 0.669, + "step": 8946 + }, + { + "epoch": 0.6062063825462429, + "grad_norm": 6.535221099853516, + "learning_rate": 8.877951947429667e-05, + "loss": 0.9381, + "step": 8947 + }, + { + "epoch": 0.606274137814215, + "grad_norm": 6.1568803787231445, + "learning_rate": 8.877815045519885e-05, + "loss": 0.8616, + "step": 8948 + }, + { + "epoch": 0.6063418930821871, + "grad_norm": 6.541617393493652, + "learning_rate": 8.877678143610104e-05, + "loss": 0.8262, + "step": 8949 + }, + { + "epoch": 0.6064096483501592, + "grad_norm": 6.519277572631836, + "learning_rate": 8.877541241700322e-05, + "loss": 0.6887, + "step": 8950 + }, + { + "epoch": 0.6064774036181313, + "grad_norm": 5.049574375152588, + "learning_rate": 8.87740433979054e-05, + "loss": 0.5653, + "step": 8951 + }, + { + "epoch": 0.6065451588861034, + "grad_norm": 5.1171698570251465, + "learning_rate": 8.877267437880758e-05, + "loss": 0.9257, + "step": 8952 + }, + { + "epoch": 0.6066129141540755, + "grad_norm": 5.691270351409912, + "learning_rate": 8.877130535970978e-05, + "loss": 0.7008, + "step": 8953 + }, + { + "epoch": 0.6066806694220476, + "grad_norm": 6.948482990264893, + "learning_rate": 8.876993634061196e-05, + "loss": 1.0571, + "step": 8954 + }, + { + "epoch": 0.6067484246900197, + "grad_norm": 7.013044357299805, + "learning_rate": 8.876856732151414e-05, + "loss": 0.901, + "step": 8955 + }, + { + "epoch": 0.6068161799579918, + "grad_norm": 6.465219974517822, + "learning_rate": 8.876719830241632e-05, + "loss": 0.8805, + "step": 8956 + }, + { + "epoch": 0.6068839352259638, + "grad_norm": 6.187651634216309, + "learning_rate": 8.87658292833185e-05, + "loss": 0.9569, + "step": 8957 + }, + { + "epoch": 0.6069516904939359, + "grad_norm": 7.414092063903809, + "learning_rate": 8.876446026422069e-05, + "loss": 0.8365, + "step": 8958 + }, + { + "epoch": 0.607019445761908, + "grad_norm": 6.181675910949707, + "learning_rate": 8.876309124512287e-05, + "loss": 0.8135, + "step": 8959 + }, + { + "epoch": 0.6070872010298801, + "grad_norm": 5.536195278167725, + "learning_rate": 8.876172222602505e-05, + "loss": 0.8226, + "step": 8960 + }, + { + "epoch": 0.6071549562978522, + "grad_norm": 6.445333480834961, + "learning_rate": 8.876035320692723e-05, + "loss": 0.851, + "step": 8961 + }, + { + "epoch": 0.6072227115658242, + "grad_norm": 7.526693820953369, + "learning_rate": 8.875898418782943e-05, + "loss": 0.806, + "step": 8962 + }, + { + "epoch": 0.6072904668337963, + "grad_norm": 6.761377334594727, + "learning_rate": 8.87576151687316e-05, + "loss": 0.7834, + "step": 8963 + }, + { + "epoch": 0.6073582221017684, + "grad_norm": 6.632135391235352, + "learning_rate": 8.875624614963379e-05, + "loss": 0.8597, + "step": 8964 + }, + { + "epoch": 0.6074259773697405, + "grad_norm": 6.839773178100586, + "learning_rate": 8.875487713053598e-05, + "loss": 0.7056, + "step": 8965 + }, + { + "epoch": 0.6074937326377126, + "grad_norm": 6.412302017211914, + "learning_rate": 8.875350811143816e-05, + "loss": 0.8321, + "step": 8966 + }, + { + "epoch": 0.6075614879056846, + "grad_norm": 10.579608917236328, + "learning_rate": 8.875213909234034e-05, + "loss": 1.0393, + "step": 8967 + }, + { + "epoch": 0.6076292431736567, + "grad_norm": 7.9641265869140625, + "learning_rate": 8.875077007324253e-05, + "loss": 0.8742, + "step": 8968 + }, + { + "epoch": 0.6076969984416288, + "grad_norm": 5.490036487579346, + "learning_rate": 8.874940105414471e-05, + "loss": 0.9735, + "step": 8969 + }, + { + "epoch": 0.6077647537096009, + "grad_norm": 6.595509052276611, + "learning_rate": 8.87480320350469e-05, + "loss": 0.9047, + "step": 8970 + }, + { + "epoch": 0.607832508977573, + "grad_norm": 5.147292137145996, + "learning_rate": 8.874666301594908e-05, + "loss": 0.6643, + "step": 8971 + }, + { + "epoch": 0.6079002642455451, + "grad_norm": 6.266574859619141, + "learning_rate": 8.874529399685127e-05, + "loss": 0.832, + "step": 8972 + }, + { + "epoch": 0.6079680195135172, + "grad_norm": 5.882715225219727, + "learning_rate": 8.874392497775345e-05, + "loss": 0.9908, + "step": 8973 + }, + { + "epoch": 0.6080357747814893, + "grad_norm": 7.928326606750488, + "learning_rate": 8.874255595865563e-05, + "loss": 1.0295, + "step": 8974 + }, + { + "epoch": 0.6081035300494614, + "grad_norm": 8.489121437072754, + "learning_rate": 8.874118693955781e-05, + "loss": 0.9872, + "step": 8975 + }, + { + "epoch": 0.6081712853174335, + "grad_norm": 5.946715354919434, + "learning_rate": 8.873981792046e-05, + "loss": 1.0244, + "step": 8976 + }, + { + "epoch": 0.6082390405854056, + "grad_norm": 6.594414234161377, + "learning_rate": 8.873844890136218e-05, + "loss": 0.9772, + "step": 8977 + }, + { + "epoch": 0.6083067958533775, + "grad_norm": 5.421601295471191, + "learning_rate": 8.873707988226436e-05, + "loss": 0.8735, + "step": 8978 + }, + { + "epoch": 0.6083745511213496, + "grad_norm": 6.623301029205322, + "learning_rate": 8.873571086316655e-05, + "loss": 0.9308, + "step": 8979 + }, + { + "epoch": 0.6084423063893217, + "grad_norm": 7.260282516479492, + "learning_rate": 8.873434184406873e-05, + "loss": 0.8352, + "step": 8980 + }, + { + "epoch": 0.6085100616572938, + "grad_norm": 5.427891254425049, + "learning_rate": 8.873297282497092e-05, + "loss": 0.5448, + "step": 8981 + }, + { + "epoch": 0.6085778169252659, + "grad_norm": 6.445272922515869, + "learning_rate": 8.87316038058731e-05, + "loss": 0.7464, + "step": 8982 + }, + { + "epoch": 0.608645572193238, + "grad_norm": 5.404473304748535, + "learning_rate": 8.873023478677528e-05, + "loss": 0.8883, + "step": 8983 + }, + { + "epoch": 0.6087133274612101, + "grad_norm": 5.924696445465088, + "learning_rate": 8.872886576767746e-05, + "loss": 0.9852, + "step": 8984 + }, + { + "epoch": 0.6087810827291822, + "grad_norm": 7.119851112365723, + "learning_rate": 8.872749674857964e-05, + "loss": 0.8267, + "step": 8985 + }, + { + "epoch": 0.6088488379971543, + "grad_norm": 6.306662559509277, + "learning_rate": 8.872612772948183e-05, + "loss": 0.8916, + "step": 8986 + }, + { + "epoch": 0.6089165932651264, + "grad_norm": 6.999206066131592, + "learning_rate": 8.872475871038402e-05, + "loss": 0.9852, + "step": 8987 + }, + { + "epoch": 0.6089843485330985, + "grad_norm": 7.93400239944458, + "learning_rate": 8.87233896912862e-05, + "loss": 0.7336, + "step": 8988 + }, + { + "epoch": 0.6090521038010706, + "grad_norm": 6.08065938949585, + "learning_rate": 8.872202067218838e-05, + "loss": 0.8714, + "step": 8989 + }, + { + "epoch": 0.6091198590690426, + "grad_norm": 8.639623641967773, + "learning_rate": 8.872065165309057e-05, + "loss": 0.7973, + "step": 8990 + }, + { + "epoch": 0.6091876143370147, + "grad_norm": 6.663565158843994, + "learning_rate": 8.871928263399275e-05, + "loss": 0.7789, + "step": 8991 + }, + { + "epoch": 0.6092553696049868, + "grad_norm": 5.1584153175354, + "learning_rate": 8.871791361489493e-05, + "loss": 0.7933, + "step": 8992 + }, + { + "epoch": 0.6093231248729589, + "grad_norm": 5.59836483001709, + "learning_rate": 8.871654459579711e-05, + "loss": 0.731, + "step": 8993 + }, + { + "epoch": 0.609390880140931, + "grad_norm": 6.6651482582092285, + "learning_rate": 8.871517557669929e-05, + "loss": 0.624, + "step": 8994 + }, + { + "epoch": 0.609458635408903, + "grad_norm": 5.002302169799805, + "learning_rate": 8.871380655760148e-05, + "loss": 0.6816, + "step": 8995 + }, + { + "epoch": 0.6095263906768751, + "grad_norm": 7.68015718460083, + "learning_rate": 8.871243753850367e-05, + "loss": 0.7353, + "step": 8996 + }, + { + "epoch": 0.6095941459448472, + "grad_norm": 6.140766143798828, + "learning_rate": 8.871106851940585e-05, + "loss": 0.7172, + "step": 8997 + }, + { + "epoch": 0.6096619012128193, + "grad_norm": 6.370824813842773, + "learning_rate": 8.870969950030803e-05, + "loss": 0.7664, + "step": 8998 + }, + { + "epoch": 0.6097296564807914, + "grad_norm": 6.050932884216309, + "learning_rate": 8.870833048121022e-05, + "loss": 0.8182, + "step": 8999 + }, + { + "epoch": 0.6097974117487635, + "grad_norm": 6.733234882354736, + "learning_rate": 8.87069614621124e-05, + "loss": 0.8736, + "step": 9000 + }, + { + "epoch": 0.6098651670167355, + "grad_norm": 5.3225417137146, + "learning_rate": 8.870559244301458e-05, + "loss": 0.737, + "step": 9001 + }, + { + "epoch": 0.6099329222847076, + "grad_norm": 5.601250171661377, + "learning_rate": 8.870422342391676e-05, + "loss": 0.7802, + "step": 9002 + }, + { + "epoch": 0.6100006775526797, + "grad_norm": 4.972486972808838, + "learning_rate": 8.870285440481894e-05, + "loss": 0.6247, + "step": 9003 + }, + { + "epoch": 0.6100684328206518, + "grad_norm": 6.177496433258057, + "learning_rate": 8.870148538572114e-05, + "loss": 0.8745, + "step": 9004 + }, + { + "epoch": 0.6101361880886239, + "grad_norm": 5.5521321296691895, + "learning_rate": 8.870011636662332e-05, + "loss": 0.7281, + "step": 9005 + }, + { + "epoch": 0.610203943356596, + "grad_norm": 5.7942705154418945, + "learning_rate": 8.86987473475255e-05, + "loss": 0.8116, + "step": 9006 + }, + { + "epoch": 0.6102716986245681, + "grad_norm": 6.355384349822998, + "learning_rate": 8.869737832842768e-05, + "loss": 0.9577, + "step": 9007 + }, + { + "epoch": 0.6103394538925402, + "grad_norm": 9.047319412231445, + "learning_rate": 8.869600930932987e-05, + "loss": 0.7913, + "step": 9008 + }, + { + "epoch": 0.6104072091605123, + "grad_norm": 6.6220855712890625, + "learning_rate": 8.869464029023205e-05, + "loss": 0.8176, + "step": 9009 + }, + { + "epoch": 0.6104749644284844, + "grad_norm": 7.567444324493408, + "learning_rate": 8.869327127113423e-05, + "loss": 0.9745, + "step": 9010 + }, + { + "epoch": 0.6105427196964563, + "grad_norm": 7.12336540222168, + "learning_rate": 8.869190225203642e-05, + "loss": 0.7816, + "step": 9011 + }, + { + "epoch": 0.6106104749644284, + "grad_norm": 5.867663860321045, + "learning_rate": 8.86905332329386e-05, + "loss": 0.8209, + "step": 9012 + }, + { + "epoch": 0.6106782302324005, + "grad_norm": 6.362299919128418, + "learning_rate": 8.868916421384079e-05, + "loss": 0.8131, + "step": 9013 + }, + { + "epoch": 0.6107459855003726, + "grad_norm": 6.171093463897705, + "learning_rate": 8.868779519474298e-05, + "loss": 0.783, + "step": 9014 + }, + { + "epoch": 0.6108137407683447, + "grad_norm": 5.57711124420166, + "learning_rate": 8.868642617564516e-05, + "loss": 0.8022, + "step": 9015 + }, + { + "epoch": 0.6108814960363168, + "grad_norm": 6.855584144592285, + "learning_rate": 8.868505715654734e-05, + "loss": 0.8162, + "step": 9016 + }, + { + "epoch": 0.6109492513042889, + "grad_norm": 5.897914886474609, + "learning_rate": 8.868368813744952e-05, + "loss": 0.7343, + "step": 9017 + }, + { + "epoch": 0.611017006572261, + "grad_norm": 5.253844261169434, + "learning_rate": 8.868231911835171e-05, + "loss": 0.7384, + "step": 9018 + }, + { + "epoch": 0.6110847618402331, + "grad_norm": 5.258492469787598, + "learning_rate": 8.86809500992539e-05, + "loss": 0.7803, + "step": 9019 + }, + { + "epoch": 0.6111525171082052, + "grad_norm": 5.407173156738281, + "learning_rate": 8.867958108015607e-05, + "loss": 0.7416, + "step": 9020 + }, + { + "epoch": 0.6112202723761773, + "grad_norm": 7.5746283531188965, + "learning_rate": 8.867821206105826e-05, + "loss": 0.8327, + "step": 9021 + }, + { + "epoch": 0.6112880276441494, + "grad_norm": 6.094844818115234, + "learning_rate": 8.867684304196045e-05, + "loss": 0.76, + "step": 9022 + }, + { + "epoch": 0.6113557829121214, + "grad_norm": 6.436967849731445, + "learning_rate": 8.867547402286263e-05, + "loss": 0.8966, + "step": 9023 + }, + { + "epoch": 0.6114235381800935, + "grad_norm": 4.939077377319336, + "learning_rate": 8.867410500376481e-05, + "loss": 0.8912, + "step": 9024 + }, + { + "epoch": 0.6114912934480656, + "grad_norm": 6.9223127365112305, + "learning_rate": 8.867273598466699e-05, + "loss": 0.9135, + "step": 9025 + }, + { + "epoch": 0.6115590487160377, + "grad_norm": 5.3313517570495605, + "learning_rate": 8.867136696556917e-05, + "loss": 0.7065, + "step": 9026 + }, + { + "epoch": 0.6116268039840097, + "grad_norm": 5.880809783935547, + "learning_rate": 8.866999794647136e-05, + "loss": 0.8402, + "step": 9027 + }, + { + "epoch": 0.6116945592519818, + "grad_norm": 5.272537708282471, + "learning_rate": 8.866862892737354e-05, + "loss": 0.6471, + "step": 9028 + }, + { + "epoch": 0.6117623145199539, + "grad_norm": 4.6608147621154785, + "learning_rate": 8.866725990827572e-05, + "loss": 0.663, + "step": 9029 + }, + { + "epoch": 0.611830069787926, + "grad_norm": 7.082324981689453, + "learning_rate": 8.86658908891779e-05, + "loss": 0.7523, + "step": 9030 + }, + { + "epoch": 0.6118978250558981, + "grad_norm": 5.356766223907471, + "learning_rate": 8.86645218700801e-05, + "loss": 0.8098, + "step": 9031 + }, + { + "epoch": 0.6119655803238702, + "grad_norm": 4.575824737548828, + "learning_rate": 8.866315285098228e-05, + "loss": 0.7311, + "step": 9032 + }, + { + "epoch": 0.6120333355918423, + "grad_norm": 6.3496012687683105, + "learning_rate": 8.866178383188446e-05, + "loss": 0.9489, + "step": 9033 + }, + { + "epoch": 0.6121010908598143, + "grad_norm": 5.932507038116455, + "learning_rate": 8.866041481278664e-05, + "loss": 0.7362, + "step": 9034 + }, + { + "epoch": 0.6121688461277864, + "grad_norm": 6.566854000091553, + "learning_rate": 8.865904579368882e-05, + "loss": 0.9414, + "step": 9035 + }, + { + "epoch": 0.6122366013957585, + "grad_norm": 6.274519920349121, + "learning_rate": 8.865767677459101e-05, + "loss": 0.9277, + "step": 9036 + }, + { + "epoch": 0.6123043566637306, + "grad_norm": 6.512722492218018, + "learning_rate": 8.86563077554932e-05, + "loss": 0.8095, + "step": 9037 + }, + { + "epoch": 0.6123721119317027, + "grad_norm": 5.9804558753967285, + "learning_rate": 8.865493873639538e-05, + "loss": 0.7803, + "step": 9038 + }, + { + "epoch": 0.6124398671996748, + "grad_norm": 7.561446189880371, + "learning_rate": 8.865356971729756e-05, + "loss": 0.8143, + "step": 9039 + }, + { + "epoch": 0.6125076224676469, + "grad_norm": 6.611248016357422, + "learning_rate": 8.865220069819974e-05, + "loss": 0.813, + "step": 9040 + }, + { + "epoch": 0.612575377735619, + "grad_norm": 6.7652764320373535, + "learning_rate": 8.865083167910193e-05, + "loss": 0.8242, + "step": 9041 + }, + { + "epoch": 0.6126431330035911, + "grad_norm": 4.964203834533691, + "learning_rate": 8.864946266000411e-05, + "loss": 0.6155, + "step": 9042 + }, + { + "epoch": 0.6127108882715631, + "grad_norm": 5.260312080383301, + "learning_rate": 8.864809364090629e-05, + "loss": 0.694, + "step": 9043 + }, + { + "epoch": 0.6127786435395352, + "grad_norm": 6.220287799835205, + "learning_rate": 8.864672462180847e-05, + "loss": 0.7783, + "step": 9044 + }, + { + "epoch": 0.6128463988075072, + "grad_norm": 6.5388078689575195, + "learning_rate": 8.864535560271066e-05, + "loss": 0.6654, + "step": 9045 + }, + { + "epoch": 0.6129141540754793, + "grad_norm": 6.570968151092529, + "learning_rate": 8.864398658361284e-05, + "loss": 1.15, + "step": 9046 + }, + { + "epoch": 0.6129819093434514, + "grad_norm": 5.285862445831299, + "learning_rate": 8.864261756451503e-05, + "loss": 0.5951, + "step": 9047 + }, + { + "epoch": 0.6130496646114235, + "grad_norm": 6.997344017028809, + "learning_rate": 8.86412485454172e-05, + "loss": 0.8675, + "step": 9048 + }, + { + "epoch": 0.6131174198793956, + "grad_norm": 7.796441555023193, + "learning_rate": 8.863987952631939e-05, + "loss": 0.7794, + "step": 9049 + }, + { + "epoch": 0.6131851751473677, + "grad_norm": 5.723931789398193, + "learning_rate": 8.863851050722158e-05, + "loss": 0.7727, + "step": 9050 + }, + { + "epoch": 0.6132529304153398, + "grad_norm": 6.294017791748047, + "learning_rate": 8.863714148812376e-05, + "loss": 0.9198, + "step": 9051 + }, + { + "epoch": 0.6133206856833119, + "grad_norm": 7.23032808303833, + "learning_rate": 8.863577246902594e-05, + "loss": 0.9892, + "step": 9052 + }, + { + "epoch": 0.613388440951284, + "grad_norm": 5.533211708068848, + "learning_rate": 8.863440344992812e-05, + "loss": 0.7416, + "step": 9053 + }, + { + "epoch": 0.6134561962192561, + "grad_norm": 5.181722164154053, + "learning_rate": 8.863303443083031e-05, + "loss": 0.7471, + "step": 9054 + }, + { + "epoch": 0.6135239514872282, + "grad_norm": 5.428781986236572, + "learning_rate": 8.86316654117325e-05, + "loss": 0.8347, + "step": 9055 + }, + { + "epoch": 0.6135917067552003, + "grad_norm": 7.403633117675781, + "learning_rate": 8.863029639263468e-05, + "loss": 0.9809, + "step": 9056 + }, + { + "epoch": 0.6136594620231723, + "grad_norm": 6.061591625213623, + "learning_rate": 8.862892737353687e-05, + "loss": 0.7752, + "step": 9057 + }, + { + "epoch": 0.6137272172911444, + "grad_norm": 6.5160393714904785, + "learning_rate": 8.862755835443905e-05, + "loss": 0.8982, + "step": 9058 + }, + { + "epoch": 0.6137949725591165, + "grad_norm": 6.234467506408691, + "learning_rate": 8.862618933534123e-05, + "loss": 0.7428, + "step": 9059 + }, + { + "epoch": 0.6138627278270885, + "grad_norm": 7.634365081787109, + "learning_rate": 8.862482031624342e-05, + "loss": 0.8113, + "step": 9060 + }, + { + "epoch": 0.6139304830950606, + "grad_norm": 6.873602867126465, + "learning_rate": 8.86234512971456e-05, + "loss": 0.8789, + "step": 9061 + }, + { + "epoch": 0.6139982383630327, + "grad_norm": 8.263740539550781, + "learning_rate": 8.862208227804778e-05, + "loss": 1.0911, + "step": 9062 + }, + { + "epoch": 0.6140659936310048, + "grad_norm": 7.048012733459473, + "learning_rate": 8.862071325894996e-05, + "loss": 0.7947, + "step": 9063 + }, + { + "epoch": 0.6141337488989769, + "grad_norm": 6.902647018432617, + "learning_rate": 8.861934423985216e-05, + "loss": 0.9391, + "step": 9064 + }, + { + "epoch": 0.614201504166949, + "grad_norm": 7.542623043060303, + "learning_rate": 8.861797522075434e-05, + "loss": 0.8601, + "step": 9065 + }, + { + "epoch": 0.6142692594349211, + "grad_norm": 6.404265403747559, + "learning_rate": 8.861660620165652e-05, + "loss": 0.7602, + "step": 9066 + }, + { + "epoch": 0.6143370147028931, + "grad_norm": 5.523179054260254, + "learning_rate": 8.86152371825587e-05, + "loss": 0.7821, + "step": 9067 + }, + { + "epoch": 0.6144047699708652, + "grad_norm": 5.909379005432129, + "learning_rate": 8.86138681634609e-05, + "loss": 0.788, + "step": 9068 + }, + { + "epoch": 0.6144725252388373, + "grad_norm": 6.69690465927124, + "learning_rate": 8.861249914436307e-05, + "loss": 0.9958, + "step": 9069 + }, + { + "epoch": 0.6145402805068094, + "grad_norm": 4.888934135437012, + "learning_rate": 8.861113012526525e-05, + "loss": 0.6594, + "step": 9070 + }, + { + "epoch": 0.6146080357747815, + "grad_norm": 6.2297892570495605, + "learning_rate": 8.860976110616743e-05, + "loss": 0.7989, + "step": 9071 + }, + { + "epoch": 0.6146757910427536, + "grad_norm": 6.807466506958008, + "learning_rate": 8.860839208706962e-05, + "loss": 0.9594, + "step": 9072 + }, + { + "epoch": 0.6147435463107257, + "grad_norm": 5.524564266204834, + "learning_rate": 8.860702306797181e-05, + "loss": 0.9957, + "step": 9073 + }, + { + "epoch": 0.6148113015786978, + "grad_norm": 6.862370014190674, + "learning_rate": 8.860565404887399e-05, + "loss": 0.8455, + "step": 9074 + }, + { + "epoch": 0.6148790568466699, + "grad_norm": 6.885018825531006, + "learning_rate": 8.860428502977617e-05, + "loss": 0.8901, + "step": 9075 + }, + { + "epoch": 0.6149468121146419, + "grad_norm": 5.2585344314575195, + "learning_rate": 8.860291601067835e-05, + "loss": 0.6791, + "step": 9076 + }, + { + "epoch": 0.615014567382614, + "grad_norm": 6.821567058563232, + "learning_rate": 8.860154699158054e-05, + "loss": 1.1323, + "step": 9077 + }, + { + "epoch": 0.615082322650586, + "grad_norm": 5.113526344299316, + "learning_rate": 8.860017797248272e-05, + "loss": 0.8949, + "step": 9078 + }, + { + "epoch": 0.6151500779185581, + "grad_norm": 6.890782356262207, + "learning_rate": 8.85988089533849e-05, + "loss": 0.9682, + "step": 9079 + }, + { + "epoch": 0.6152178331865302, + "grad_norm": 6.634467124938965, + "learning_rate": 8.859743993428708e-05, + "loss": 0.8697, + "step": 9080 + }, + { + "epoch": 0.6152855884545023, + "grad_norm": 8.704490661621094, + "learning_rate": 8.859607091518927e-05, + "loss": 1.0182, + "step": 9081 + }, + { + "epoch": 0.6153533437224744, + "grad_norm": 6.126344680786133, + "learning_rate": 8.859470189609146e-05, + "loss": 0.8234, + "step": 9082 + }, + { + "epoch": 0.6154210989904465, + "grad_norm": 4.622169017791748, + "learning_rate": 8.859333287699364e-05, + "loss": 0.5793, + "step": 9083 + }, + { + "epoch": 0.6154888542584186, + "grad_norm": 6.331384181976318, + "learning_rate": 8.859196385789582e-05, + "loss": 0.7669, + "step": 9084 + }, + { + "epoch": 0.6155566095263907, + "grad_norm": 5.072127819061279, + "learning_rate": 8.8590594838798e-05, + "loss": 0.8827, + "step": 9085 + }, + { + "epoch": 0.6156243647943628, + "grad_norm": 5.524892330169678, + "learning_rate": 8.85892258197002e-05, + "loss": 0.5116, + "step": 9086 + }, + { + "epoch": 0.6156921200623349, + "grad_norm": 7.506863594055176, + "learning_rate": 8.858785680060237e-05, + "loss": 1.1547, + "step": 9087 + }, + { + "epoch": 0.615759875330307, + "grad_norm": 5.211267471313477, + "learning_rate": 8.858648778150455e-05, + "loss": 0.7001, + "step": 9088 + }, + { + "epoch": 0.615827630598279, + "grad_norm": 5.154802322387695, + "learning_rate": 8.858511876240674e-05, + "loss": 0.6498, + "step": 9089 + }, + { + "epoch": 0.6158953858662511, + "grad_norm": 6.631251335144043, + "learning_rate": 8.858374974330892e-05, + "loss": 0.7026, + "step": 9090 + }, + { + "epoch": 0.6159631411342232, + "grad_norm": 6.677391529083252, + "learning_rate": 8.858238072421111e-05, + "loss": 0.8582, + "step": 9091 + }, + { + "epoch": 0.6160308964021952, + "grad_norm": 5.569202423095703, + "learning_rate": 8.858101170511329e-05, + "loss": 0.7886, + "step": 9092 + }, + { + "epoch": 0.6160986516701673, + "grad_norm": 6.454385280609131, + "learning_rate": 8.857964268601547e-05, + "loss": 0.7088, + "step": 9093 + }, + { + "epoch": 0.6161664069381394, + "grad_norm": 6.360875606536865, + "learning_rate": 8.857827366691765e-05, + "loss": 0.7079, + "step": 9094 + }, + { + "epoch": 0.6162341622061115, + "grad_norm": 6.538075923919678, + "learning_rate": 8.857690464781983e-05, + "loss": 0.851, + "step": 9095 + }, + { + "epoch": 0.6163019174740836, + "grad_norm": 6.111706256866455, + "learning_rate": 8.857553562872202e-05, + "loss": 0.6732, + "step": 9096 + }, + { + "epoch": 0.6163696727420557, + "grad_norm": 6.8173956871032715, + "learning_rate": 8.85741666096242e-05, + "loss": 0.8164, + "step": 9097 + }, + { + "epoch": 0.6164374280100278, + "grad_norm": 6.984659194946289, + "learning_rate": 8.857279759052639e-05, + "loss": 0.7083, + "step": 9098 + }, + { + "epoch": 0.6165051832779999, + "grad_norm": 5.256351947784424, + "learning_rate": 8.857142857142857e-05, + "loss": 0.7592, + "step": 9099 + }, + { + "epoch": 0.616572938545972, + "grad_norm": 8.32116985321045, + "learning_rate": 8.857005955233076e-05, + "loss": 1.041, + "step": 9100 + }, + { + "epoch": 0.616640693813944, + "grad_norm": 5.583303928375244, + "learning_rate": 8.856869053323294e-05, + "loss": 0.9433, + "step": 9101 + }, + { + "epoch": 0.6167084490819161, + "grad_norm": 8.839009284973145, + "learning_rate": 8.856732151413512e-05, + "loss": 1.1276, + "step": 9102 + }, + { + "epoch": 0.6167762043498882, + "grad_norm": 6.0189528465271, + "learning_rate": 8.856595249503731e-05, + "loss": 0.8986, + "step": 9103 + }, + { + "epoch": 0.6168439596178603, + "grad_norm": 6.6191205978393555, + "learning_rate": 8.85645834759395e-05, + "loss": 0.8594, + "step": 9104 + }, + { + "epoch": 0.6169117148858324, + "grad_norm": 7.318183898925781, + "learning_rate": 8.856321445684167e-05, + "loss": 0.936, + "step": 9105 + }, + { + "epoch": 0.6169794701538045, + "grad_norm": 5.551211357116699, + "learning_rate": 8.856184543774387e-05, + "loss": 0.8986, + "step": 9106 + }, + { + "epoch": 0.6170472254217766, + "grad_norm": 6.483643054962158, + "learning_rate": 8.856047641864605e-05, + "loss": 0.8394, + "step": 9107 + }, + { + "epoch": 0.6171149806897487, + "grad_norm": 6.838339805603027, + "learning_rate": 8.855910739954823e-05, + "loss": 0.8305, + "step": 9108 + }, + { + "epoch": 0.6171827359577207, + "grad_norm": 6.734610557556152, + "learning_rate": 8.855773838045042e-05, + "loss": 0.8967, + "step": 9109 + }, + { + "epoch": 0.6172504912256928, + "grad_norm": 5.749598503112793, + "learning_rate": 8.85563693613526e-05, + "loss": 0.8226, + "step": 9110 + }, + { + "epoch": 0.6173182464936648, + "grad_norm": 5.9449872970581055, + "learning_rate": 8.855500034225478e-05, + "loss": 0.7528, + "step": 9111 + }, + { + "epoch": 0.6173860017616369, + "grad_norm": 7.123237133026123, + "learning_rate": 8.855363132315696e-05, + "loss": 0.8002, + "step": 9112 + }, + { + "epoch": 0.617453757029609, + "grad_norm": 6.046530723571777, + "learning_rate": 8.855226230405914e-05, + "loss": 0.7788, + "step": 9113 + }, + { + "epoch": 0.6175215122975811, + "grad_norm": 5.466145992279053, + "learning_rate": 8.855089328496134e-05, + "loss": 0.6303, + "step": 9114 + }, + { + "epoch": 0.6175892675655532, + "grad_norm": 6.014889717102051, + "learning_rate": 8.854952426586352e-05, + "loss": 0.9445, + "step": 9115 + }, + { + "epoch": 0.6176570228335253, + "grad_norm": 6.305761814117432, + "learning_rate": 8.85481552467657e-05, + "loss": 0.7314, + "step": 9116 + }, + { + "epoch": 0.6177247781014974, + "grad_norm": 7.205751895904541, + "learning_rate": 8.854678622766788e-05, + "loss": 0.7635, + "step": 9117 + }, + { + "epoch": 0.6177925333694695, + "grad_norm": 7.25909948348999, + "learning_rate": 8.854541720857006e-05, + "loss": 0.8655, + "step": 9118 + }, + { + "epoch": 0.6178602886374416, + "grad_norm": 6.141407489776611, + "learning_rate": 8.854404818947225e-05, + "loss": 0.7744, + "step": 9119 + }, + { + "epoch": 0.6179280439054137, + "grad_norm": 7.026790142059326, + "learning_rate": 8.854267917037443e-05, + "loss": 0.6679, + "step": 9120 + }, + { + "epoch": 0.6179957991733858, + "grad_norm": 6.5543437004089355, + "learning_rate": 8.854131015127661e-05, + "loss": 0.7657, + "step": 9121 + }, + { + "epoch": 0.6180635544413579, + "grad_norm": 6.620877742767334, + "learning_rate": 8.85399411321788e-05, + "loss": 0.7058, + "step": 9122 + }, + { + "epoch": 0.61813130970933, + "grad_norm": 6.877584457397461, + "learning_rate": 8.853857211308099e-05, + "loss": 0.8799, + "step": 9123 + }, + { + "epoch": 0.618199064977302, + "grad_norm": 6.836280822753906, + "learning_rate": 8.853720309398317e-05, + "loss": 0.7901, + "step": 9124 + }, + { + "epoch": 0.618266820245274, + "grad_norm": 5.966172695159912, + "learning_rate": 8.853583407488535e-05, + "loss": 0.6202, + "step": 9125 + }, + { + "epoch": 0.6183345755132461, + "grad_norm": 6.592352867126465, + "learning_rate": 8.853446505578753e-05, + "loss": 0.8176, + "step": 9126 + }, + { + "epoch": 0.6184023307812182, + "grad_norm": 7.010197639465332, + "learning_rate": 8.853309603668971e-05, + "loss": 0.8645, + "step": 9127 + }, + { + "epoch": 0.6184700860491903, + "grad_norm": 6.962997913360596, + "learning_rate": 8.85317270175919e-05, + "loss": 0.7481, + "step": 9128 + }, + { + "epoch": 0.6185378413171624, + "grad_norm": 6.8080244064331055, + "learning_rate": 8.853035799849408e-05, + "loss": 0.8811, + "step": 9129 + }, + { + "epoch": 0.6186055965851345, + "grad_norm": 5.936764240264893, + "learning_rate": 8.852898897939626e-05, + "loss": 0.6795, + "step": 9130 + }, + { + "epoch": 0.6186733518531066, + "grad_norm": 6.132038116455078, + "learning_rate": 8.852761996029844e-05, + "loss": 0.9207, + "step": 9131 + }, + { + "epoch": 0.6187411071210787, + "grad_norm": 6.451957702636719, + "learning_rate": 8.852625094120064e-05, + "loss": 0.7098, + "step": 9132 + }, + { + "epoch": 0.6188088623890508, + "grad_norm": 6.429556369781494, + "learning_rate": 8.852488192210282e-05, + "loss": 0.8215, + "step": 9133 + }, + { + "epoch": 0.6188766176570228, + "grad_norm": 5.414734363555908, + "learning_rate": 8.8523512903005e-05, + "loss": 0.7205, + "step": 9134 + }, + { + "epoch": 0.6189443729249949, + "grad_norm": 7.48563289642334, + "learning_rate": 8.852214388390718e-05, + "loss": 0.9738, + "step": 9135 + }, + { + "epoch": 0.619012128192967, + "grad_norm": 7.014744758605957, + "learning_rate": 8.852077486480936e-05, + "loss": 0.7494, + "step": 9136 + }, + { + "epoch": 0.6190798834609391, + "grad_norm": 9.400522232055664, + "learning_rate": 8.851940584571155e-05, + "loss": 0.8805, + "step": 9137 + }, + { + "epoch": 0.6191476387289112, + "grad_norm": 5.838070869445801, + "learning_rate": 8.851803682661373e-05, + "loss": 0.7817, + "step": 9138 + }, + { + "epoch": 0.6192153939968833, + "grad_norm": 6.750514984130859, + "learning_rate": 8.851666780751591e-05, + "loss": 0.8173, + "step": 9139 + }, + { + "epoch": 0.6192831492648554, + "grad_norm": 8.359286308288574, + "learning_rate": 8.85152987884181e-05, + "loss": 0.8902, + "step": 9140 + }, + { + "epoch": 0.6193509045328274, + "grad_norm": 4.75096321105957, + "learning_rate": 8.851392976932029e-05, + "loss": 0.6834, + "step": 9141 + }, + { + "epoch": 0.6194186598007995, + "grad_norm": 6.427238464355469, + "learning_rate": 8.851256075022247e-05, + "loss": 0.7946, + "step": 9142 + }, + { + "epoch": 0.6194864150687716, + "grad_norm": 6.961672782897949, + "learning_rate": 8.851119173112465e-05, + "loss": 0.6882, + "step": 9143 + }, + { + "epoch": 0.6195541703367436, + "grad_norm": 7.266171455383301, + "learning_rate": 8.850982271202683e-05, + "loss": 0.8905, + "step": 9144 + }, + { + "epoch": 0.6196219256047157, + "grad_norm": 6.961668491363525, + "learning_rate": 8.850845369292901e-05, + "loss": 0.7874, + "step": 9145 + }, + { + "epoch": 0.6196896808726878, + "grad_norm": 6.123990535736084, + "learning_rate": 8.85070846738312e-05, + "loss": 0.7126, + "step": 9146 + }, + { + "epoch": 0.6197574361406599, + "grad_norm": 7.221274375915527, + "learning_rate": 8.850571565473338e-05, + "loss": 0.8443, + "step": 9147 + }, + { + "epoch": 0.619825191408632, + "grad_norm": 6.273942947387695, + "learning_rate": 8.850434663563556e-05, + "loss": 0.8795, + "step": 9148 + }, + { + "epoch": 0.6198929466766041, + "grad_norm": 6.505329608917236, + "learning_rate": 8.850297761653776e-05, + "loss": 0.845, + "step": 9149 + }, + { + "epoch": 0.6199607019445762, + "grad_norm": 6.682892322540283, + "learning_rate": 8.850160859743994e-05, + "loss": 0.8908, + "step": 9150 + }, + { + "epoch": 0.6200284572125483, + "grad_norm": 6.669549942016602, + "learning_rate": 8.850023957834212e-05, + "loss": 0.8027, + "step": 9151 + }, + { + "epoch": 0.6200962124805204, + "grad_norm": 4.51635217666626, + "learning_rate": 8.849887055924431e-05, + "loss": 0.6746, + "step": 9152 + }, + { + "epoch": 0.6201639677484925, + "grad_norm": 7.118916988372803, + "learning_rate": 8.84975015401465e-05, + "loss": 1.1932, + "step": 9153 + }, + { + "epoch": 0.6202317230164646, + "grad_norm": 5.470358371734619, + "learning_rate": 8.849613252104867e-05, + "loss": 0.9683, + "step": 9154 + }, + { + "epoch": 0.6202994782844367, + "grad_norm": 5.723430633544922, + "learning_rate": 8.849476350195087e-05, + "loss": 0.9483, + "step": 9155 + }, + { + "epoch": 0.6203672335524087, + "grad_norm": 6.195518493652344, + "learning_rate": 8.849339448285305e-05, + "loss": 0.6756, + "step": 9156 + }, + { + "epoch": 0.6204349888203808, + "grad_norm": 5.686561584472656, + "learning_rate": 8.849202546375523e-05, + "loss": 0.6946, + "step": 9157 + }, + { + "epoch": 0.6205027440883528, + "grad_norm": 6.569991588592529, + "learning_rate": 8.849065644465741e-05, + "loss": 0.8295, + "step": 9158 + }, + { + "epoch": 0.6205704993563249, + "grad_norm": 7.306698799133301, + "learning_rate": 8.848928742555959e-05, + "loss": 1.0515, + "step": 9159 + }, + { + "epoch": 0.620638254624297, + "grad_norm": 7.042558670043945, + "learning_rate": 8.848791840646178e-05, + "loss": 1.0191, + "step": 9160 + }, + { + "epoch": 0.6207060098922691, + "grad_norm": 6.552324295043945, + "learning_rate": 8.848654938736396e-05, + "loss": 0.7424, + "step": 9161 + }, + { + "epoch": 0.6207737651602412, + "grad_norm": 7.646967887878418, + "learning_rate": 8.848518036826614e-05, + "loss": 0.8457, + "step": 9162 + }, + { + "epoch": 0.6208415204282133, + "grad_norm": 6.561422348022461, + "learning_rate": 8.848381134916832e-05, + "loss": 0.7605, + "step": 9163 + }, + { + "epoch": 0.6209092756961854, + "grad_norm": 7.425536155700684, + "learning_rate": 8.848244233007052e-05, + "loss": 0.8042, + "step": 9164 + }, + { + "epoch": 0.6209770309641575, + "grad_norm": 5.6792521476745605, + "learning_rate": 8.84810733109727e-05, + "loss": 0.8775, + "step": 9165 + }, + { + "epoch": 0.6210447862321296, + "grad_norm": 6.988245010375977, + "learning_rate": 8.847970429187488e-05, + "loss": 0.7295, + "step": 9166 + }, + { + "epoch": 0.6211125415001016, + "grad_norm": 4.709329605102539, + "learning_rate": 8.847833527277706e-05, + "loss": 0.6508, + "step": 9167 + }, + { + "epoch": 0.6211802967680737, + "grad_norm": 7.601302623748779, + "learning_rate": 8.847696625367924e-05, + "loss": 0.6558, + "step": 9168 + }, + { + "epoch": 0.6212480520360458, + "grad_norm": 6.5276007652282715, + "learning_rate": 8.847559723458143e-05, + "loss": 0.6851, + "step": 9169 + }, + { + "epoch": 0.6213158073040179, + "grad_norm": 6.256960391998291, + "learning_rate": 8.847422821548361e-05, + "loss": 0.9257, + "step": 9170 + }, + { + "epoch": 0.62138356257199, + "grad_norm": 7.133554935455322, + "learning_rate": 8.84728591963858e-05, + "loss": 1.0145, + "step": 9171 + }, + { + "epoch": 0.6214513178399621, + "grad_norm": 7.465946674346924, + "learning_rate": 8.847149017728797e-05, + "loss": 1.1211, + "step": 9172 + }, + { + "epoch": 0.6215190731079342, + "grad_norm": 7.24479341506958, + "learning_rate": 8.847012115819015e-05, + "loss": 0.787, + "step": 9173 + }, + { + "epoch": 0.6215868283759062, + "grad_norm": 6.052404403686523, + "learning_rate": 8.846875213909235e-05, + "loss": 0.7718, + "step": 9174 + }, + { + "epoch": 0.6216545836438783, + "grad_norm": 7.27697229385376, + "learning_rate": 8.846738311999453e-05, + "loss": 0.8102, + "step": 9175 + }, + { + "epoch": 0.6217223389118504, + "grad_norm": 6.269348621368408, + "learning_rate": 8.846601410089671e-05, + "loss": 0.9089, + "step": 9176 + }, + { + "epoch": 0.6217900941798225, + "grad_norm": 5.706981658935547, + "learning_rate": 8.846464508179889e-05, + "loss": 0.8213, + "step": 9177 + }, + { + "epoch": 0.6218578494477945, + "grad_norm": 7.0556817054748535, + "learning_rate": 8.846327606270108e-05, + "loss": 0.696, + "step": 9178 + }, + { + "epoch": 0.6219256047157666, + "grad_norm": 5.418951034545898, + "learning_rate": 8.846190704360326e-05, + "loss": 0.6344, + "step": 9179 + }, + { + "epoch": 0.6219933599837387, + "grad_norm": 6.63913106918335, + "learning_rate": 8.846053802450544e-05, + "loss": 0.8222, + "step": 9180 + }, + { + "epoch": 0.6220611152517108, + "grad_norm": 6.560824394226074, + "learning_rate": 8.845916900540762e-05, + "loss": 0.7443, + "step": 9181 + }, + { + "epoch": 0.6221288705196829, + "grad_norm": 6.946655750274658, + "learning_rate": 8.84577999863098e-05, + "loss": 1.0413, + "step": 9182 + }, + { + "epoch": 0.622196625787655, + "grad_norm": 5.037294864654541, + "learning_rate": 8.8456430967212e-05, + "loss": 0.7489, + "step": 9183 + }, + { + "epoch": 0.6222643810556271, + "grad_norm": 6.396673202514648, + "learning_rate": 8.845506194811418e-05, + "loss": 0.768, + "step": 9184 + }, + { + "epoch": 0.6223321363235992, + "grad_norm": 6.354964733123779, + "learning_rate": 8.845369292901636e-05, + "loss": 1.0378, + "step": 9185 + }, + { + "epoch": 0.6223998915915713, + "grad_norm": 7.494623184204102, + "learning_rate": 8.845232390991854e-05, + "loss": 0.763, + "step": 9186 + }, + { + "epoch": 0.6224676468595434, + "grad_norm": 6.109148025512695, + "learning_rate": 8.845095489082073e-05, + "loss": 0.7609, + "step": 9187 + }, + { + "epoch": 0.6225354021275155, + "grad_norm": 7.039491653442383, + "learning_rate": 8.844958587172291e-05, + "loss": 0.7999, + "step": 9188 + }, + { + "epoch": 0.6226031573954875, + "grad_norm": 5.7005486488342285, + "learning_rate": 8.84482168526251e-05, + "loss": 0.7632, + "step": 9189 + }, + { + "epoch": 0.6226709126634595, + "grad_norm": 6.816334247589111, + "learning_rate": 8.844684783352727e-05, + "loss": 0.6813, + "step": 9190 + }, + { + "epoch": 0.6227386679314316, + "grad_norm": 7.970419883728027, + "learning_rate": 8.844547881442946e-05, + "loss": 0.5728, + "step": 9191 + }, + { + "epoch": 0.6228064231994037, + "grad_norm": 6.688904285430908, + "learning_rate": 8.844410979533165e-05, + "loss": 0.7974, + "step": 9192 + }, + { + "epoch": 0.6228741784673758, + "grad_norm": 6.076619625091553, + "learning_rate": 8.844274077623383e-05, + "loss": 0.7217, + "step": 9193 + }, + { + "epoch": 0.6229419337353479, + "grad_norm": 7.09970760345459, + "learning_rate": 8.844137175713601e-05, + "loss": 0.8479, + "step": 9194 + }, + { + "epoch": 0.62300968900332, + "grad_norm": 7.291125297546387, + "learning_rate": 8.84400027380382e-05, + "loss": 0.9391, + "step": 9195 + }, + { + "epoch": 0.6230774442712921, + "grad_norm": 6.2532148361206055, + "learning_rate": 8.843863371894038e-05, + "loss": 0.9657, + "step": 9196 + }, + { + "epoch": 0.6231451995392642, + "grad_norm": 6.567989349365234, + "learning_rate": 8.843726469984256e-05, + "loss": 0.767, + "step": 9197 + }, + { + "epoch": 0.6232129548072363, + "grad_norm": 5.543100357055664, + "learning_rate": 8.843589568074476e-05, + "loss": 0.7537, + "step": 9198 + }, + { + "epoch": 0.6232807100752084, + "grad_norm": 7.001931190490723, + "learning_rate": 8.843452666164694e-05, + "loss": 0.8386, + "step": 9199 + }, + { + "epoch": 0.6233484653431804, + "grad_norm": 6.852741718292236, + "learning_rate": 8.843315764254912e-05, + "loss": 0.9314, + "step": 9200 + }, + { + "epoch": 0.6234162206111525, + "grad_norm": 6.225865364074707, + "learning_rate": 8.843178862345131e-05, + "loss": 0.8302, + "step": 9201 + }, + { + "epoch": 0.6234839758791246, + "grad_norm": 7.8117594718933105, + "learning_rate": 8.843041960435349e-05, + "loss": 0.9156, + "step": 9202 + }, + { + "epoch": 0.6235517311470967, + "grad_norm": 7.585949420928955, + "learning_rate": 8.842905058525567e-05, + "loss": 0.9901, + "step": 9203 + }, + { + "epoch": 0.6236194864150688, + "grad_norm": 5.7599945068359375, + "learning_rate": 8.842768156615785e-05, + "loss": 0.8697, + "step": 9204 + }, + { + "epoch": 0.6236872416830409, + "grad_norm": 6.114898204803467, + "learning_rate": 8.842631254706003e-05, + "loss": 0.917, + "step": 9205 + }, + { + "epoch": 0.623754996951013, + "grad_norm": 6.57565450668335, + "learning_rate": 8.842494352796223e-05, + "loss": 0.9872, + "step": 9206 + }, + { + "epoch": 0.623822752218985, + "grad_norm": 6.324807643890381, + "learning_rate": 8.842357450886441e-05, + "loss": 0.804, + "step": 9207 + }, + { + "epoch": 0.6238905074869571, + "grad_norm": 7.262860298156738, + "learning_rate": 8.842220548976659e-05, + "loss": 0.8051, + "step": 9208 + }, + { + "epoch": 0.6239582627549292, + "grad_norm": 6.4210968017578125, + "learning_rate": 8.842083647066877e-05, + "loss": 0.9117, + "step": 9209 + }, + { + "epoch": 0.6240260180229013, + "grad_norm": 5.8516154289245605, + "learning_rate": 8.841946745157096e-05, + "loss": 0.7292, + "step": 9210 + }, + { + "epoch": 0.6240937732908733, + "grad_norm": 7.249476909637451, + "learning_rate": 8.841809843247314e-05, + "loss": 0.7149, + "step": 9211 + }, + { + "epoch": 0.6241615285588454, + "grad_norm": 6.276209831237793, + "learning_rate": 8.841672941337532e-05, + "loss": 0.8576, + "step": 9212 + }, + { + "epoch": 0.6242292838268175, + "grad_norm": 7.875953674316406, + "learning_rate": 8.84153603942775e-05, + "loss": 0.8345, + "step": 9213 + }, + { + "epoch": 0.6242970390947896, + "grad_norm": 6.010235786437988, + "learning_rate": 8.841399137517968e-05, + "loss": 0.8376, + "step": 9214 + }, + { + "epoch": 0.6243647943627617, + "grad_norm": 6.451040744781494, + "learning_rate": 8.841262235608188e-05, + "loss": 0.8943, + "step": 9215 + }, + { + "epoch": 0.6244325496307338, + "grad_norm": 7.024655342102051, + "learning_rate": 8.841125333698406e-05, + "loss": 0.634, + "step": 9216 + }, + { + "epoch": 0.6245003048987059, + "grad_norm": 7.169036865234375, + "learning_rate": 8.840988431788624e-05, + "loss": 0.8139, + "step": 9217 + }, + { + "epoch": 0.624568060166678, + "grad_norm": 5.163486003875732, + "learning_rate": 8.840851529878842e-05, + "loss": 0.847, + "step": 9218 + }, + { + "epoch": 0.6246358154346501, + "grad_norm": 6.5144429206848145, + "learning_rate": 8.840714627969061e-05, + "loss": 0.8632, + "step": 9219 + }, + { + "epoch": 0.6247035707026222, + "grad_norm": 8.250146865844727, + "learning_rate": 8.84057772605928e-05, + "loss": 1.1546, + "step": 9220 + }, + { + "epoch": 0.6247713259705943, + "grad_norm": 5.385178565979004, + "learning_rate": 8.840440824149497e-05, + "loss": 1.2391, + "step": 9221 + }, + { + "epoch": 0.6248390812385664, + "grad_norm": 6.536712169647217, + "learning_rate": 8.840303922239715e-05, + "loss": 0.8697, + "step": 9222 + }, + { + "epoch": 0.6249068365065383, + "grad_norm": 5.161468982696533, + "learning_rate": 8.840167020329933e-05, + "loss": 0.6399, + "step": 9223 + }, + { + "epoch": 0.6249745917745104, + "grad_norm": 6.684597969055176, + "learning_rate": 8.840030118420153e-05, + "loss": 0.8301, + "step": 9224 + }, + { + "epoch": 0.6250423470424825, + "grad_norm": 7.047337055206299, + "learning_rate": 8.839893216510371e-05, + "loss": 1.0555, + "step": 9225 + }, + { + "epoch": 0.6251101023104546, + "grad_norm": 6.454337120056152, + "learning_rate": 8.839756314600589e-05, + "loss": 0.7768, + "step": 9226 + }, + { + "epoch": 0.6251778575784267, + "grad_norm": 5.521293640136719, + "learning_rate": 8.839619412690807e-05, + "loss": 0.8204, + "step": 9227 + }, + { + "epoch": 0.6252456128463988, + "grad_norm": 7.6041364669799805, + "learning_rate": 8.839482510781025e-05, + "loss": 1.0063, + "step": 9228 + }, + { + "epoch": 0.6253133681143709, + "grad_norm": 6.339493751525879, + "learning_rate": 8.839345608871244e-05, + "loss": 0.6371, + "step": 9229 + }, + { + "epoch": 0.625381123382343, + "grad_norm": 6.844937324523926, + "learning_rate": 8.839208706961462e-05, + "loss": 0.7956, + "step": 9230 + }, + { + "epoch": 0.6254488786503151, + "grad_norm": 5.3728461265563965, + "learning_rate": 8.83907180505168e-05, + "loss": 0.765, + "step": 9231 + }, + { + "epoch": 0.6255166339182872, + "grad_norm": 9.041521072387695, + "learning_rate": 8.838934903141898e-05, + "loss": 0.6906, + "step": 9232 + }, + { + "epoch": 0.6255843891862592, + "grad_norm": 7.101466178894043, + "learning_rate": 8.838798001232118e-05, + "loss": 1.174, + "step": 9233 + }, + { + "epoch": 0.6256521444542313, + "grad_norm": 7.049058437347412, + "learning_rate": 8.838661099322336e-05, + "loss": 0.7884, + "step": 9234 + }, + { + "epoch": 0.6257198997222034, + "grad_norm": 6.012722969055176, + "learning_rate": 8.838524197412554e-05, + "loss": 0.7543, + "step": 9235 + }, + { + "epoch": 0.6257876549901755, + "grad_norm": 8.046865463256836, + "learning_rate": 8.838387295502772e-05, + "loss": 0.7745, + "step": 9236 + }, + { + "epoch": 0.6258554102581476, + "grad_norm": 6.681391716003418, + "learning_rate": 8.83825039359299e-05, + "loss": 0.8218, + "step": 9237 + }, + { + "epoch": 0.6259231655261197, + "grad_norm": 8.432860374450684, + "learning_rate": 8.83811349168321e-05, + "loss": 0.84, + "step": 9238 + }, + { + "epoch": 0.6259909207940917, + "grad_norm": 7.283944606781006, + "learning_rate": 8.837976589773427e-05, + "loss": 0.7066, + "step": 9239 + }, + { + "epoch": 0.6260586760620638, + "grad_norm": 6.028378009796143, + "learning_rate": 8.837839687863645e-05, + "loss": 0.7066, + "step": 9240 + }, + { + "epoch": 0.6261264313300359, + "grad_norm": 5.715835094451904, + "learning_rate": 8.837702785953863e-05, + "loss": 0.9115, + "step": 9241 + }, + { + "epoch": 0.626194186598008, + "grad_norm": 5.851448059082031, + "learning_rate": 8.837565884044083e-05, + "loss": 0.7464, + "step": 9242 + }, + { + "epoch": 0.62626194186598, + "grad_norm": 7.112005710601807, + "learning_rate": 8.837428982134301e-05, + "loss": 0.8635, + "step": 9243 + }, + { + "epoch": 0.6263296971339521, + "grad_norm": 6.428948879241943, + "learning_rate": 8.837292080224519e-05, + "loss": 0.9229, + "step": 9244 + }, + { + "epoch": 0.6263974524019242, + "grad_norm": 5.358401298522949, + "learning_rate": 8.837155178314738e-05, + "loss": 0.5362, + "step": 9245 + }, + { + "epoch": 0.6264652076698963, + "grad_norm": 5.889663219451904, + "learning_rate": 8.837018276404956e-05, + "loss": 0.995, + "step": 9246 + }, + { + "epoch": 0.6265329629378684, + "grad_norm": 5.585958480834961, + "learning_rate": 8.836881374495174e-05, + "loss": 0.8504, + "step": 9247 + }, + { + "epoch": 0.6266007182058405, + "grad_norm": 6.147828102111816, + "learning_rate": 8.836744472585394e-05, + "loss": 0.8975, + "step": 9248 + }, + { + "epoch": 0.6266684734738126, + "grad_norm": 6.951436996459961, + "learning_rate": 8.836607570675612e-05, + "loss": 1.063, + "step": 9249 + }, + { + "epoch": 0.6267362287417847, + "grad_norm": 6.059296131134033, + "learning_rate": 8.83647066876583e-05, + "loss": 0.815, + "step": 9250 + }, + { + "epoch": 0.6268039840097568, + "grad_norm": 8.177648544311523, + "learning_rate": 8.836333766856048e-05, + "loss": 0.9035, + "step": 9251 + }, + { + "epoch": 0.6268717392777289, + "grad_norm": 5.385120868682861, + "learning_rate": 8.836196864946267e-05, + "loss": 0.5322, + "step": 9252 + }, + { + "epoch": 0.626939494545701, + "grad_norm": 9.279441833496094, + "learning_rate": 8.836059963036485e-05, + "loss": 0.8954, + "step": 9253 + }, + { + "epoch": 0.6270072498136731, + "grad_norm": 6.143721103668213, + "learning_rate": 8.835923061126703e-05, + "loss": 0.7225, + "step": 9254 + }, + { + "epoch": 0.627075005081645, + "grad_norm": 7.277685642242432, + "learning_rate": 8.835786159216921e-05, + "loss": 0.769, + "step": 9255 + }, + { + "epoch": 0.6271427603496171, + "grad_norm": 6.348362445831299, + "learning_rate": 8.835649257307141e-05, + "loss": 0.8076, + "step": 9256 + }, + { + "epoch": 0.6272105156175892, + "grad_norm": 5.717894077301025, + "learning_rate": 8.835512355397359e-05, + "loss": 0.5646, + "step": 9257 + }, + { + "epoch": 0.6272782708855613, + "grad_norm": 5.959596157073975, + "learning_rate": 8.835375453487577e-05, + "loss": 0.8393, + "step": 9258 + }, + { + "epoch": 0.6273460261535334, + "grad_norm": 6.298025131225586, + "learning_rate": 8.835238551577795e-05, + "loss": 0.6772, + "step": 9259 + }, + { + "epoch": 0.6274137814215055, + "grad_norm": 8.076531410217285, + "learning_rate": 8.835101649668013e-05, + "loss": 1.2792, + "step": 9260 + }, + { + "epoch": 0.6274815366894776, + "grad_norm": 7.445847034454346, + "learning_rate": 8.834964747758232e-05, + "loss": 0.8439, + "step": 9261 + }, + { + "epoch": 0.6275492919574497, + "grad_norm": 8.871906280517578, + "learning_rate": 8.83482784584845e-05, + "loss": 0.7631, + "step": 9262 + }, + { + "epoch": 0.6276170472254218, + "grad_norm": 5.552763938903809, + "learning_rate": 8.834690943938668e-05, + "loss": 0.7293, + "step": 9263 + }, + { + "epoch": 0.6276848024933939, + "grad_norm": 6.359529495239258, + "learning_rate": 8.834554042028886e-05, + "loss": 0.8246, + "step": 9264 + }, + { + "epoch": 0.627752557761366, + "grad_norm": 6.481078147888184, + "learning_rate": 8.834417140119106e-05, + "loss": 0.8931, + "step": 9265 + }, + { + "epoch": 0.627820313029338, + "grad_norm": 6.024886608123779, + "learning_rate": 8.834280238209324e-05, + "loss": 0.6816, + "step": 9266 + }, + { + "epoch": 0.6278880682973101, + "grad_norm": 6.775660514831543, + "learning_rate": 8.834143336299542e-05, + "loss": 0.9066, + "step": 9267 + }, + { + "epoch": 0.6279558235652822, + "grad_norm": 6.696643829345703, + "learning_rate": 8.83400643438976e-05, + "loss": 0.7173, + "step": 9268 + }, + { + "epoch": 0.6280235788332543, + "grad_norm": 7.639727592468262, + "learning_rate": 8.833869532479978e-05, + "loss": 0.772, + "step": 9269 + }, + { + "epoch": 0.6280913341012264, + "grad_norm": 7.364283084869385, + "learning_rate": 8.833732630570197e-05, + "loss": 0.8966, + "step": 9270 + }, + { + "epoch": 0.6281590893691985, + "grad_norm": 6.8663482666015625, + "learning_rate": 8.833595728660415e-05, + "loss": 0.9459, + "step": 9271 + }, + { + "epoch": 0.6282268446371705, + "grad_norm": 7.687761306762695, + "learning_rate": 8.833458826750633e-05, + "loss": 1.0611, + "step": 9272 + }, + { + "epoch": 0.6282945999051426, + "grad_norm": 6.051063060760498, + "learning_rate": 8.833321924840851e-05, + "loss": 0.6693, + "step": 9273 + }, + { + "epoch": 0.6283623551731147, + "grad_norm": 6.312000274658203, + "learning_rate": 8.833185022931071e-05, + "loss": 0.6403, + "step": 9274 + }, + { + "epoch": 0.6284301104410868, + "grad_norm": 6.270723819732666, + "learning_rate": 8.833048121021289e-05, + "loss": 0.9183, + "step": 9275 + }, + { + "epoch": 0.6284978657090589, + "grad_norm": 6.80443811416626, + "learning_rate": 8.832911219111507e-05, + "loss": 0.6951, + "step": 9276 + }, + { + "epoch": 0.628565620977031, + "grad_norm": 6.09138822555542, + "learning_rate": 8.832774317201725e-05, + "loss": 0.8251, + "step": 9277 + }, + { + "epoch": 0.628633376245003, + "grad_norm": 5.778343200683594, + "learning_rate": 8.832637415291943e-05, + "loss": 0.5598, + "step": 9278 + }, + { + "epoch": 0.6287011315129751, + "grad_norm": 6.845189571380615, + "learning_rate": 8.832500513382162e-05, + "loss": 0.9961, + "step": 9279 + }, + { + "epoch": 0.6287688867809472, + "grad_norm": 5.6591081619262695, + "learning_rate": 8.83236361147238e-05, + "loss": 0.9207, + "step": 9280 + }, + { + "epoch": 0.6288366420489193, + "grad_norm": 6.356192588806152, + "learning_rate": 8.832226709562598e-05, + "loss": 0.9117, + "step": 9281 + }, + { + "epoch": 0.6289043973168914, + "grad_norm": 6.646953582763672, + "learning_rate": 8.832089807652816e-05, + "loss": 0.8016, + "step": 9282 + }, + { + "epoch": 0.6289721525848635, + "grad_norm": 7.553640365600586, + "learning_rate": 8.831952905743034e-05, + "loss": 0.872, + "step": 9283 + }, + { + "epoch": 0.6290399078528356, + "grad_norm": 6.47942590713501, + "learning_rate": 8.831816003833254e-05, + "loss": 0.7643, + "step": 9284 + }, + { + "epoch": 0.6291076631208077, + "grad_norm": 4.924723148345947, + "learning_rate": 8.831679101923472e-05, + "loss": 0.673, + "step": 9285 + }, + { + "epoch": 0.6291754183887798, + "grad_norm": 6.718869686126709, + "learning_rate": 8.83154220001369e-05, + "loss": 0.9365, + "step": 9286 + }, + { + "epoch": 0.6292431736567519, + "grad_norm": 6.06726598739624, + "learning_rate": 8.831405298103908e-05, + "loss": 0.7183, + "step": 9287 + }, + { + "epoch": 0.6293109289247238, + "grad_norm": 6.576848030090332, + "learning_rate": 8.831268396194127e-05, + "loss": 0.8866, + "step": 9288 + }, + { + "epoch": 0.6293786841926959, + "grad_norm": 7.97581148147583, + "learning_rate": 8.831131494284345e-05, + "loss": 0.8479, + "step": 9289 + }, + { + "epoch": 0.629446439460668, + "grad_norm": 5.632781028747559, + "learning_rate": 8.830994592374563e-05, + "loss": 0.7944, + "step": 9290 + }, + { + "epoch": 0.6295141947286401, + "grad_norm": 7.232397556304932, + "learning_rate": 8.830857690464783e-05, + "loss": 0.9172, + "step": 9291 + }, + { + "epoch": 0.6295819499966122, + "grad_norm": 7.6181464195251465, + "learning_rate": 8.830720788555001e-05, + "loss": 0.6232, + "step": 9292 + }, + { + "epoch": 0.6296497052645843, + "grad_norm": 7.363900661468506, + "learning_rate": 8.830583886645219e-05, + "loss": 0.9468, + "step": 9293 + }, + { + "epoch": 0.6297174605325564, + "grad_norm": 6.03270149230957, + "learning_rate": 8.830446984735438e-05, + "loss": 0.9073, + "step": 9294 + }, + { + "epoch": 0.6297852158005285, + "grad_norm": 6.522334098815918, + "learning_rate": 8.830310082825656e-05, + "loss": 0.7139, + "step": 9295 + }, + { + "epoch": 0.6298529710685006, + "grad_norm": 7.849470615386963, + "learning_rate": 8.830173180915874e-05, + "loss": 1.0359, + "step": 9296 + }, + { + "epoch": 0.6299207263364727, + "grad_norm": 7.876535892486572, + "learning_rate": 8.830036279006094e-05, + "loss": 0.8647, + "step": 9297 + }, + { + "epoch": 0.6299884816044448, + "grad_norm": 5.959194183349609, + "learning_rate": 8.829899377096312e-05, + "loss": 0.9567, + "step": 9298 + }, + { + "epoch": 0.6300562368724169, + "grad_norm": 5.821471691131592, + "learning_rate": 8.82976247518653e-05, + "loss": 0.8456, + "step": 9299 + }, + { + "epoch": 0.6301239921403889, + "grad_norm": 5.807693004608154, + "learning_rate": 8.829625573276748e-05, + "loss": 0.8929, + "step": 9300 + }, + { + "epoch": 0.630191747408361, + "grad_norm": 6.666048049926758, + "learning_rate": 8.829488671366966e-05, + "loss": 0.891, + "step": 9301 + }, + { + "epoch": 0.6302595026763331, + "grad_norm": 6.061259746551514, + "learning_rate": 8.829351769457185e-05, + "loss": 0.7318, + "step": 9302 + }, + { + "epoch": 0.6303272579443052, + "grad_norm": 6.72253942489624, + "learning_rate": 8.829214867547403e-05, + "loss": 0.9176, + "step": 9303 + }, + { + "epoch": 0.6303950132122772, + "grad_norm": 6.540244102478027, + "learning_rate": 8.829077965637621e-05, + "loss": 0.7941, + "step": 9304 + }, + { + "epoch": 0.6304627684802493, + "grad_norm": 6.3027262687683105, + "learning_rate": 8.82894106372784e-05, + "loss": 0.7615, + "step": 9305 + }, + { + "epoch": 0.6305305237482214, + "grad_norm": 5.5668745040893555, + "learning_rate": 8.828804161818057e-05, + "loss": 0.7446, + "step": 9306 + }, + { + "epoch": 0.6305982790161935, + "grad_norm": 6.234310150146484, + "learning_rate": 8.828667259908277e-05, + "loss": 0.8601, + "step": 9307 + }, + { + "epoch": 0.6306660342841656, + "grad_norm": 6.879257678985596, + "learning_rate": 8.828530357998495e-05, + "loss": 0.8312, + "step": 9308 + }, + { + "epoch": 0.6307337895521377, + "grad_norm": 5.684000015258789, + "learning_rate": 8.828393456088713e-05, + "loss": 0.8129, + "step": 9309 + }, + { + "epoch": 0.6308015448201097, + "grad_norm": 6.523510932922363, + "learning_rate": 8.828256554178931e-05, + "loss": 1.0274, + "step": 9310 + }, + { + "epoch": 0.6308693000880818, + "grad_norm": 6.060532569885254, + "learning_rate": 8.82811965226915e-05, + "loss": 0.6513, + "step": 9311 + }, + { + "epoch": 0.6309370553560539, + "grad_norm": 6.2253828048706055, + "learning_rate": 8.827982750359368e-05, + "loss": 0.7539, + "step": 9312 + }, + { + "epoch": 0.631004810624026, + "grad_norm": 5.188565254211426, + "learning_rate": 8.827845848449586e-05, + "loss": 0.7904, + "step": 9313 + }, + { + "epoch": 0.6310725658919981, + "grad_norm": 6.081325054168701, + "learning_rate": 8.827708946539804e-05, + "loss": 0.8399, + "step": 9314 + }, + { + "epoch": 0.6311403211599702, + "grad_norm": 5.515079975128174, + "learning_rate": 8.827572044630022e-05, + "loss": 0.7415, + "step": 9315 + }, + { + "epoch": 0.6312080764279423, + "grad_norm": 6.149160385131836, + "learning_rate": 8.827435142720242e-05, + "loss": 0.6537, + "step": 9316 + }, + { + "epoch": 0.6312758316959144, + "grad_norm": 8.075578689575195, + "learning_rate": 8.82729824081046e-05, + "loss": 0.9744, + "step": 9317 + }, + { + "epoch": 0.6313435869638865, + "grad_norm": 5.570530414581299, + "learning_rate": 8.827161338900678e-05, + "loss": 0.6468, + "step": 9318 + }, + { + "epoch": 0.6314113422318586, + "grad_norm": 7.706857204437256, + "learning_rate": 8.827024436990896e-05, + "loss": 1.0308, + "step": 9319 + }, + { + "epoch": 0.6314790974998307, + "grad_norm": 8.566661834716797, + "learning_rate": 8.826887535081115e-05, + "loss": 0.6278, + "step": 9320 + }, + { + "epoch": 0.6315468527678026, + "grad_norm": 6.264912128448486, + "learning_rate": 8.826750633171333e-05, + "loss": 0.9, + "step": 9321 + }, + { + "epoch": 0.6316146080357747, + "grad_norm": 6.289927005767822, + "learning_rate": 8.826613731261551e-05, + "loss": 0.788, + "step": 9322 + }, + { + "epoch": 0.6316823633037468, + "grad_norm": 6.285764217376709, + "learning_rate": 8.82647682935177e-05, + "loss": 0.7928, + "step": 9323 + }, + { + "epoch": 0.6317501185717189, + "grad_norm": 6.555515766143799, + "learning_rate": 8.826339927441987e-05, + "loss": 0.7345, + "step": 9324 + }, + { + "epoch": 0.631817873839691, + "grad_norm": 7.84982967376709, + "learning_rate": 8.826203025532207e-05, + "loss": 0.917, + "step": 9325 + }, + { + "epoch": 0.6318856291076631, + "grad_norm": 5.471888065338135, + "learning_rate": 8.826066123622425e-05, + "loss": 0.7449, + "step": 9326 + }, + { + "epoch": 0.6319533843756352, + "grad_norm": 7.806143283843994, + "learning_rate": 8.825929221712643e-05, + "loss": 0.8559, + "step": 9327 + }, + { + "epoch": 0.6320211396436073, + "grad_norm": 6.160640716552734, + "learning_rate": 8.825792319802861e-05, + "loss": 0.7819, + "step": 9328 + }, + { + "epoch": 0.6320888949115794, + "grad_norm": 6.968262195587158, + "learning_rate": 8.825655417893079e-05, + "loss": 1.1055, + "step": 9329 + }, + { + "epoch": 0.6321566501795515, + "grad_norm": 5.455170154571533, + "learning_rate": 8.825518515983298e-05, + "loss": 0.7852, + "step": 9330 + }, + { + "epoch": 0.6322244054475236, + "grad_norm": 7.499693870544434, + "learning_rate": 8.825381614073516e-05, + "loss": 0.8827, + "step": 9331 + }, + { + "epoch": 0.6322921607154957, + "grad_norm": 6.363000869750977, + "learning_rate": 8.825244712163734e-05, + "loss": 0.9469, + "step": 9332 + }, + { + "epoch": 0.6323599159834677, + "grad_norm": 6.631052017211914, + "learning_rate": 8.825107810253952e-05, + "loss": 0.8891, + "step": 9333 + }, + { + "epoch": 0.6324276712514398, + "grad_norm": 5.2757697105407715, + "learning_rate": 8.824970908344172e-05, + "loss": 0.8287, + "step": 9334 + }, + { + "epoch": 0.6324954265194119, + "grad_norm": 8.268365859985352, + "learning_rate": 8.82483400643439e-05, + "loss": 0.9812, + "step": 9335 + }, + { + "epoch": 0.632563181787384, + "grad_norm": 6.276435852050781, + "learning_rate": 8.824697104524608e-05, + "loss": 0.9827, + "step": 9336 + }, + { + "epoch": 0.632630937055356, + "grad_norm": 5.93610954284668, + "learning_rate": 8.824560202614827e-05, + "loss": 0.8656, + "step": 9337 + }, + { + "epoch": 0.6326986923233281, + "grad_norm": 6.036667346954346, + "learning_rate": 8.824423300705045e-05, + "loss": 0.9812, + "step": 9338 + }, + { + "epoch": 0.6327664475913002, + "grad_norm": 5.460302829742432, + "learning_rate": 8.824286398795263e-05, + "loss": 0.7286, + "step": 9339 + }, + { + "epoch": 0.6328342028592723, + "grad_norm": 6.362276554107666, + "learning_rate": 8.824149496885483e-05, + "loss": 0.8244, + "step": 9340 + }, + { + "epoch": 0.6329019581272444, + "grad_norm": 7.416135311126709, + "learning_rate": 8.824012594975701e-05, + "loss": 0.7997, + "step": 9341 + }, + { + "epoch": 0.6329697133952165, + "grad_norm": 7.711816787719727, + "learning_rate": 8.823875693065919e-05, + "loss": 1.0054, + "step": 9342 + }, + { + "epoch": 0.6330374686631886, + "grad_norm": 5.800533771514893, + "learning_rate": 8.823738791156138e-05, + "loss": 1.1502, + "step": 9343 + }, + { + "epoch": 0.6331052239311606, + "grad_norm": 5.00458288192749, + "learning_rate": 8.823601889246356e-05, + "loss": 0.6607, + "step": 9344 + }, + { + "epoch": 0.6331729791991327, + "grad_norm": 6.7192277908325195, + "learning_rate": 8.823464987336574e-05, + "loss": 1.0327, + "step": 9345 + }, + { + "epoch": 0.6332407344671048, + "grad_norm": 6.815017223358154, + "learning_rate": 8.823328085426792e-05, + "loss": 0.7579, + "step": 9346 + }, + { + "epoch": 0.6333084897350769, + "grad_norm": 7.114835262298584, + "learning_rate": 8.82319118351701e-05, + "loss": 1.0215, + "step": 9347 + }, + { + "epoch": 0.633376245003049, + "grad_norm": 4.819394111633301, + "learning_rate": 8.82305428160723e-05, + "loss": 0.6611, + "step": 9348 + }, + { + "epoch": 0.6334440002710211, + "grad_norm": 7.824159145355225, + "learning_rate": 8.822917379697448e-05, + "loss": 1.0557, + "step": 9349 + }, + { + "epoch": 0.6335117555389932, + "grad_norm": 6.462610721588135, + "learning_rate": 8.822780477787666e-05, + "loss": 1.1399, + "step": 9350 + }, + { + "epoch": 0.6335795108069653, + "grad_norm": 5.1685709953308105, + "learning_rate": 8.822643575877884e-05, + "loss": 0.7156, + "step": 9351 + }, + { + "epoch": 0.6336472660749374, + "grad_norm": 6.346011638641357, + "learning_rate": 8.822506673968103e-05, + "loss": 0.8654, + "step": 9352 + }, + { + "epoch": 0.6337150213429094, + "grad_norm": 5.18143367767334, + "learning_rate": 8.822369772058321e-05, + "loss": 0.6909, + "step": 9353 + }, + { + "epoch": 0.6337827766108814, + "grad_norm": 5.92888069152832, + "learning_rate": 8.822232870148539e-05, + "loss": 0.7467, + "step": 9354 + }, + { + "epoch": 0.6338505318788535, + "grad_norm": 7.556412696838379, + "learning_rate": 8.822095968238757e-05, + "loss": 0.9099, + "step": 9355 + }, + { + "epoch": 0.6339182871468256, + "grad_norm": 5.749037742614746, + "learning_rate": 8.821959066328975e-05, + "loss": 0.7791, + "step": 9356 + }, + { + "epoch": 0.6339860424147977, + "grad_norm": 6.511964321136475, + "learning_rate": 8.821822164419195e-05, + "loss": 0.879, + "step": 9357 + }, + { + "epoch": 0.6340537976827698, + "grad_norm": 7.070270538330078, + "learning_rate": 8.821685262509413e-05, + "loss": 0.8151, + "step": 9358 + }, + { + "epoch": 0.6341215529507419, + "grad_norm": 7.1327409744262695, + "learning_rate": 8.821548360599631e-05, + "loss": 0.9491, + "step": 9359 + }, + { + "epoch": 0.634189308218714, + "grad_norm": 5.8238911628723145, + "learning_rate": 8.821411458689849e-05, + "loss": 0.6894, + "step": 9360 + }, + { + "epoch": 0.6342570634866861, + "grad_norm": 5.967693328857422, + "learning_rate": 8.821274556780067e-05, + "loss": 0.7576, + "step": 9361 + }, + { + "epoch": 0.6343248187546582, + "grad_norm": 6.3497395515441895, + "learning_rate": 8.821137654870286e-05, + "loss": 0.9133, + "step": 9362 + }, + { + "epoch": 0.6343925740226303, + "grad_norm": 5.200571060180664, + "learning_rate": 8.821000752960504e-05, + "loss": 0.6492, + "step": 9363 + }, + { + "epoch": 0.6344603292906024, + "grad_norm": 6.661485195159912, + "learning_rate": 8.820863851050722e-05, + "loss": 0.7363, + "step": 9364 + }, + { + "epoch": 0.6345280845585745, + "grad_norm": 5.2447733879089355, + "learning_rate": 8.82072694914094e-05, + "loss": 0.7672, + "step": 9365 + }, + { + "epoch": 0.6345958398265465, + "grad_norm": 6.811657905578613, + "learning_rate": 8.82059004723116e-05, + "loss": 0.9903, + "step": 9366 + }, + { + "epoch": 0.6346635950945186, + "grad_norm": 5.834871768951416, + "learning_rate": 8.820453145321378e-05, + "loss": 0.697, + "step": 9367 + }, + { + "epoch": 0.6347313503624907, + "grad_norm": 6.931889057159424, + "learning_rate": 8.820316243411596e-05, + "loss": 0.8039, + "step": 9368 + }, + { + "epoch": 0.6347991056304628, + "grad_norm": 5.947389602661133, + "learning_rate": 8.820179341501814e-05, + "loss": 0.7339, + "step": 9369 + }, + { + "epoch": 0.6348668608984348, + "grad_norm": 6.914769649505615, + "learning_rate": 8.820042439592032e-05, + "loss": 0.7366, + "step": 9370 + }, + { + "epoch": 0.6349346161664069, + "grad_norm": 7.74104118347168, + "learning_rate": 8.819905537682251e-05, + "loss": 1.1219, + "step": 9371 + }, + { + "epoch": 0.635002371434379, + "grad_norm": 6.544697284698486, + "learning_rate": 8.819768635772469e-05, + "loss": 0.7795, + "step": 9372 + }, + { + "epoch": 0.6350701267023511, + "grad_norm": 6.4533772468566895, + "learning_rate": 8.819631733862687e-05, + "loss": 0.8343, + "step": 9373 + }, + { + "epoch": 0.6351378819703232, + "grad_norm": 6.726015090942383, + "learning_rate": 8.819494831952905e-05, + "loss": 0.6306, + "step": 9374 + }, + { + "epoch": 0.6352056372382953, + "grad_norm": 6.396018981933594, + "learning_rate": 8.819357930043125e-05, + "loss": 0.7632, + "step": 9375 + }, + { + "epoch": 0.6352733925062674, + "grad_norm": 7.739027500152588, + "learning_rate": 8.819221028133343e-05, + "loss": 1.2215, + "step": 9376 + }, + { + "epoch": 0.6353411477742394, + "grad_norm": 6.827197551727295, + "learning_rate": 8.819084126223561e-05, + "loss": 1.0954, + "step": 9377 + }, + { + "epoch": 0.6354089030422115, + "grad_norm": 6.967258930206299, + "learning_rate": 8.818947224313779e-05, + "loss": 1.1225, + "step": 9378 + }, + { + "epoch": 0.6354766583101836, + "grad_norm": 7.872466564178467, + "learning_rate": 8.818810322403997e-05, + "loss": 0.8114, + "step": 9379 + }, + { + "epoch": 0.6355444135781557, + "grad_norm": 6.985890865325928, + "learning_rate": 8.818673420494216e-05, + "loss": 0.8177, + "step": 9380 + }, + { + "epoch": 0.6356121688461278, + "grad_norm": 7.022130966186523, + "learning_rate": 8.818536518584434e-05, + "loss": 0.7718, + "step": 9381 + }, + { + "epoch": 0.6356799241140999, + "grad_norm": 8.023907661437988, + "learning_rate": 8.818399616674652e-05, + "loss": 0.9181, + "step": 9382 + }, + { + "epoch": 0.635747679382072, + "grad_norm": 7.373246669769287, + "learning_rate": 8.818262714764872e-05, + "loss": 0.8733, + "step": 9383 + }, + { + "epoch": 0.6358154346500441, + "grad_norm": 5.75626802444458, + "learning_rate": 8.81812581285509e-05, + "loss": 0.6196, + "step": 9384 + }, + { + "epoch": 0.6358831899180162, + "grad_norm": 7.449457168579102, + "learning_rate": 8.817988910945308e-05, + "loss": 1.1738, + "step": 9385 + }, + { + "epoch": 0.6359509451859882, + "grad_norm": 6.01223087310791, + "learning_rate": 8.817852009035527e-05, + "loss": 0.6577, + "step": 9386 + }, + { + "epoch": 0.6360187004539603, + "grad_norm": 5.752939701080322, + "learning_rate": 8.817715107125745e-05, + "loss": 0.8448, + "step": 9387 + }, + { + "epoch": 0.6360864557219323, + "grad_norm": 6.657622337341309, + "learning_rate": 8.817578205215963e-05, + "loss": 0.8274, + "step": 9388 + }, + { + "epoch": 0.6361542109899044, + "grad_norm": 8.236910820007324, + "learning_rate": 8.817441303306183e-05, + "loss": 1.0927, + "step": 9389 + }, + { + "epoch": 0.6362219662578765, + "grad_norm": 6.963682651519775, + "learning_rate": 8.817304401396401e-05, + "loss": 1.0313, + "step": 9390 + }, + { + "epoch": 0.6362897215258486, + "grad_norm": 7.404387950897217, + "learning_rate": 8.817167499486619e-05, + "loss": 1.0147, + "step": 9391 + }, + { + "epoch": 0.6363574767938207, + "grad_norm": 6.6976470947265625, + "learning_rate": 8.817030597576837e-05, + "loss": 0.8769, + "step": 9392 + }, + { + "epoch": 0.6364252320617928, + "grad_norm": 5.77000617980957, + "learning_rate": 8.816893695667055e-05, + "loss": 0.8143, + "step": 9393 + }, + { + "epoch": 0.6364929873297649, + "grad_norm": 5.952037334442139, + "learning_rate": 8.816756793757274e-05, + "loss": 0.8243, + "step": 9394 + }, + { + "epoch": 0.636560742597737, + "grad_norm": 7.398003578186035, + "learning_rate": 8.816619891847492e-05, + "loss": 1.0528, + "step": 9395 + }, + { + "epoch": 0.6366284978657091, + "grad_norm": 7.800195693969727, + "learning_rate": 8.81648298993771e-05, + "loss": 0.8146, + "step": 9396 + }, + { + "epoch": 0.6366962531336812, + "grad_norm": 5.836297035217285, + "learning_rate": 8.816346088027928e-05, + "loss": 0.6748, + "step": 9397 + }, + { + "epoch": 0.6367640084016533, + "grad_norm": 8.607617378234863, + "learning_rate": 8.816209186118148e-05, + "loss": 1.1096, + "step": 9398 + }, + { + "epoch": 0.6368317636696254, + "grad_norm": 5.9134392738342285, + "learning_rate": 8.816072284208366e-05, + "loss": 0.8241, + "step": 9399 + }, + { + "epoch": 0.6368995189375974, + "grad_norm": 6.997758865356445, + "learning_rate": 8.815935382298584e-05, + "loss": 0.842, + "step": 9400 + }, + { + "epoch": 0.6369672742055695, + "grad_norm": 6.37071418762207, + "learning_rate": 8.815798480388802e-05, + "loss": 0.8297, + "step": 9401 + }, + { + "epoch": 0.6370350294735415, + "grad_norm": 5.971574306488037, + "learning_rate": 8.81566157847902e-05, + "loss": 0.7567, + "step": 9402 + }, + { + "epoch": 0.6371027847415136, + "grad_norm": 7.232084274291992, + "learning_rate": 8.815524676569239e-05, + "loss": 0.9095, + "step": 9403 + }, + { + "epoch": 0.6371705400094857, + "grad_norm": 4.349423885345459, + "learning_rate": 8.815387774659457e-05, + "loss": 0.568, + "step": 9404 + }, + { + "epoch": 0.6372382952774578, + "grad_norm": 5.8268208503723145, + "learning_rate": 8.815250872749675e-05, + "loss": 0.9203, + "step": 9405 + }, + { + "epoch": 0.6373060505454299, + "grad_norm": 7.490527153015137, + "learning_rate": 8.815113970839893e-05, + "loss": 1.1386, + "step": 9406 + }, + { + "epoch": 0.637373805813402, + "grad_norm": 7.359919548034668, + "learning_rate": 8.814977068930113e-05, + "loss": 1.1407, + "step": 9407 + }, + { + "epoch": 0.6374415610813741, + "grad_norm": 8.83666706085205, + "learning_rate": 8.814840167020331e-05, + "loss": 0.7869, + "step": 9408 + }, + { + "epoch": 0.6375093163493462, + "grad_norm": 8.88762378692627, + "learning_rate": 8.814703265110549e-05, + "loss": 1.1761, + "step": 9409 + }, + { + "epoch": 0.6375770716173182, + "grad_norm": 5.899496078491211, + "learning_rate": 8.814566363200767e-05, + "loss": 0.8142, + "step": 9410 + }, + { + "epoch": 0.6376448268852903, + "grad_norm": 5.661779880523682, + "learning_rate": 8.814429461290985e-05, + "loss": 0.7744, + "step": 9411 + }, + { + "epoch": 0.6377125821532624, + "grad_norm": 6.715488910675049, + "learning_rate": 8.814292559381204e-05, + "loss": 0.7039, + "step": 9412 + }, + { + "epoch": 0.6377803374212345, + "grad_norm": 6.482609272003174, + "learning_rate": 8.814155657471422e-05, + "loss": 0.6712, + "step": 9413 + }, + { + "epoch": 0.6378480926892066, + "grad_norm": 6.715174198150635, + "learning_rate": 8.81401875556164e-05, + "loss": 0.899, + "step": 9414 + }, + { + "epoch": 0.6379158479571787, + "grad_norm": 5.657138824462891, + "learning_rate": 8.813881853651858e-05, + "loss": 0.7513, + "step": 9415 + }, + { + "epoch": 0.6379836032251508, + "grad_norm": 5.905570983886719, + "learning_rate": 8.813744951742076e-05, + "loss": 0.8074, + "step": 9416 + }, + { + "epoch": 0.6380513584931229, + "grad_norm": 7.106529235839844, + "learning_rate": 8.813608049832296e-05, + "loss": 0.8908, + "step": 9417 + }, + { + "epoch": 0.638119113761095, + "grad_norm": 6.900413990020752, + "learning_rate": 8.813471147922514e-05, + "loss": 0.8776, + "step": 9418 + }, + { + "epoch": 0.638186869029067, + "grad_norm": 6.651158809661865, + "learning_rate": 8.813334246012732e-05, + "loss": 0.8336, + "step": 9419 + }, + { + "epoch": 0.638254624297039, + "grad_norm": 8.57502269744873, + "learning_rate": 8.81319734410295e-05, + "loss": 1.0396, + "step": 9420 + }, + { + "epoch": 0.6383223795650111, + "grad_norm": 4.871134281158447, + "learning_rate": 8.813060442193169e-05, + "loss": 0.7517, + "step": 9421 + }, + { + "epoch": 0.6383901348329832, + "grad_norm": 10.170848846435547, + "learning_rate": 8.812923540283387e-05, + "loss": 0.9677, + "step": 9422 + }, + { + "epoch": 0.6384578901009553, + "grad_norm": 6.10742712020874, + "learning_rate": 8.812786638373605e-05, + "loss": 0.9884, + "step": 9423 + }, + { + "epoch": 0.6385256453689274, + "grad_norm": 5.653887748718262, + "learning_rate": 8.812649736463823e-05, + "loss": 0.7325, + "step": 9424 + }, + { + "epoch": 0.6385934006368995, + "grad_norm": 6.501628398895264, + "learning_rate": 8.812512834554041e-05, + "loss": 0.8378, + "step": 9425 + }, + { + "epoch": 0.6386611559048716, + "grad_norm": 5.696453094482422, + "learning_rate": 8.812375932644261e-05, + "loss": 0.787, + "step": 9426 + }, + { + "epoch": 0.6387289111728437, + "grad_norm": 6.385643482208252, + "learning_rate": 8.812239030734479e-05, + "loss": 0.7208, + "step": 9427 + }, + { + "epoch": 0.6387966664408158, + "grad_norm": 8.655128479003906, + "learning_rate": 8.812102128824697e-05, + "loss": 0.5881, + "step": 9428 + }, + { + "epoch": 0.6388644217087879, + "grad_norm": 5.63440465927124, + "learning_rate": 8.811965226914916e-05, + "loss": 0.8304, + "step": 9429 + }, + { + "epoch": 0.63893217697676, + "grad_norm": 7.352899074554443, + "learning_rate": 8.811828325005134e-05, + "loss": 0.9048, + "step": 9430 + }, + { + "epoch": 0.6389999322447321, + "grad_norm": 6.372758865356445, + "learning_rate": 8.811691423095352e-05, + "loss": 0.8811, + "step": 9431 + }, + { + "epoch": 0.6390676875127042, + "grad_norm": 8.284529685974121, + "learning_rate": 8.811554521185572e-05, + "loss": 1.0255, + "step": 9432 + }, + { + "epoch": 0.6391354427806762, + "grad_norm": 4.690229892730713, + "learning_rate": 8.81141761927579e-05, + "loss": 0.6671, + "step": 9433 + }, + { + "epoch": 0.6392031980486483, + "grad_norm": 5.906257629394531, + "learning_rate": 8.811280717366008e-05, + "loss": 1.0164, + "step": 9434 + }, + { + "epoch": 0.6392709533166203, + "grad_norm": 5.012555122375488, + "learning_rate": 8.811143815456227e-05, + "loss": 0.5985, + "step": 9435 + }, + { + "epoch": 0.6393387085845924, + "grad_norm": 9.228821754455566, + "learning_rate": 8.811006913546445e-05, + "loss": 0.9541, + "step": 9436 + }, + { + "epoch": 0.6394064638525645, + "grad_norm": 6.516229152679443, + "learning_rate": 8.810870011636663e-05, + "loss": 1.0687, + "step": 9437 + }, + { + "epoch": 0.6394742191205366, + "grad_norm": 6.953182220458984, + "learning_rate": 8.810733109726881e-05, + "loss": 0.6515, + "step": 9438 + }, + { + "epoch": 0.6395419743885087, + "grad_norm": 5.848710536956787, + "learning_rate": 8.810596207817099e-05, + "loss": 0.9173, + "step": 9439 + }, + { + "epoch": 0.6396097296564808, + "grad_norm": 6.093873023986816, + "learning_rate": 8.810459305907319e-05, + "loss": 0.7612, + "step": 9440 + }, + { + "epoch": 0.6396774849244529, + "grad_norm": 6.799055099487305, + "learning_rate": 8.810322403997537e-05, + "loss": 0.9219, + "step": 9441 + }, + { + "epoch": 0.639745240192425, + "grad_norm": 6.2249436378479, + "learning_rate": 8.810185502087755e-05, + "loss": 0.8459, + "step": 9442 + }, + { + "epoch": 0.639812995460397, + "grad_norm": 9.087257385253906, + "learning_rate": 8.810048600177973e-05, + "loss": 0.7746, + "step": 9443 + }, + { + "epoch": 0.6398807507283691, + "grad_norm": 5.2963433265686035, + "learning_rate": 8.809911698268192e-05, + "loss": 1.0891, + "step": 9444 + }, + { + "epoch": 0.6399485059963412, + "grad_norm": 6.330840110778809, + "learning_rate": 8.80977479635841e-05, + "loss": 0.7877, + "step": 9445 + }, + { + "epoch": 0.6400162612643133, + "grad_norm": 6.288755416870117, + "learning_rate": 8.809637894448628e-05, + "loss": 0.8538, + "step": 9446 + }, + { + "epoch": 0.6400840165322854, + "grad_norm": 6.01615571975708, + "learning_rate": 8.809500992538846e-05, + "loss": 0.8595, + "step": 9447 + }, + { + "epoch": 0.6401517718002575, + "grad_norm": 5.402866840362549, + "learning_rate": 8.809364090629064e-05, + "loss": 0.7067, + "step": 9448 + }, + { + "epoch": 0.6402195270682296, + "grad_norm": 5.39243221282959, + "learning_rate": 8.809227188719284e-05, + "loss": 0.7097, + "step": 9449 + }, + { + "epoch": 0.6402872823362017, + "grad_norm": 5.97304630279541, + "learning_rate": 8.809090286809502e-05, + "loss": 0.8014, + "step": 9450 + }, + { + "epoch": 0.6403550376041737, + "grad_norm": 9.370304107666016, + "learning_rate": 8.80895338489972e-05, + "loss": 0.8021, + "step": 9451 + }, + { + "epoch": 0.6404227928721458, + "grad_norm": 5.073042869567871, + "learning_rate": 8.808816482989938e-05, + "loss": 0.7241, + "step": 9452 + }, + { + "epoch": 0.6404905481401179, + "grad_norm": 6.402749061584473, + "learning_rate": 8.808679581080157e-05, + "loss": 0.733, + "step": 9453 + }, + { + "epoch": 0.64055830340809, + "grad_norm": 5.919348239898682, + "learning_rate": 8.808542679170375e-05, + "loss": 1.0033, + "step": 9454 + }, + { + "epoch": 0.640626058676062, + "grad_norm": 5.996345520019531, + "learning_rate": 8.808405777260593e-05, + "loss": 0.5867, + "step": 9455 + }, + { + "epoch": 0.6406938139440341, + "grad_norm": 6.826650619506836, + "learning_rate": 8.808268875350811e-05, + "loss": 1.1324, + "step": 9456 + }, + { + "epoch": 0.6407615692120062, + "grad_norm": 5.9293999671936035, + "learning_rate": 8.808131973441029e-05, + "loss": 0.7688, + "step": 9457 + }, + { + "epoch": 0.6408293244799783, + "grad_norm": 7.4283833503723145, + "learning_rate": 8.807995071531249e-05, + "loss": 1.0136, + "step": 9458 + }, + { + "epoch": 0.6408970797479504, + "grad_norm": 6.848118782043457, + "learning_rate": 8.807858169621467e-05, + "loss": 0.8612, + "step": 9459 + }, + { + "epoch": 0.6409648350159225, + "grad_norm": 5.884200096130371, + "learning_rate": 8.807721267711685e-05, + "loss": 0.7734, + "step": 9460 + }, + { + "epoch": 0.6410325902838946, + "grad_norm": 6.456984996795654, + "learning_rate": 8.807584365801903e-05, + "loss": 0.899, + "step": 9461 + }, + { + "epoch": 0.6411003455518667, + "grad_norm": 6.896454334259033, + "learning_rate": 8.807447463892121e-05, + "loss": 0.9756, + "step": 9462 + }, + { + "epoch": 0.6411681008198388, + "grad_norm": 5.393690586090088, + "learning_rate": 8.80731056198234e-05, + "loss": 0.5639, + "step": 9463 + }, + { + "epoch": 0.6412358560878109, + "grad_norm": 5.285966873168945, + "learning_rate": 8.807173660072558e-05, + "loss": 0.6522, + "step": 9464 + }, + { + "epoch": 0.641303611355783, + "grad_norm": 5.970489501953125, + "learning_rate": 8.807036758162776e-05, + "loss": 0.6489, + "step": 9465 + }, + { + "epoch": 0.641371366623755, + "grad_norm": 6.003796100616455, + "learning_rate": 8.806899856252994e-05, + "loss": 0.8582, + "step": 9466 + }, + { + "epoch": 0.641439121891727, + "grad_norm": 5.609601974487305, + "learning_rate": 8.806762954343214e-05, + "loss": 0.6631, + "step": 9467 + }, + { + "epoch": 0.6415068771596991, + "grad_norm": 6.617072582244873, + "learning_rate": 8.806626052433432e-05, + "loss": 0.8057, + "step": 9468 + }, + { + "epoch": 0.6415746324276712, + "grad_norm": 5.329599857330322, + "learning_rate": 8.80648915052365e-05, + "loss": 0.7941, + "step": 9469 + }, + { + "epoch": 0.6416423876956433, + "grad_norm": 6.617512226104736, + "learning_rate": 8.806352248613868e-05, + "loss": 0.7594, + "step": 9470 + }, + { + "epoch": 0.6417101429636154, + "grad_norm": 5.257902145385742, + "learning_rate": 8.806215346704086e-05, + "loss": 0.8737, + "step": 9471 + }, + { + "epoch": 0.6417778982315875, + "grad_norm": 8.781728744506836, + "learning_rate": 8.806078444794305e-05, + "loss": 0.9662, + "step": 9472 + }, + { + "epoch": 0.6418456534995596, + "grad_norm": 5.8004889488220215, + "learning_rate": 8.805941542884523e-05, + "loss": 0.8442, + "step": 9473 + }, + { + "epoch": 0.6419134087675317, + "grad_norm": 7.527797698974609, + "learning_rate": 8.805804640974741e-05, + "loss": 0.8849, + "step": 9474 + }, + { + "epoch": 0.6419811640355038, + "grad_norm": 6.8633952140808105, + "learning_rate": 8.80566773906496e-05, + "loss": 0.9317, + "step": 9475 + }, + { + "epoch": 0.6420489193034759, + "grad_norm": 5.778538703918457, + "learning_rate": 8.805530837155179e-05, + "loss": 0.7787, + "step": 9476 + }, + { + "epoch": 0.6421166745714479, + "grad_norm": 5.698131084442139, + "learning_rate": 8.805393935245397e-05, + "loss": 0.7335, + "step": 9477 + }, + { + "epoch": 0.64218442983942, + "grad_norm": 7.117033958435059, + "learning_rate": 8.805257033335615e-05, + "loss": 0.9524, + "step": 9478 + }, + { + "epoch": 0.6422521851073921, + "grad_norm": 7.394477367401123, + "learning_rate": 8.805120131425834e-05, + "loss": 0.8617, + "step": 9479 + }, + { + "epoch": 0.6423199403753642, + "grad_norm": 6.299468517303467, + "learning_rate": 8.804983229516052e-05, + "loss": 0.7959, + "step": 9480 + }, + { + "epoch": 0.6423876956433363, + "grad_norm": 5.769999027252197, + "learning_rate": 8.80484632760627e-05, + "loss": 0.8217, + "step": 9481 + }, + { + "epoch": 0.6424554509113084, + "grad_norm": 6.29069709777832, + "learning_rate": 8.80470942569649e-05, + "loss": 0.8485, + "step": 9482 + }, + { + "epoch": 0.6425232061792805, + "grad_norm": 6.773715496063232, + "learning_rate": 8.804572523786708e-05, + "loss": 0.9625, + "step": 9483 + }, + { + "epoch": 0.6425909614472525, + "grad_norm": 5.882883071899414, + "learning_rate": 8.804435621876926e-05, + "loss": 0.977, + "step": 9484 + }, + { + "epoch": 0.6426587167152246, + "grad_norm": 5.250417709350586, + "learning_rate": 8.804298719967145e-05, + "loss": 0.844, + "step": 9485 + }, + { + "epoch": 0.6427264719831967, + "grad_norm": 6.4407267570495605, + "learning_rate": 8.804161818057363e-05, + "loss": 0.749, + "step": 9486 + }, + { + "epoch": 0.6427942272511687, + "grad_norm": 5.549936294555664, + "learning_rate": 8.804024916147581e-05, + "loss": 0.6762, + "step": 9487 + }, + { + "epoch": 0.6428619825191408, + "grad_norm": 4.843740463256836, + "learning_rate": 8.803888014237799e-05, + "loss": 0.6318, + "step": 9488 + }, + { + "epoch": 0.6429297377871129, + "grad_norm": 8.922931671142578, + "learning_rate": 8.803751112328017e-05, + "loss": 0.943, + "step": 9489 + }, + { + "epoch": 0.642997493055085, + "grad_norm": 5.983587265014648, + "learning_rate": 8.803614210418237e-05, + "loss": 0.6995, + "step": 9490 + }, + { + "epoch": 0.6430652483230571, + "grad_norm": 10.325361251831055, + "learning_rate": 8.803477308508455e-05, + "loss": 0.6907, + "step": 9491 + }, + { + "epoch": 0.6431330035910292, + "grad_norm": 7.284884452819824, + "learning_rate": 8.803340406598673e-05, + "loss": 0.8517, + "step": 9492 + }, + { + "epoch": 0.6432007588590013, + "grad_norm": 5.540932655334473, + "learning_rate": 8.803203504688891e-05, + "loss": 0.6415, + "step": 9493 + }, + { + "epoch": 0.6432685141269734, + "grad_norm": 8.60664176940918, + "learning_rate": 8.803066602779109e-05, + "loss": 0.8818, + "step": 9494 + }, + { + "epoch": 0.6433362693949455, + "grad_norm": 5.478827476501465, + "learning_rate": 8.802929700869328e-05, + "loss": 0.5589, + "step": 9495 + }, + { + "epoch": 0.6434040246629176, + "grad_norm": 6.415865421295166, + "learning_rate": 8.802792798959546e-05, + "loss": 0.748, + "step": 9496 + }, + { + "epoch": 0.6434717799308897, + "grad_norm": 7.106724262237549, + "learning_rate": 8.802655897049764e-05, + "loss": 1.0591, + "step": 9497 + }, + { + "epoch": 0.6435395351988618, + "grad_norm": 7.584676742553711, + "learning_rate": 8.802518995139982e-05, + "loss": 0.924, + "step": 9498 + }, + { + "epoch": 0.6436072904668338, + "grad_norm": 7.438377380371094, + "learning_rate": 8.802382093230202e-05, + "loss": 1.0451, + "step": 9499 + }, + { + "epoch": 0.6436750457348058, + "grad_norm": 6.775513648986816, + "learning_rate": 8.80224519132042e-05, + "loss": 0.8273, + "step": 9500 + }, + { + "epoch": 0.6437428010027779, + "grad_norm": 8.242133140563965, + "learning_rate": 8.802108289410638e-05, + "loss": 1.0045, + "step": 9501 + }, + { + "epoch": 0.64381055627075, + "grad_norm": 6.122509956359863, + "learning_rate": 8.801971387500856e-05, + "loss": 0.8026, + "step": 9502 + }, + { + "epoch": 0.6438783115387221, + "grad_norm": 5.884267330169678, + "learning_rate": 8.801834485591074e-05, + "loss": 0.7935, + "step": 9503 + }, + { + "epoch": 0.6439460668066942, + "grad_norm": 5.653980731964111, + "learning_rate": 8.801697583681293e-05, + "loss": 0.7223, + "step": 9504 + }, + { + "epoch": 0.6440138220746663, + "grad_norm": 9.43459701538086, + "learning_rate": 8.801560681771511e-05, + "loss": 0.7211, + "step": 9505 + }, + { + "epoch": 0.6440815773426384, + "grad_norm": 6.32805061340332, + "learning_rate": 8.801423779861729e-05, + "loss": 0.5007, + "step": 9506 + }, + { + "epoch": 0.6441493326106105, + "grad_norm": 5.424881458282471, + "learning_rate": 8.801286877951947e-05, + "loss": 0.8556, + "step": 9507 + }, + { + "epoch": 0.6442170878785826, + "grad_norm": 6.115161418914795, + "learning_rate": 8.801149976042167e-05, + "loss": 1.068, + "step": 9508 + }, + { + "epoch": 0.6442848431465547, + "grad_norm": 6.05783748626709, + "learning_rate": 8.801013074132385e-05, + "loss": 0.7372, + "step": 9509 + }, + { + "epoch": 0.6443525984145267, + "grad_norm": 5.965503692626953, + "learning_rate": 8.800876172222603e-05, + "loss": 0.7439, + "step": 9510 + }, + { + "epoch": 0.6444203536824988, + "grad_norm": 6.0100483894348145, + "learning_rate": 8.800739270312821e-05, + "loss": 0.8335, + "step": 9511 + }, + { + "epoch": 0.6444881089504709, + "grad_norm": 4.494331359863281, + "learning_rate": 8.800602368403039e-05, + "loss": 0.7585, + "step": 9512 + }, + { + "epoch": 0.644555864218443, + "grad_norm": 6.9478631019592285, + "learning_rate": 8.800465466493258e-05, + "loss": 0.9518, + "step": 9513 + }, + { + "epoch": 0.6446236194864151, + "grad_norm": 6.849409103393555, + "learning_rate": 8.800328564583476e-05, + "loss": 0.9584, + "step": 9514 + }, + { + "epoch": 0.6446913747543872, + "grad_norm": 5.837069034576416, + "learning_rate": 8.800191662673694e-05, + "loss": 0.7976, + "step": 9515 + }, + { + "epoch": 0.6447591300223592, + "grad_norm": 8.214330673217773, + "learning_rate": 8.800054760763912e-05, + "loss": 0.6255, + "step": 9516 + }, + { + "epoch": 0.6448268852903313, + "grad_norm": 5.174075126647949, + "learning_rate": 8.79991785885413e-05, + "loss": 0.5772, + "step": 9517 + }, + { + "epoch": 0.6448946405583034, + "grad_norm": 5.130014896392822, + "learning_rate": 8.79978095694435e-05, + "loss": 0.8923, + "step": 9518 + }, + { + "epoch": 0.6449623958262755, + "grad_norm": 6.434813499450684, + "learning_rate": 8.799644055034568e-05, + "loss": 0.9681, + "step": 9519 + }, + { + "epoch": 0.6450301510942476, + "grad_norm": 6.135867118835449, + "learning_rate": 8.799507153124786e-05, + "loss": 0.7337, + "step": 9520 + }, + { + "epoch": 0.6450979063622196, + "grad_norm": 5.493929386138916, + "learning_rate": 8.799370251215004e-05, + "loss": 0.6834, + "step": 9521 + }, + { + "epoch": 0.6451656616301917, + "grad_norm": 7.558812618255615, + "learning_rate": 8.799233349305223e-05, + "loss": 1.0415, + "step": 9522 + }, + { + "epoch": 0.6452334168981638, + "grad_norm": 6.610668659210205, + "learning_rate": 8.799096447395441e-05, + "loss": 0.6631, + "step": 9523 + }, + { + "epoch": 0.6453011721661359, + "grad_norm": 6.513849258422852, + "learning_rate": 8.798959545485659e-05, + "loss": 0.7852, + "step": 9524 + }, + { + "epoch": 0.645368927434108, + "grad_norm": 5.489187717437744, + "learning_rate": 8.798822643575879e-05, + "loss": 0.8824, + "step": 9525 + }, + { + "epoch": 0.6454366827020801, + "grad_norm": 6.717770576477051, + "learning_rate": 8.798685741666097e-05, + "loss": 0.8239, + "step": 9526 + }, + { + "epoch": 0.6455044379700522, + "grad_norm": 5.649533271789551, + "learning_rate": 8.798548839756315e-05, + "loss": 0.7458, + "step": 9527 + }, + { + "epoch": 0.6455721932380243, + "grad_norm": 4.854784965515137, + "learning_rate": 8.798411937846534e-05, + "loss": 0.6289, + "step": 9528 + }, + { + "epoch": 0.6456399485059964, + "grad_norm": 7.124615669250488, + "learning_rate": 8.798275035936752e-05, + "loss": 0.8035, + "step": 9529 + }, + { + "epoch": 0.6457077037739685, + "grad_norm": 7.39643669128418, + "learning_rate": 8.79813813402697e-05, + "loss": 0.776, + "step": 9530 + }, + { + "epoch": 0.6457754590419406, + "grad_norm": 5.295919418334961, + "learning_rate": 8.79800123211719e-05, + "loss": 0.8542, + "step": 9531 + }, + { + "epoch": 0.6458432143099126, + "grad_norm": 6.795876502990723, + "learning_rate": 8.797864330207408e-05, + "loss": 0.8777, + "step": 9532 + }, + { + "epoch": 0.6459109695778846, + "grad_norm": 6.2083001136779785, + "learning_rate": 8.797727428297626e-05, + "loss": 1.0176, + "step": 9533 + }, + { + "epoch": 0.6459787248458567, + "grad_norm": 7.269214153289795, + "learning_rate": 8.797590526387844e-05, + "loss": 1.0724, + "step": 9534 + }, + { + "epoch": 0.6460464801138288, + "grad_norm": 5.762153625488281, + "learning_rate": 8.797453624478062e-05, + "loss": 0.7247, + "step": 9535 + }, + { + "epoch": 0.6461142353818009, + "grad_norm": 5.321116924285889, + "learning_rate": 8.797316722568281e-05, + "loss": 0.6601, + "step": 9536 + }, + { + "epoch": 0.646181990649773, + "grad_norm": 5.3908586502075195, + "learning_rate": 8.797179820658499e-05, + "loss": 0.5765, + "step": 9537 + }, + { + "epoch": 0.6462497459177451, + "grad_norm": 6.827112197875977, + "learning_rate": 8.797042918748717e-05, + "loss": 0.8915, + "step": 9538 + }, + { + "epoch": 0.6463175011857172, + "grad_norm": 6.380746841430664, + "learning_rate": 8.796906016838935e-05, + "loss": 0.8016, + "step": 9539 + }, + { + "epoch": 0.6463852564536893, + "grad_norm": 6.202441215515137, + "learning_rate": 8.796769114929155e-05, + "loss": 0.8524, + "step": 9540 + }, + { + "epoch": 0.6464530117216614, + "grad_norm": 7.0272955894470215, + "learning_rate": 8.796632213019373e-05, + "loss": 0.7491, + "step": 9541 + }, + { + "epoch": 0.6465207669896335, + "grad_norm": 5.641077995300293, + "learning_rate": 8.79649531110959e-05, + "loss": 0.6814, + "step": 9542 + }, + { + "epoch": 0.6465885222576055, + "grad_norm": 5.427177429199219, + "learning_rate": 8.796358409199809e-05, + "loss": 0.6764, + "step": 9543 + }, + { + "epoch": 0.6466562775255776, + "grad_norm": 5.034342288970947, + "learning_rate": 8.796221507290027e-05, + "loss": 0.7099, + "step": 9544 + }, + { + "epoch": 0.6467240327935497, + "grad_norm": 7.89331579208374, + "learning_rate": 8.796084605380246e-05, + "loss": 0.7996, + "step": 9545 + }, + { + "epoch": 0.6467917880615218, + "grad_norm": 7.4129767417907715, + "learning_rate": 8.795947703470464e-05, + "loss": 0.6799, + "step": 9546 + }, + { + "epoch": 0.6468595433294939, + "grad_norm": 5.55999755859375, + "learning_rate": 8.795810801560682e-05, + "loss": 0.858, + "step": 9547 + }, + { + "epoch": 0.646927298597466, + "grad_norm": 5.925971031188965, + "learning_rate": 8.7956738996509e-05, + "loss": 0.7046, + "step": 9548 + }, + { + "epoch": 0.646995053865438, + "grad_norm": 5.659457206726074, + "learning_rate": 8.795536997741118e-05, + "loss": 0.8139, + "step": 9549 + }, + { + "epoch": 0.6470628091334101, + "grad_norm": 6.554767608642578, + "learning_rate": 8.795400095831338e-05, + "loss": 1.0621, + "step": 9550 + }, + { + "epoch": 0.6471305644013822, + "grad_norm": 5.473086357116699, + "learning_rate": 8.795263193921556e-05, + "loss": 0.8589, + "step": 9551 + }, + { + "epoch": 0.6471983196693543, + "grad_norm": 5.166335582733154, + "learning_rate": 8.795126292011774e-05, + "loss": 0.7418, + "step": 9552 + }, + { + "epoch": 0.6472660749373264, + "grad_norm": 10.33619213104248, + "learning_rate": 8.794989390101992e-05, + "loss": 0.7379, + "step": 9553 + }, + { + "epoch": 0.6473338302052984, + "grad_norm": 8.167377471923828, + "learning_rate": 8.794852488192211e-05, + "loss": 1.0573, + "step": 9554 + }, + { + "epoch": 0.6474015854732705, + "grad_norm": 9.861980438232422, + "learning_rate": 8.794715586282429e-05, + "loss": 0.9429, + "step": 9555 + }, + { + "epoch": 0.6474693407412426, + "grad_norm": 6.154029369354248, + "learning_rate": 8.794578684372647e-05, + "loss": 0.9017, + "step": 9556 + }, + { + "epoch": 0.6475370960092147, + "grad_norm": 5.818716526031494, + "learning_rate": 8.794441782462865e-05, + "loss": 0.8584, + "step": 9557 + }, + { + "epoch": 0.6476048512771868, + "grad_norm": 7.98982048034668, + "learning_rate": 8.794304880553083e-05, + "loss": 0.8423, + "step": 9558 + }, + { + "epoch": 0.6476726065451589, + "grad_norm": 6.663909435272217, + "learning_rate": 8.794167978643303e-05, + "loss": 1.0512, + "step": 9559 + }, + { + "epoch": 0.647740361813131, + "grad_norm": 4.941400051116943, + "learning_rate": 8.794031076733521e-05, + "loss": 0.6979, + "step": 9560 + }, + { + "epoch": 0.6478081170811031, + "grad_norm": 6.668032169342041, + "learning_rate": 8.793894174823739e-05, + "loss": 0.9286, + "step": 9561 + }, + { + "epoch": 0.6478758723490752, + "grad_norm": 6.0007100105285645, + "learning_rate": 8.793757272913957e-05, + "loss": 1.0924, + "step": 9562 + }, + { + "epoch": 0.6479436276170473, + "grad_norm": 5.80806303024292, + "learning_rate": 8.793620371004176e-05, + "loss": 0.8201, + "step": 9563 + }, + { + "epoch": 0.6480113828850194, + "grad_norm": 5.381270408630371, + "learning_rate": 8.793483469094394e-05, + "loss": 0.813, + "step": 9564 + }, + { + "epoch": 0.6480791381529913, + "grad_norm": 6.4966721534729, + "learning_rate": 8.793346567184612e-05, + "loss": 1.0333, + "step": 9565 + }, + { + "epoch": 0.6481468934209634, + "grad_norm": 6.251739978790283, + "learning_rate": 8.79320966527483e-05, + "loss": 0.8608, + "step": 9566 + }, + { + "epoch": 0.6482146486889355, + "grad_norm": 6.237257957458496, + "learning_rate": 8.793072763365048e-05, + "loss": 0.713, + "step": 9567 + }, + { + "epoch": 0.6482824039569076, + "grad_norm": 5.686729907989502, + "learning_rate": 8.792935861455268e-05, + "loss": 0.7128, + "step": 9568 + }, + { + "epoch": 0.6483501592248797, + "grad_norm": 7.520707130432129, + "learning_rate": 8.792798959545486e-05, + "loss": 0.8688, + "step": 9569 + }, + { + "epoch": 0.6484179144928518, + "grad_norm": 4.83900785446167, + "learning_rate": 8.792662057635704e-05, + "loss": 0.5782, + "step": 9570 + }, + { + "epoch": 0.6484856697608239, + "grad_norm": 8.14783763885498, + "learning_rate": 8.792525155725923e-05, + "loss": 0.5705, + "step": 9571 + }, + { + "epoch": 0.648553425028796, + "grad_norm": 5.105820655822754, + "learning_rate": 8.792388253816141e-05, + "loss": 0.7871, + "step": 9572 + }, + { + "epoch": 0.6486211802967681, + "grad_norm": 5.665055274963379, + "learning_rate": 8.792251351906359e-05, + "loss": 0.8448, + "step": 9573 + }, + { + "epoch": 0.6486889355647402, + "grad_norm": 7.109708309173584, + "learning_rate": 8.792114449996579e-05, + "loss": 0.6738, + "step": 9574 + }, + { + "epoch": 0.6487566908327123, + "grad_norm": 6.253261089324951, + "learning_rate": 8.791977548086797e-05, + "loss": 0.6266, + "step": 9575 + }, + { + "epoch": 0.6488244461006843, + "grad_norm": 7.64295768737793, + "learning_rate": 8.791840646177015e-05, + "loss": 1.0729, + "step": 9576 + }, + { + "epoch": 0.6488922013686564, + "grad_norm": 4.579455375671387, + "learning_rate": 8.791703744267234e-05, + "loss": 0.7472, + "step": 9577 + }, + { + "epoch": 0.6489599566366285, + "grad_norm": 6.2152791023254395, + "learning_rate": 8.791566842357452e-05, + "loss": 0.8086, + "step": 9578 + }, + { + "epoch": 0.6490277119046006, + "grad_norm": 6.039999008178711, + "learning_rate": 8.79142994044767e-05, + "loss": 0.6148, + "step": 9579 + }, + { + "epoch": 0.6490954671725727, + "grad_norm": 6.458342552185059, + "learning_rate": 8.791293038537888e-05, + "loss": 0.9861, + "step": 9580 + }, + { + "epoch": 0.6491632224405448, + "grad_norm": 7.025355815887451, + "learning_rate": 8.791156136628106e-05, + "loss": 0.9073, + "step": 9581 + }, + { + "epoch": 0.6492309777085168, + "grad_norm": 5.3977484703063965, + "learning_rate": 8.791019234718326e-05, + "loss": 0.7138, + "step": 9582 + }, + { + "epoch": 0.6492987329764889, + "grad_norm": 5.696099281311035, + "learning_rate": 8.790882332808544e-05, + "loss": 0.952, + "step": 9583 + }, + { + "epoch": 0.649366488244461, + "grad_norm": 5.6307501792907715, + "learning_rate": 8.790745430898762e-05, + "loss": 0.8517, + "step": 9584 + }, + { + "epoch": 0.6494342435124331, + "grad_norm": 8.366070747375488, + "learning_rate": 8.79060852898898e-05, + "loss": 0.9537, + "step": 9585 + }, + { + "epoch": 0.6495019987804052, + "grad_norm": 5.901167392730713, + "learning_rate": 8.790471627079199e-05, + "loss": 0.8325, + "step": 9586 + }, + { + "epoch": 0.6495697540483772, + "grad_norm": 11.905413627624512, + "learning_rate": 8.790334725169417e-05, + "loss": 0.8988, + "step": 9587 + }, + { + "epoch": 0.6496375093163493, + "grad_norm": 7.553168773651123, + "learning_rate": 8.790197823259635e-05, + "loss": 0.8318, + "step": 9588 + }, + { + "epoch": 0.6497052645843214, + "grad_norm": 6.661622047424316, + "learning_rate": 8.790060921349853e-05, + "loss": 0.7276, + "step": 9589 + }, + { + "epoch": 0.6497730198522935, + "grad_norm": 5.579341411590576, + "learning_rate": 8.789924019440071e-05, + "loss": 0.8388, + "step": 9590 + }, + { + "epoch": 0.6498407751202656, + "grad_norm": 6.08758020401001, + "learning_rate": 8.78978711753029e-05, + "loss": 0.9075, + "step": 9591 + }, + { + "epoch": 0.6499085303882377, + "grad_norm": 7.589080810546875, + "learning_rate": 8.789650215620509e-05, + "loss": 0.7852, + "step": 9592 + }, + { + "epoch": 0.6499762856562098, + "grad_norm": 6.479293346405029, + "learning_rate": 8.789513313710727e-05, + "loss": 0.768, + "step": 9593 + }, + { + "epoch": 0.6500440409241819, + "grad_norm": 5.796462535858154, + "learning_rate": 8.789376411800945e-05, + "loss": 0.7431, + "step": 9594 + }, + { + "epoch": 0.650111796192154, + "grad_norm": 5.195106506347656, + "learning_rate": 8.789239509891163e-05, + "loss": 0.9473, + "step": 9595 + }, + { + "epoch": 0.6501795514601261, + "grad_norm": 5.687699317932129, + "learning_rate": 8.789102607981382e-05, + "loss": 0.8332, + "step": 9596 + }, + { + "epoch": 0.6502473067280982, + "grad_norm": 5.329159259796143, + "learning_rate": 8.7889657060716e-05, + "loss": 0.8068, + "step": 9597 + }, + { + "epoch": 0.6503150619960701, + "grad_norm": 7.383645057678223, + "learning_rate": 8.788828804161818e-05, + "loss": 0.9984, + "step": 9598 + }, + { + "epoch": 0.6503828172640422, + "grad_norm": 7.364968776702881, + "learning_rate": 8.788691902252036e-05, + "loss": 0.7574, + "step": 9599 + }, + { + "epoch": 0.6504505725320143, + "grad_norm": 6.2040886878967285, + "learning_rate": 8.788555000342256e-05, + "loss": 0.6552, + "step": 9600 + }, + { + "epoch": 0.6505183277999864, + "grad_norm": 10.075439453125, + "learning_rate": 8.788418098432474e-05, + "loss": 0.659, + "step": 9601 + }, + { + "epoch": 0.6505860830679585, + "grad_norm": 6.742908954620361, + "learning_rate": 8.788281196522692e-05, + "loss": 0.6485, + "step": 9602 + }, + { + "epoch": 0.6506538383359306, + "grad_norm": 6.678378582000732, + "learning_rate": 8.78814429461291e-05, + "loss": 0.7113, + "step": 9603 + }, + { + "epoch": 0.6507215936039027, + "grad_norm": 5.988241195678711, + "learning_rate": 8.788007392703128e-05, + "loss": 0.8417, + "step": 9604 + }, + { + "epoch": 0.6507893488718748, + "grad_norm": 6.3953704833984375, + "learning_rate": 8.787870490793347e-05, + "loss": 0.8448, + "step": 9605 + }, + { + "epoch": 0.6508571041398469, + "grad_norm": 11.006570816040039, + "learning_rate": 8.787733588883565e-05, + "loss": 0.9258, + "step": 9606 + }, + { + "epoch": 0.650924859407819, + "grad_norm": 7.086367607116699, + "learning_rate": 8.787596686973783e-05, + "loss": 0.86, + "step": 9607 + }, + { + "epoch": 0.6509926146757911, + "grad_norm": 5.534137725830078, + "learning_rate": 8.787459785064001e-05, + "loss": 0.7984, + "step": 9608 + }, + { + "epoch": 0.6510603699437632, + "grad_norm": 6.710955619812012, + "learning_rate": 8.78732288315422e-05, + "loss": 0.9852, + "step": 9609 + }, + { + "epoch": 0.6511281252117352, + "grad_norm": 7.151968479156494, + "learning_rate": 8.787185981244439e-05, + "loss": 0.7383, + "step": 9610 + }, + { + "epoch": 0.6511958804797073, + "grad_norm": 6.063338279724121, + "learning_rate": 8.787049079334657e-05, + "loss": 0.8971, + "step": 9611 + }, + { + "epoch": 0.6512636357476794, + "grad_norm": 7.058009147644043, + "learning_rate": 8.786912177424875e-05, + "loss": 0.9314, + "step": 9612 + }, + { + "epoch": 0.6513313910156515, + "grad_norm": 8.007342338562012, + "learning_rate": 8.786775275515093e-05, + "loss": 0.6878, + "step": 9613 + }, + { + "epoch": 0.6513991462836235, + "grad_norm": 6.176419734954834, + "learning_rate": 8.786638373605312e-05, + "loss": 0.8863, + "step": 9614 + }, + { + "epoch": 0.6514669015515956, + "grad_norm": 5.785898685455322, + "learning_rate": 8.78650147169553e-05, + "loss": 1.0512, + "step": 9615 + }, + { + "epoch": 0.6515346568195677, + "grad_norm": 5.100857734680176, + "learning_rate": 8.786364569785748e-05, + "loss": 0.6969, + "step": 9616 + }, + { + "epoch": 0.6516024120875398, + "grad_norm": 6.292816638946533, + "learning_rate": 8.786227667875968e-05, + "loss": 0.8185, + "step": 9617 + }, + { + "epoch": 0.6516701673555119, + "grad_norm": 6.242623805999756, + "learning_rate": 8.786090765966186e-05, + "loss": 0.6833, + "step": 9618 + }, + { + "epoch": 0.651737922623484, + "grad_norm": 7.2665791511535645, + "learning_rate": 8.785953864056404e-05, + "loss": 1.0487, + "step": 9619 + }, + { + "epoch": 0.651805677891456, + "grad_norm": 5.973618507385254, + "learning_rate": 8.785816962146623e-05, + "loss": 0.8992, + "step": 9620 + }, + { + "epoch": 0.6518734331594281, + "grad_norm": 8.300751686096191, + "learning_rate": 8.785680060236841e-05, + "loss": 0.9334, + "step": 9621 + }, + { + "epoch": 0.6519411884274002, + "grad_norm": 5.687087535858154, + "learning_rate": 8.785543158327059e-05, + "loss": 0.8131, + "step": 9622 + }, + { + "epoch": 0.6520089436953723, + "grad_norm": 5.419602870941162, + "learning_rate": 8.785406256417279e-05, + "loss": 0.6697, + "step": 9623 + }, + { + "epoch": 0.6520766989633444, + "grad_norm": 7.112249851226807, + "learning_rate": 8.785269354507497e-05, + "loss": 0.8477, + "step": 9624 + }, + { + "epoch": 0.6521444542313165, + "grad_norm": 5.755582809448242, + "learning_rate": 8.785132452597715e-05, + "loss": 0.6766, + "step": 9625 + }, + { + "epoch": 0.6522122094992886, + "grad_norm": 5.415201187133789, + "learning_rate": 8.784995550687933e-05, + "loss": 0.6871, + "step": 9626 + }, + { + "epoch": 0.6522799647672607, + "grad_norm": 5.576985836029053, + "learning_rate": 8.78485864877815e-05, + "loss": 0.9001, + "step": 9627 + }, + { + "epoch": 0.6523477200352328, + "grad_norm": 5.954935073852539, + "learning_rate": 8.78472174686837e-05, + "loss": 0.6694, + "step": 9628 + }, + { + "epoch": 0.6524154753032049, + "grad_norm": 5.59356164932251, + "learning_rate": 8.784584844958588e-05, + "loss": 0.74, + "step": 9629 + }, + { + "epoch": 0.652483230571177, + "grad_norm": 5.1607346534729, + "learning_rate": 8.784447943048806e-05, + "loss": 0.7869, + "step": 9630 + }, + { + "epoch": 0.652550985839149, + "grad_norm": 6.959099769592285, + "learning_rate": 8.784311041139024e-05, + "loss": 0.8096, + "step": 9631 + }, + { + "epoch": 0.652618741107121, + "grad_norm": 6.869060516357422, + "learning_rate": 8.784174139229244e-05, + "loss": 0.9917, + "step": 9632 + }, + { + "epoch": 0.6526864963750931, + "grad_norm": 6.426820278167725, + "learning_rate": 8.784037237319462e-05, + "loss": 0.718, + "step": 9633 + }, + { + "epoch": 0.6527542516430652, + "grad_norm": 5.208813667297363, + "learning_rate": 8.78390033540968e-05, + "loss": 0.651, + "step": 9634 + }, + { + "epoch": 0.6528220069110373, + "grad_norm": 6.748283863067627, + "learning_rate": 8.783763433499898e-05, + "loss": 1.0808, + "step": 9635 + }, + { + "epoch": 0.6528897621790094, + "grad_norm": 6.567190647125244, + "learning_rate": 8.783626531590116e-05, + "loss": 0.9373, + "step": 9636 + }, + { + "epoch": 0.6529575174469815, + "grad_norm": 5.621391773223877, + "learning_rate": 8.783489629680335e-05, + "loss": 0.9382, + "step": 9637 + }, + { + "epoch": 0.6530252727149536, + "grad_norm": 7.807063102722168, + "learning_rate": 8.783352727770553e-05, + "loss": 0.8237, + "step": 9638 + }, + { + "epoch": 0.6530930279829257, + "grad_norm": 6.320357322692871, + "learning_rate": 8.783215825860771e-05, + "loss": 0.8245, + "step": 9639 + }, + { + "epoch": 0.6531607832508978, + "grad_norm": 6.225609302520752, + "learning_rate": 8.783078923950989e-05, + "loss": 0.932, + "step": 9640 + }, + { + "epoch": 0.6532285385188699, + "grad_norm": 5.9908366203308105, + "learning_rate": 8.782942022041209e-05, + "loss": 0.5793, + "step": 9641 + }, + { + "epoch": 0.653296293786842, + "grad_norm": 8.215045928955078, + "learning_rate": 8.782805120131427e-05, + "loss": 0.7771, + "step": 9642 + }, + { + "epoch": 0.653364049054814, + "grad_norm": 5.830068588256836, + "learning_rate": 8.782668218221645e-05, + "loss": 0.8499, + "step": 9643 + }, + { + "epoch": 0.6534318043227861, + "grad_norm": 5.979434013366699, + "learning_rate": 8.782531316311863e-05, + "loss": 0.7335, + "step": 9644 + }, + { + "epoch": 0.6534995595907582, + "grad_norm": 6.369264125823975, + "learning_rate": 8.782394414402081e-05, + "loss": 0.994, + "step": 9645 + }, + { + "epoch": 0.6535673148587303, + "grad_norm": 6.8189287185668945, + "learning_rate": 8.7822575124923e-05, + "loss": 0.7789, + "step": 9646 + }, + { + "epoch": 0.6536350701267023, + "grad_norm": 5.990640163421631, + "learning_rate": 8.782120610582518e-05, + "loss": 0.8369, + "step": 9647 + }, + { + "epoch": 0.6537028253946744, + "grad_norm": 6.319588661193848, + "learning_rate": 8.781983708672736e-05, + "loss": 0.9394, + "step": 9648 + }, + { + "epoch": 0.6537705806626465, + "grad_norm": 5.965325355529785, + "learning_rate": 8.781846806762954e-05, + "loss": 1.0256, + "step": 9649 + }, + { + "epoch": 0.6538383359306186, + "grad_norm": 6.193709373474121, + "learning_rate": 8.781709904853172e-05, + "loss": 0.7261, + "step": 9650 + }, + { + "epoch": 0.6539060911985907, + "grad_norm": 6.463015079498291, + "learning_rate": 8.781573002943392e-05, + "loss": 0.6503, + "step": 9651 + }, + { + "epoch": 0.6539738464665628, + "grad_norm": 5.9288554191589355, + "learning_rate": 8.78143610103361e-05, + "loss": 0.7345, + "step": 9652 + }, + { + "epoch": 0.6540416017345348, + "grad_norm": 6.894443035125732, + "learning_rate": 8.781299199123828e-05, + "loss": 0.7115, + "step": 9653 + }, + { + "epoch": 0.6541093570025069, + "grad_norm": 5.954974174499512, + "learning_rate": 8.781162297214046e-05, + "loss": 0.8184, + "step": 9654 + }, + { + "epoch": 0.654177112270479, + "grad_norm": 7.200540065765381, + "learning_rate": 8.781025395304265e-05, + "loss": 0.7661, + "step": 9655 + }, + { + "epoch": 0.6542448675384511, + "grad_norm": 5.560801982879639, + "learning_rate": 8.780888493394483e-05, + "loss": 0.6097, + "step": 9656 + }, + { + "epoch": 0.6543126228064232, + "grad_norm": 9.168293952941895, + "learning_rate": 8.780751591484701e-05, + "loss": 1.0032, + "step": 9657 + }, + { + "epoch": 0.6543803780743953, + "grad_norm": 6.599915027618408, + "learning_rate": 8.780614689574919e-05, + "loss": 0.9359, + "step": 9658 + }, + { + "epoch": 0.6544481333423674, + "grad_norm": 6.528720855712891, + "learning_rate": 8.780477787665137e-05, + "loss": 0.8039, + "step": 9659 + }, + { + "epoch": 0.6545158886103395, + "grad_norm": 7.300596714019775, + "learning_rate": 8.780340885755357e-05, + "loss": 0.823, + "step": 9660 + }, + { + "epoch": 0.6545836438783116, + "grad_norm": 6.100801467895508, + "learning_rate": 8.780203983845575e-05, + "loss": 0.7662, + "step": 9661 + }, + { + "epoch": 0.6546513991462837, + "grad_norm": 6.72039270401001, + "learning_rate": 8.780067081935793e-05, + "loss": 0.8888, + "step": 9662 + }, + { + "epoch": 0.6547191544142557, + "grad_norm": 5.6597065925598145, + "learning_rate": 8.779930180026012e-05, + "loss": 0.7939, + "step": 9663 + }, + { + "epoch": 0.6547869096822277, + "grad_norm": 5.955264568328857, + "learning_rate": 8.77979327811623e-05, + "loss": 0.8051, + "step": 9664 + }, + { + "epoch": 0.6548546649501998, + "grad_norm": 6.064132213592529, + "learning_rate": 8.779656376206448e-05, + "loss": 0.6377, + "step": 9665 + }, + { + "epoch": 0.6549224202181719, + "grad_norm": 6.713923931121826, + "learning_rate": 8.779519474296668e-05, + "loss": 0.9388, + "step": 9666 + }, + { + "epoch": 0.654990175486144, + "grad_norm": 8.160643577575684, + "learning_rate": 8.779382572386886e-05, + "loss": 1.0358, + "step": 9667 + }, + { + "epoch": 0.6550579307541161, + "grad_norm": 7.116758346557617, + "learning_rate": 8.779245670477104e-05, + "loss": 0.9486, + "step": 9668 + }, + { + "epoch": 0.6551256860220882, + "grad_norm": 9.230103492736816, + "learning_rate": 8.779108768567323e-05, + "loss": 0.5998, + "step": 9669 + }, + { + "epoch": 0.6551934412900603, + "grad_norm": 7.513571739196777, + "learning_rate": 8.778971866657541e-05, + "loss": 1.0098, + "step": 9670 + }, + { + "epoch": 0.6552611965580324, + "grad_norm": 6.080422878265381, + "learning_rate": 8.778834964747759e-05, + "loss": 0.7787, + "step": 9671 + }, + { + "epoch": 0.6553289518260045, + "grad_norm": 6.582731246948242, + "learning_rate": 8.778698062837977e-05, + "loss": 0.9038, + "step": 9672 + }, + { + "epoch": 0.6553967070939766, + "grad_norm": 6.8090620040893555, + "learning_rate": 8.778561160928196e-05, + "loss": 0.7586, + "step": 9673 + }, + { + "epoch": 0.6554644623619487, + "grad_norm": 8.279650688171387, + "learning_rate": 8.778424259018415e-05, + "loss": 0.8105, + "step": 9674 + }, + { + "epoch": 0.6555322176299208, + "grad_norm": 5.152581691741943, + "learning_rate": 8.778287357108633e-05, + "loss": 0.7084, + "step": 9675 + }, + { + "epoch": 0.6555999728978928, + "grad_norm": 6.42199182510376, + "learning_rate": 8.77815045519885e-05, + "loss": 0.7351, + "step": 9676 + }, + { + "epoch": 0.6556677281658649, + "grad_norm": 6.543702125549316, + "learning_rate": 8.778013553289069e-05, + "loss": 0.8382, + "step": 9677 + }, + { + "epoch": 0.655735483433837, + "grad_norm": 4.9357171058654785, + "learning_rate": 8.777876651379288e-05, + "loss": 0.8202, + "step": 9678 + }, + { + "epoch": 0.655803238701809, + "grad_norm": 5.525670051574707, + "learning_rate": 8.777739749469506e-05, + "loss": 0.7993, + "step": 9679 + }, + { + "epoch": 0.6558709939697811, + "grad_norm": 6.35006046295166, + "learning_rate": 8.777602847559724e-05, + "loss": 0.925, + "step": 9680 + }, + { + "epoch": 0.6559387492377532, + "grad_norm": 6.477513790130615, + "learning_rate": 8.777465945649942e-05, + "loss": 0.8741, + "step": 9681 + }, + { + "epoch": 0.6560065045057253, + "grad_norm": 6.017436981201172, + "learning_rate": 8.77732904374016e-05, + "loss": 0.8689, + "step": 9682 + }, + { + "epoch": 0.6560742597736974, + "grad_norm": 5.3170599937438965, + "learning_rate": 8.77719214183038e-05, + "loss": 0.6794, + "step": 9683 + }, + { + "epoch": 0.6561420150416695, + "grad_norm": 7.529482841491699, + "learning_rate": 8.777055239920598e-05, + "loss": 1.0313, + "step": 9684 + }, + { + "epoch": 0.6562097703096416, + "grad_norm": 5.189818859100342, + "learning_rate": 8.776918338010816e-05, + "loss": 0.7463, + "step": 9685 + }, + { + "epoch": 0.6562775255776137, + "grad_norm": 6.473649024963379, + "learning_rate": 8.776781436101034e-05, + "loss": 0.9591, + "step": 9686 + }, + { + "epoch": 0.6563452808455857, + "grad_norm": 7.400620937347412, + "learning_rate": 8.776644534191253e-05, + "loss": 0.8153, + "step": 9687 + }, + { + "epoch": 0.6564130361135578, + "grad_norm": 5.564664840698242, + "learning_rate": 8.776507632281471e-05, + "loss": 0.8807, + "step": 9688 + }, + { + "epoch": 0.6564807913815299, + "grad_norm": 6.968201160430908, + "learning_rate": 8.776370730371689e-05, + "loss": 0.8327, + "step": 9689 + }, + { + "epoch": 0.656548546649502, + "grad_norm": 8.013174057006836, + "learning_rate": 8.776233828461907e-05, + "loss": 0.8123, + "step": 9690 + }, + { + "epoch": 0.6566163019174741, + "grad_norm": 6.545875072479248, + "learning_rate": 8.776096926552125e-05, + "loss": 0.7223, + "step": 9691 + }, + { + "epoch": 0.6566840571854462, + "grad_norm": 6.459381580352783, + "learning_rate": 8.775960024642345e-05, + "loss": 1.0484, + "step": 9692 + }, + { + "epoch": 0.6567518124534183, + "grad_norm": 6.893752098083496, + "learning_rate": 8.775823122732563e-05, + "loss": 0.841, + "step": 9693 + }, + { + "epoch": 0.6568195677213904, + "grad_norm": 6.36741828918457, + "learning_rate": 8.77568622082278e-05, + "loss": 0.7719, + "step": 9694 + }, + { + "epoch": 0.6568873229893625, + "grad_norm": 6.700324058532715, + "learning_rate": 8.775549318912999e-05, + "loss": 0.8612, + "step": 9695 + }, + { + "epoch": 0.6569550782573345, + "grad_norm": 5.423570156097412, + "learning_rate": 8.775412417003218e-05, + "loss": 0.7947, + "step": 9696 + }, + { + "epoch": 0.6570228335253065, + "grad_norm": 6.15387487411499, + "learning_rate": 8.775275515093436e-05, + "loss": 0.7978, + "step": 9697 + }, + { + "epoch": 0.6570905887932786, + "grad_norm": 6.407439708709717, + "learning_rate": 8.775138613183654e-05, + "loss": 0.8016, + "step": 9698 + }, + { + "epoch": 0.6571583440612507, + "grad_norm": 5.311725616455078, + "learning_rate": 8.775001711273872e-05, + "loss": 0.564, + "step": 9699 + }, + { + "epoch": 0.6572260993292228, + "grad_norm": 8.463066101074219, + "learning_rate": 8.77486480936409e-05, + "loss": 0.9351, + "step": 9700 + }, + { + "epoch": 0.6572938545971949, + "grad_norm": 6.503357410430908, + "learning_rate": 8.77472790745431e-05, + "loss": 0.8545, + "step": 9701 + }, + { + "epoch": 0.657361609865167, + "grad_norm": 4.668941020965576, + "learning_rate": 8.774591005544528e-05, + "loss": 0.5757, + "step": 9702 + }, + { + "epoch": 0.6574293651331391, + "grad_norm": 7.31153678894043, + "learning_rate": 8.774454103634746e-05, + "loss": 0.8225, + "step": 9703 + }, + { + "epoch": 0.6574971204011112, + "grad_norm": 7.220789432525635, + "learning_rate": 8.774317201724964e-05, + "loss": 1.2222, + "step": 9704 + }, + { + "epoch": 0.6575648756690833, + "grad_norm": 8.469976425170898, + "learning_rate": 8.774180299815182e-05, + "loss": 0.8927, + "step": 9705 + }, + { + "epoch": 0.6576326309370554, + "grad_norm": 5.122034072875977, + "learning_rate": 8.774043397905401e-05, + "loss": 0.6429, + "step": 9706 + }, + { + "epoch": 0.6577003862050275, + "grad_norm": 7.948295593261719, + "learning_rate": 8.773906495995619e-05, + "loss": 0.8037, + "step": 9707 + }, + { + "epoch": 0.6577681414729996, + "grad_norm": 8.193120956420898, + "learning_rate": 8.773769594085837e-05, + "loss": 0.8298, + "step": 9708 + }, + { + "epoch": 0.6578358967409716, + "grad_norm": 5.10132360458374, + "learning_rate": 8.773632692176057e-05, + "loss": 0.6058, + "step": 9709 + }, + { + "epoch": 0.6579036520089437, + "grad_norm": 7.845650672912598, + "learning_rate": 8.773495790266275e-05, + "loss": 0.9913, + "step": 9710 + }, + { + "epoch": 0.6579714072769158, + "grad_norm": 6.442379951477051, + "learning_rate": 8.773358888356493e-05, + "loss": 0.7329, + "step": 9711 + }, + { + "epoch": 0.6580391625448878, + "grad_norm": 6.904378414154053, + "learning_rate": 8.773221986446712e-05, + "loss": 0.9929, + "step": 9712 + }, + { + "epoch": 0.6581069178128599, + "grad_norm": 7.309523105621338, + "learning_rate": 8.77308508453693e-05, + "loss": 0.8333, + "step": 9713 + }, + { + "epoch": 0.658174673080832, + "grad_norm": 5.351863861083984, + "learning_rate": 8.772948182627148e-05, + "loss": 0.718, + "step": 9714 + }, + { + "epoch": 0.6582424283488041, + "grad_norm": 6.481664180755615, + "learning_rate": 8.772811280717367e-05, + "loss": 0.7649, + "step": 9715 + }, + { + "epoch": 0.6583101836167762, + "grad_norm": 6.753706932067871, + "learning_rate": 8.772674378807586e-05, + "loss": 0.6513, + "step": 9716 + }, + { + "epoch": 0.6583779388847483, + "grad_norm": 6.8181047439575195, + "learning_rate": 8.772537476897804e-05, + "loss": 0.8805, + "step": 9717 + }, + { + "epoch": 0.6584456941527204, + "grad_norm": 6.623274803161621, + "learning_rate": 8.772400574988022e-05, + "loss": 1.0795, + "step": 9718 + }, + { + "epoch": 0.6585134494206925, + "grad_norm": 5.954155445098877, + "learning_rate": 8.772263673078241e-05, + "loss": 0.7594, + "step": 9719 + }, + { + "epoch": 0.6585812046886645, + "grad_norm": 5.537624835968018, + "learning_rate": 8.772126771168459e-05, + "loss": 0.7049, + "step": 9720 + }, + { + "epoch": 0.6586489599566366, + "grad_norm": 6.186052322387695, + "learning_rate": 8.771989869258677e-05, + "loss": 0.8639, + "step": 9721 + }, + { + "epoch": 0.6587167152246087, + "grad_norm": 6.996501445770264, + "learning_rate": 8.771852967348895e-05, + "loss": 0.9438, + "step": 9722 + }, + { + "epoch": 0.6587844704925808, + "grad_norm": 6.926022052764893, + "learning_rate": 8.771716065439113e-05, + "loss": 0.7466, + "step": 9723 + }, + { + "epoch": 0.6588522257605529, + "grad_norm": 5.302411079406738, + "learning_rate": 8.771579163529332e-05, + "loss": 0.7562, + "step": 9724 + }, + { + "epoch": 0.658919981028525, + "grad_norm": 6.900167465209961, + "learning_rate": 8.77144226161955e-05, + "loss": 0.8841, + "step": 9725 + }, + { + "epoch": 0.6589877362964971, + "grad_norm": 5.357388019561768, + "learning_rate": 8.771305359709769e-05, + "loss": 0.7033, + "step": 9726 + }, + { + "epoch": 0.6590554915644692, + "grad_norm": 5.690728187561035, + "learning_rate": 8.771168457799987e-05, + "loss": 0.7668, + "step": 9727 + }, + { + "epoch": 0.6591232468324412, + "grad_norm": 6.126123428344727, + "learning_rate": 8.771031555890206e-05, + "loss": 0.7644, + "step": 9728 + }, + { + "epoch": 0.6591910021004133, + "grad_norm": 6.951844692230225, + "learning_rate": 8.770894653980424e-05, + "loss": 0.6453, + "step": 9729 + }, + { + "epoch": 0.6592587573683854, + "grad_norm": 7.219118118286133, + "learning_rate": 8.770757752070642e-05, + "loss": 0.7606, + "step": 9730 + }, + { + "epoch": 0.6593265126363574, + "grad_norm": 7.441622734069824, + "learning_rate": 8.77062085016086e-05, + "loss": 1.0002, + "step": 9731 + }, + { + "epoch": 0.6593942679043295, + "grad_norm": 7.127715110778809, + "learning_rate": 8.770483948251078e-05, + "loss": 0.9236, + "step": 9732 + }, + { + "epoch": 0.6594620231723016, + "grad_norm": 5.041599750518799, + "learning_rate": 8.770347046341298e-05, + "loss": 0.5938, + "step": 9733 + }, + { + "epoch": 0.6595297784402737, + "grad_norm": 5.044239044189453, + "learning_rate": 8.770210144431516e-05, + "loss": 0.7437, + "step": 9734 + }, + { + "epoch": 0.6595975337082458, + "grad_norm": 6.069377422332764, + "learning_rate": 8.770073242521734e-05, + "loss": 0.6742, + "step": 9735 + }, + { + "epoch": 0.6596652889762179, + "grad_norm": 7.166933059692383, + "learning_rate": 8.769936340611952e-05, + "loss": 0.9085, + "step": 9736 + }, + { + "epoch": 0.65973304424419, + "grad_norm": 6.837136268615723, + "learning_rate": 8.76979943870217e-05, + "loss": 0.7626, + "step": 9737 + }, + { + "epoch": 0.6598007995121621, + "grad_norm": 5.012059211730957, + "learning_rate": 8.769662536792389e-05, + "loss": 0.6234, + "step": 9738 + }, + { + "epoch": 0.6598685547801342, + "grad_norm": 5.913621425628662, + "learning_rate": 8.769525634882607e-05, + "loss": 0.9147, + "step": 9739 + }, + { + "epoch": 0.6599363100481063, + "grad_norm": 6.618444919586182, + "learning_rate": 8.769388732972825e-05, + "loss": 1.0521, + "step": 9740 + }, + { + "epoch": 0.6600040653160784, + "grad_norm": 6.666975975036621, + "learning_rate": 8.769251831063043e-05, + "loss": 0.8245, + "step": 9741 + }, + { + "epoch": 0.6600718205840505, + "grad_norm": 6.840112686157227, + "learning_rate": 8.769114929153263e-05, + "loss": 0.8726, + "step": 9742 + }, + { + "epoch": 0.6601395758520225, + "grad_norm": 7.093020915985107, + "learning_rate": 8.76897802724348e-05, + "loss": 0.9649, + "step": 9743 + }, + { + "epoch": 0.6602073311199946, + "grad_norm": 6.4766845703125, + "learning_rate": 8.768841125333699e-05, + "loss": 0.7608, + "step": 9744 + }, + { + "epoch": 0.6602750863879666, + "grad_norm": 6.574507236480713, + "learning_rate": 8.768704223423917e-05, + "loss": 0.9587, + "step": 9745 + }, + { + "epoch": 0.6603428416559387, + "grad_norm": 6.037952423095703, + "learning_rate": 8.768567321514135e-05, + "loss": 0.8102, + "step": 9746 + }, + { + "epoch": 0.6604105969239108, + "grad_norm": 5.707187175750732, + "learning_rate": 8.768430419604354e-05, + "loss": 0.7906, + "step": 9747 + }, + { + "epoch": 0.6604783521918829, + "grad_norm": 5.039308547973633, + "learning_rate": 8.768293517694572e-05, + "loss": 0.7443, + "step": 9748 + }, + { + "epoch": 0.660546107459855, + "grad_norm": 6.081298351287842, + "learning_rate": 8.76815661578479e-05, + "loss": 0.8813, + "step": 9749 + }, + { + "epoch": 0.6606138627278271, + "grad_norm": 6.228826522827148, + "learning_rate": 8.768019713875008e-05, + "loss": 0.755, + "step": 9750 + }, + { + "epoch": 0.6606816179957992, + "grad_norm": 6.391602993011475, + "learning_rate": 8.767882811965228e-05, + "loss": 1.0464, + "step": 9751 + }, + { + "epoch": 0.6607493732637713, + "grad_norm": 6.1509318351745605, + "learning_rate": 8.767745910055446e-05, + "loss": 0.8464, + "step": 9752 + }, + { + "epoch": 0.6608171285317433, + "grad_norm": 7.630395412445068, + "learning_rate": 8.767609008145664e-05, + "loss": 0.7216, + "step": 9753 + }, + { + "epoch": 0.6608848837997154, + "grad_norm": 6.150453090667725, + "learning_rate": 8.767472106235882e-05, + "loss": 0.747, + "step": 9754 + }, + { + "epoch": 0.6609526390676875, + "grad_norm": 6.866214752197266, + "learning_rate": 8.7673352043261e-05, + "loss": 0.7093, + "step": 9755 + }, + { + "epoch": 0.6610203943356596, + "grad_norm": 5.30288553237915, + "learning_rate": 8.767198302416319e-05, + "loss": 0.8519, + "step": 9756 + }, + { + "epoch": 0.6610881496036317, + "grad_norm": 7.960119724273682, + "learning_rate": 8.767061400506537e-05, + "loss": 1.0947, + "step": 9757 + }, + { + "epoch": 0.6611559048716038, + "grad_norm": 6.809545993804932, + "learning_rate": 8.766924498596755e-05, + "loss": 0.7943, + "step": 9758 + }, + { + "epoch": 0.6612236601395759, + "grad_norm": 7.028867721557617, + "learning_rate": 8.766787596686975e-05, + "loss": 0.8684, + "step": 9759 + }, + { + "epoch": 0.661291415407548, + "grad_norm": 6.686776161193848, + "learning_rate": 8.766650694777193e-05, + "loss": 0.7796, + "step": 9760 + }, + { + "epoch": 0.66135917067552, + "grad_norm": 5.486933708190918, + "learning_rate": 8.76651379286741e-05, + "loss": 0.5741, + "step": 9761 + }, + { + "epoch": 0.6614269259434921, + "grad_norm": 6.500797748565674, + "learning_rate": 8.76637689095763e-05, + "loss": 1.0022, + "step": 9762 + }, + { + "epoch": 0.6614946812114642, + "grad_norm": 8.424103736877441, + "learning_rate": 8.766239989047848e-05, + "loss": 0.9727, + "step": 9763 + }, + { + "epoch": 0.6615624364794362, + "grad_norm": 5.002140998840332, + "learning_rate": 8.766103087138066e-05, + "loss": 0.5935, + "step": 9764 + }, + { + "epoch": 0.6616301917474083, + "grad_norm": 6.721834659576416, + "learning_rate": 8.765966185228285e-05, + "loss": 0.7682, + "step": 9765 + }, + { + "epoch": 0.6616979470153804, + "grad_norm": 6.448259353637695, + "learning_rate": 8.765829283318503e-05, + "loss": 0.6641, + "step": 9766 + }, + { + "epoch": 0.6617657022833525, + "grad_norm": 5.544651985168457, + "learning_rate": 8.765692381408722e-05, + "loss": 0.6952, + "step": 9767 + }, + { + "epoch": 0.6618334575513246, + "grad_norm": 7.139290809631348, + "learning_rate": 8.76555547949894e-05, + "loss": 0.8574, + "step": 9768 + }, + { + "epoch": 0.6619012128192967, + "grad_norm": 7.421456813812256, + "learning_rate": 8.765418577589158e-05, + "loss": 0.9057, + "step": 9769 + }, + { + "epoch": 0.6619689680872688, + "grad_norm": 6.075616359710693, + "learning_rate": 8.765281675679377e-05, + "loss": 0.7174, + "step": 9770 + }, + { + "epoch": 0.6620367233552409, + "grad_norm": 5.6527323722839355, + "learning_rate": 8.765144773769595e-05, + "loss": 0.6707, + "step": 9771 + }, + { + "epoch": 0.662104478623213, + "grad_norm": 5.773962020874023, + "learning_rate": 8.765007871859813e-05, + "loss": 0.714, + "step": 9772 + }, + { + "epoch": 0.6621722338911851, + "grad_norm": 5.672632694244385, + "learning_rate": 8.764870969950031e-05, + "loss": 0.6529, + "step": 9773 + }, + { + "epoch": 0.6622399891591572, + "grad_norm": 7.087960243225098, + "learning_rate": 8.76473406804025e-05, + "loss": 0.8998, + "step": 9774 + }, + { + "epoch": 0.6623077444271293, + "grad_norm": 6.828940391540527, + "learning_rate": 8.764597166130468e-05, + "loss": 0.9055, + "step": 9775 + }, + { + "epoch": 0.6623754996951013, + "grad_norm": 5.858781814575195, + "learning_rate": 8.764460264220687e-05, + "loss": 0.7853, + "step": 9776 + }, + { + "epoch": 0.6624432549630733, + "grad_norm": 6.570882797241211, + "learning_rate": 8.764323362310905e-05, + "loss": 1.1202, + "step": 9777 + }, + { + "epoch": 0.6625110102310454, + "grad_norm": 6.794251918792725, + "learning_rate": 8.764186460401123e-05, + "loss": 0.7795, + "step": 9778 + }, + { + "epoch": 0.6625787654990175, + "grad_norm": 5.775513648986816, + "learning_rate": 8.764049558491342e-05, + "loss": 0.8422, + "step": 9779 + }, + { + "epoch": 0.6626465207669896, + "grad_norm": 5.90095329284668, + "learning_rate": 8.76391265658156e-05, + "loss": 0.7624, + "step": 9780 + }, + { + "epoch": 0.6627142760349617, + "grad_norm": 6.769818305969238, + "learning_rate": 8.763775754671778e-05, + "loss": 0.7761, + "step": 9781 + }, + { + "epoch": 0.6627820313029338, + "grad_norm": 5.702734470367432, + "learning_rate": 8.763638852761996e-05, + "loss": 0.6399, + "step": 9782 + }, + { + "epoch": 0.6628497865709059, + "grad_norm": 8.020594596862793, + "learning_rate": 8.763501950852214e-05, + "loss": 0.8082, + "step": 9783 + }, + { + "epoch": 0.662917541838878, + "grad_norm": 6.043667316436768, + "learning_rate": 8.763365048942434e-05, + "loss": 0.69, + "step": 9784 + }, + { + "epoch": 0.6629852971068501, + "grad_norm": 6.844956874847412, + "learning_rate": 8.763228147032652e-05, + "loss": 0.7028, + "step": 9785 + }, + { + "epoch": 0.6630530523748221, + "grad_norm": 6.554644584655762, + "learning_rate": 8.76309124512287e-05, + "loss": 1.0118, + "step": 9786 + }, + { + "epoch": 0.6631208076427942, + "grad_norm": 6.549968242645264, + "learning_rate": 8.762954343213088e-05, + "loss": 0.8027, + "step": 9787 + }, + { + "epoch": 0.6631885629107663, + "grad_norm": 5.934162616729736, + "learning_rate": 8.762817441303307e-05, + "loss": 0.5964, + "step": 9788 + }, + { + "epoch": 0.6632563181787384, + "grad_norm": 6.02857780456543, + "learning_rate": 8.762680539393525e-05, + "loss": 0.997, + "step": 9789 + }, + { + "epoch": 0.6633240734467105, + "grad_norm": 7.619875907897949, + "learning_rate": 8.762543637483743e-05, + "loss": 1.0416, + "step": 9790 + }, + { + "epoch": 0.6633918287146826, + "grad_norm": 7.058054447174072, + "learning_rate": 8.762406735573961e-05, + "loss": 0.7885, + "step": 9791 + }, + { + "epoch": 0.6634595839826547, + "grad_norm": 7.8069353103637695, + "learning_rate": 8.762269833664179e-05, + "loss": 0.8077, + "step": 9792 + }, + { + "epoch": 0.6635273392506268, + "grad_norm": 6.246554851531982, + "learning_rate": 8.762132931754399e-05, + "loss": 0.8138, + "step": 9793 + }, + { + "epoch": 0.6635950945185988, + "grad_norm": 7.489177227020264, + "learning_rate": 8.761996029844617e-05, + "loss": 0.8638, + "step": 9794 + }, + { + "epoch": 0.6636628497865709, + "grad_norm": 7.148414134979248, + "learning_rate": 8.761859127934835e-05, + "loss": 0.7998, + "step": 9795 + }, + { + "epoch": 0.663730605054543, + "grad_norm": 6.611279010772705, + "learning_rate": 8.761722226025053e-05, + "loss": 0.6272, + "step": 9796 + }, + { + "epoch": 0.663798360322515, + "grad_norm": 7.739448070526123, + "learning_rate": 8.761585324115272e-05, + "loss": 0.7987, + "step": 9797 + }, + { + "epoch": 0.6638661155904871, + "grad_norm": 4.832591533660889, + "learning_rate": 8.76144842220549e-05, + "loss": 0.5663, + "step": 9798 + }, + { + "epoch": 0.6639338708584592, + "grad_norm": 4.939646244049072, + "learning_rate": 8.761311520295708e-05, + "loss": 0.5951, + "step": 9799 + }, + { + "epoch": 0.6640016261264313, + "grad_norm": 6.326164245605469, + "learning_rate": 8.761174618385926e-05, + "loss": 0.9925, + "step": 9800 + }, + { + "epoch": 0.6640693813944034, + "grad_norm": 5.790089130401611, + "learning_rate": 8.761037716476144e-05, + "loss": 1.0175, + "step": 9801 + }, + { + "epoch": 0.6641371366623755, + "grad_norm": 6.139710903167725, + "learning_rate": 8.760900814566364e-05, + "loss": 0.7322, + "step": 9802 + }, + { + "epoch": 0.6642048919303476, + "grad_norm": 6.73042106628418, + "learning_rate": 8.760763912656582e-05, + "loss": 0.7969, + "step": 9803 + }, + { + "epoch": 0.6642726471983197, + "grad_norm": 6.9577765464782715, + "learning_rate": 8.7606270107468e-05, + "loss": 0.7294, + "step": 9804 + }, + { + "epoch": 0.6643404024662918, + "grad_norm": 7.609181880950928, + "learning_rate": 8.760490108837019e-05, + "loss": 0.9833, + "step": 9805 + }, + { + "epoch": 0.6644081577342639, + "grad_norm": 5.679005146026611, + "learning_rate": 8.760353206927237e-05, + "loss": 0.8292, + "step": 9806 + }, + { + "epoch": 0.664475913002236, + "grad_norm": 6.903763771057129, + "learning_rate": 8.760216305017455e-05, + "loss": 0.8298, + "step": 9807 + }, + { + "epoch": 0.664543668270208, + "grad_norm": 5.705727577209473, + "learning_rate": 8.760079403107674e-05, + "loss": 0.8319, + "step": 9808 + }, + { + "epoch": 0.6646114235381801, + "grad_norm": 5.971374988555908, + "learning_rate": 8.759942501197892e-05, + "loss": 0.8034, + "step": 9809 + }, + { + "epoch": 0.6646791788061521, + "grad_norm": 4.7771806716918945, + "learning_rate": 8.75980559928811e-05, + "loss": 0.8539, + "step": 9810 + }, + { + "epoch": 0.6647469340741242, + "grad_norm": 6.891615867614746, + "learning_rate": 8.75966869737833e-05, + "loss": 0.667, + "step": 9811 + }, + { + "epoch": 0.6648146893420963, + "grad_norm": 6.467392444610596, + "learning_rate": 8.759531795468548e-05, + "loss": 0.8197, + "step": 9812 + }, + { + "epoch": 0.6648824446100684, + "grad_norm": 5.87471342086792, + "learning_rate": 8.759394893558766e-05, + "loss": 0.9105, + "step": 9813 + }, + { + "epoch": 0.6649501998780405, + "grad_norm": 5.994086265563965, + "learning_rate": 8.759257991648984e-05, + "loss": 0.8333, + "step": 9814 + }, + { + "epoch": 0.6650179551460126, + "grad_norm": 6.536230564117432, + "learning_rate": 8.759121089739202e-05, + "loss": 1.0726, + "step": 9815 + }, + { + "epoch": 0.6650857104139847, + "grad_norm": 7.012213230133057, + "learning_rate": 8.758984187829421e-05, + "loss": 1.0442, + "step": 9816 + }, + { + "epoch": 0.6651534656819568, + "grad_norm": 6.069978713989258, + "learning_rate": 8.75884728591964e-05, + "loss": 0.6364, + "step": 9817 + }, + { + "epoch": 0.6652212209499289, + "grad_norm": 5.200802803039551, + "learning_rate": 8.758710384009858e-05, + "loss": 0.6563, + "step": 9818 + }, + { + "epoch": 0.665288976217901, + "grad_norm": 6.931329727172852, + "learning_rate": 8.758573482100076e-05, + "loss": 0.8925, + "step": 9819 + }, + { + "epoch": 0.665356731485873, + "grad_norm": 6.1655731201171875, + "learning_rate": 8.758436580190295e-05, + "loss": 0.7658, + "step": 9820 + }, + { + "epoch": 0.6654244867538451, + "grad_norm": 6.437480926513672, + "learning_rate": 8.758299678280513e-05, + "loss": 0.8279, + "step": 9821 + }, + { + "epoch": 0.6654922420218172, + "grad_norm": 5.941578388214111, + "learning_rate": 8.758162776370731e-05, + "loss": 0.7095, + "step": 9822 + }, + { + "epoch": 0.6655599972897893, + "grad_norm": 6.083083152770996, + "learning_rate": 8.758025874460949e-05, + "loss": 0.9792, + "step": 9823 + }, + { + "epoch": 0.6656277525577614, + "grad_norm": 5.512528419494629, + "learning_rate": 8.757888972551167e-05, + "loss": 0.8771, + "step": 9824 + }, + { + "epoch": 0.6656955078257335, + "grad_norm": 5.142529487609863, + "learning_rate": 8.757752070641386e-05, + "loss": 0.7935, + "step": 9825 + }, + { + "epoch": 0.6657632630937055, + "grad_norm": 5.460729122161865, + "learning_rate": 8.757615168731604e-05, + "loss": 0.9105, + "step": 9826 + }, + { + "epoch": 0.6658310183616776, + "grad_norm": 8.098207473754883, + "learning_rate": 8.757478266821823e-05, + "loss": 0.8831, + "step": 9827 + }, + { + "epoch": 0.6658987736296497, + "grad_norm": 5.392515659332275, + "learning_rate": 8.75734136491204e-05, + "loss": 0.8327, + "step": 9828 + }, + { + "epoch": 0.6659665288976218, + "grad_norm": 4.9883294105529785, + "learning_rate": 8.75720446300226e-05, + "loss": 0.7598, + "step": 9829 + }, + { + "epoch": 0.6660342841655938, + "grad_norm": 5.96735954284668, + "learning_rate": 8.757067561092478e-05, + "loss": 0.9694, + "step": 9830 + }, + { + "epoch": 0.6661020394335659, + "grad_norm": 6.818420886993408, + "learning_rate": 8.756930659182696e-05, + "loss": 0.9571, + "step": 9831 + }, + { + "epoch": 0.666169794701538, + "grad_norm": 6.297762870788574, + "learning_rate": 8.756793757272914e-05, + "loss": 0.6912, + "step": 9832 + }, + { + "epoch": 0.6662375499695101, + "grad_norm": 6.834626197814941, + "learning_rate": 8.756656855363132e-05, + "loss": 0.8697, + "step": 9833 + }, + { + "epoch": 0.6663053052374822, + "grad_norm": 6.614404678344727, + "learning_rate": 8.756519953453351e-05, + "loss": 0.9481, + "step": 9834 + }, + { + "epoch": 0.6663730605054543, + "grad_norm": 6.524447441101074, + "learning_rate": 8.75638305154357e-05, + "loss": 0.7346, + "step": 9835 + }, + { + "epoch": 0.6664408157734264, + "grad_norm": 7.389026641845703, + "learning_rate": 8.756246149633788e-05, + "loss": 1.0821, + "step": 9836 + }, + { + "epoch": 0.6665085710413985, + "grad_norm": 5.654766082763672, + "learning_rate": 8.756109247724006e-05, + "loss": 0.8859, + "step": 9837 + }, + { + "epoch": 0.6665763263093706, + "grad_norm": 6.345546722412109, + "learning_rate": 8.755972345814224e-05, + "loss": 0.8386, + "step": 9838 + }, + { + "epoch": 0.6666440815773427, + "grad_norm": 5.268699645996094, + "learning_rate": 8.755835443904443e-05, + "loss": 0.6937, + "step": 9839 + }, + { + "epoch": 0.6667118368453148, + "grad_norm": 5.590981960296631, + "learning_rate": 8.755698541994661e-05, + "loss": 0.7624, + "step": 9840 + }, + { + "epoch": 0.6667795921132869, + "grad_norm": 7.142122268676758, + "learning_rate": 8.755561640084879e-05, + "loss": 0.807, + "step": 9841 + }, + { + "epoch": 0.666847347381259, + "grad_norm": 7.878788948059082, + "learning_rate": 8.755424738175097e-05, + "loss": 0.6766, + "step": 9842 + }, + { + "epoch": 0.6669151026492309, + "grad_norm": 7.469075679779053, + "learning_rate": 8.755287836265316e-05, + "loss": 0.7736, + "step": 9843 + }, + { + "epoch": 0.666982857917203, + "grad_norm": 6.271559715270996, + "learning_rate": 8.755150934355535e-05, + "loss": 1.1519, + "step": 9844 + }, + { + "epoch": 0.6670506131851751, + "grad_norm": 5.424012660980225, + "learning_rate": 8.755014032445753e-05, + "loss": 0.6141, + "step": 9845 + }, + { + "epoch": 0.6671183684531472, + "grad_norm": 6.098863124847412, + "learning_rate": 8.75487713053597e-05, + "loss": 0.8144, + "step": 9846 + }, + { + "epoch": 0.6671861237211193, + "grad_norm": 7.099893569946289, + "learning_rate": 8.754740228626189e-05, + "loss": 0.897, + "step": 9847 + }, + { + "epoch": 0.6672538789890914, + "grad_norm": 4.874231338500977, + "learning_rate": 8.754603326716408e-05, + "loss": 0.7885, + "step": 9848 + }, + { + "epoch": 0.6673216342570635, + "grad_norm": 7.58192777633667, + "learning_rate": 8.754466424806626e-05, + "loss": 0.7326, + "step": 9849 + }, + { + "epoch": 0.6673893895250356, + "grad_norm": 5.8943047523498535, + "learning_rate": 8.754329522896844e-05, + "loss": 0.8714, + "step": 9850 + }, + { + "epoch": 0.6674571447930077, + "grad_norm": 7.8243865966796875, + "learning_rate": 8.754192620987063e-05, + "loss": 0.749, + "step": 9851 + }, + { + "epoch": 0.6675249000609798, + "grad_norm": 6.121888160705566, + "learning_rate": 8.754055719077282e-05, + "loss": 0.7823, + "step": 9852 + }, + { + "epoch": 0.6675926553289518, + "grad_norm": 8.671948432922363, + "learning_rate": 8.7539188171675e-05, + "loss": 0.7592, + "step": 9853 + }, + { + "epoch": 0.6676604105969239, + "grad_norm": 7.32505989074707, + "learning_rate": 8.753781915257719e-05, + "loss": 0.816, + "step": 9854 + }, + { + "epoch": 0.667728165864896, + "grad_norm": 5.141097545623779, + "learning_rate": 8.753645013347937e-05, + "loss": 0.913, + "step": 9855 + }, + { + "epoch": 0.6677959211328681, + "grad_norm": 6.519028186798096, + "learning_rate": 8.753508111438155e-05, + "loss": 0.8542, + "step": 9856 + }, + { + "epoch": 0.6678636764008402, + "grad_norm": 5.298619747161865, + "learning_rate": 8.753371209528374e-05, + "loss": 0.8773, + "step": 9857 + }, + { + "epoch": 0.6679314316688123, + "grad_norm": 4.540358066558838, + "learning_rate": 8.753234307618592e-05, + "loss": 0.5887, + "step": 9858 + }, + { + "epoch": 0.6679991869367843, + "grad_norm": 7.409801483154297, + "learning_rate": 8.75309740570881e-05, + "loss": 0.7, + "step": 9859 + }, + { + "epoch": 0.6680669422047564, + "grad_norm": 7.78483772277832, + "learning_rate": 8.752960503799028e-05, + "loss": 0.9825, + "step": 9860 + }, + { + "epoch": 0.6681346974727285, + "grad_norm": 4.964968204498291, + "learning_rate": 8.752823601889248e-05, + "loss": 0.8111, + "step": 9861 + }, + { + "epoch": 0.6682024527407006, + "grad_norm": 6.426743984222412, + "learning_rate": 8.752686699979466e-05, + "loss": 1.0148, + "step": 9862 + }, + { + "epoch": 0.6682702080086727, + "grad_norm": 5.606266975402832, + "learning_rate": 8.752549798069684e-05, + "loss": 0.7848, + "step": 9863 + }, + { + "epoch": 0.6683379632766447, + "grad_norm": 6.663419723510742, + "learning_rate": 8.752412896159902e-05, + "loss": 0.5602, + "step": 9864 + }, + { + "epoch": 0.6684057185446168, + "grad_norm": 5.719392776489258, + "learning_rate": 8.75227599425012e-05, + "loss": 0.6434, + "step": 9865 + }, + { + "epoch": 0.6684734738125889, + "grad_norm": 6.0621747970581055, + "learning_rate": 8.75213909234034e-05, + "loss": 0.7172, + "step": 9866 + }, + { + "epoch": 0.668541229080561, + "grad_norm": 7.197140216827393, + "learning_rate": 8.752002190430557e-05, + "loss": 0.9543, + "step": 9867 + }, + { + "epoch": 0.6686089843485331, + "grad_norm": 7.329906940460205, + "learning_rate": 8.751865288520775e-05, + "loss": 0.8873, + "step": 9868 + }, + { + "epoch": 0.6686767396165052, + "grad_norm": 8.936700820922852, + "learning_rate": 8.751728386610994e-05, + "loss": 0.8918, + "step": 9869 + }, + { + "epoch": 0.6687444948844773, + "grad_norm": 5.39838171005249, + "learning_rate": 8.751591484701212e-05, + "loss": 1.0986, + "step": 9870 + }, + { + "epoch": 0.6688122501524494, + "grad_norm": 7.213238716125488, + "learning_rate": 8.751454582791431e-05, + "loss": 0.9257, + "step": 9871 + }, + { + "epoch": 0.6688800054204215, + "grad_norm": 7.646907806396484, + "learning_rate": 8.751317680881649e-05, + "loss": 0.8563, + "step": 9872 + }, + { + "epoch": 0.6689477606883936, + "grad_norm": 6.506889343261719, + "learning_rate": 8.751180778971867e-05, + "loss": 0.8348, + "step": 9873 + }, + { + "epoch": 0.6690155159563657, + "grad_norm": 8.101831436157227, + "learning_rate": 8.751043877062085e-05, + "loss": 0.8127, + "step": 9874 + }, + { + "epoch": 0.6690832712243376, + "grad_norm": 7.347453594207764, + "learning_rate": 8.750906975152304e-05, + "loss": 0.768, + "step": 9875 + }, + { + "epoch": 0.6691510264923097, + "grad_norm": 6.852962970733643, + "learning_rate": 8.750770073242522e-05, + "loss": 0.8699, + "step": 9876 + }, + { + "epoch": 0.6692187817602818, + "grad_norm": 5.527287006378174, + "learning_rate": 8.75063317133274e-05, + "loss": 0.6164, + "step": 9877 + }, + { + "epoch": 0.6692865370282539, + "grad_norm": 6.611326217651367, + "learning_rate": 8.750496269422959e-05, + "loss": 0.715, + "step": 9878 + }, + { + "epoch": 0.669354292296226, + "grad_norm": 6.488336086273193, + "learning_rate": 8.750359367513177e-05, + "loss": 0.9815, + "step": 9879 + }, + { + "epoch": 0.6694220475641981, + "grad_norm": 8.057877540588379, + "learning_rate": 8.750222465603396e-05, + "loss": 0.9369, + "step": 9880 + }, + { + "epoch": 0.6694898028321702, + "grad_norm": 7.809643745422363, + "learning_rate": 8.750085563693614e-05, + "loss": 0.7953, + "step": 9881 + }, + { + "epoch": 0.6695575581001423, + "grad_norm": 5.484494209289551, + "learning_rate": 8.749948661783832e-05, + "loss": 0.8527, + "step": 9882 + }, + { + "epoch": 0.6696253133681144, + "grad_norm": 8.870565414428711, + "learning_rate": 8.74981175987405e-05, + "loss": 0.7632, + "step": 9883 + }, + { + "epoch": 0.6696930686360865, + "grad_norm": 5.734538555145264, + "learning_rate": 8.74967485796427e-05, + "loss": 0.9434, + "step": 9884 + }, + { + "epoch": 0.6697608239040586, + "grad_norm": 6.731936454772949, + "learning_rate": 8.749537956054487e-05, + "loss": 0.9988, + "step": 9885 + }, + { + "epoch": 0.6698285791720306, + "grad_norm": 6.063337326049805, + "learning_rate": 8.749401054144706e-05, + "loss": 0.6882, + "step": 9886 + }, + { + "epoch": 0.6698963344400027, + "grad_norm": 6.489291667938232, + "learning_rate": 8.749264152234924e-05, + "loss": 0.8978, + "step": 9887 + }, + { + "epoch": 0.6699640897079748, + "grad_norm": 6.419568061828613, + "learning_rate": 8.749127250325142e-05, + "loss": 0.84, + "step": 9888 + }, + { + "epoch": 0.6700318449759469, + "grad_norm": 7.085015773773193, + "learning_rate": 8.748990348415361e-05, + "loss": 1.0001, + "step": 9889 + }, + { + "epoch": 0.670099600243919, + "grad_norm": 5.279534339904785, + "learning_rate": 8.748853446505579e-05, + "loss": 0.613, + "step": 9890 + }, + { + "epoch": 0.670167355511891, + "grad_norm": 7.26226282119751, + "learning_rate": 8.748716544595797e-05, + "loss": 0.7144, + "step": 9891 + }, + { + "epoch": 0.6702351107798631, + "grad_norm": 5.066533088684082, + "learning_rate": 8.748579642686015e-05, + "loss": 0.8349, + "step": 9892 + }, + { + "epoch": 0.6703028660478352, + "grad_norm": 5.847897052764893, + "learning_rate": 8.748442740776233e-05, + "loss": 0.778, + "step": 9893 + }, + { + "epoch": 0.6703706213158073, + "grad_norm": 7.026902198791504, + "learning_rate": 8.748305838866452e-05, + "loss": 0.7131, + "step": 9894 + }, + { + "epoch": 0.6704383765837794, + "grad_norm": 9.44549560546875, + "learning_rate": 8.74816893695667e-05, + "loss": 0.7093, + "step": 9895 + }, + { + "epoch": 0.6705061318517515, + "grad_norm": 5.592066764831543, + "learning_rate": 8.748032035046889e-05, + "loss": 0.7979, + "step": 9896 + }, + { + "epoch": 0.6705738871197235, + "grad_norm": 7.530812740325928, + "learning_rate": 8.747895133137108e-05, + "loss": 0.7899, + "step": 9897 + }, + { + "epoch": 0.6706416423876956, + "grad_norm": 7.456013202667236, + "learning_rate": 8.747758231227326e-05, + "loss": 0.9772, + "step": 9898 + }, + { + "epoch": 0.6707093976556677, + "grad_norm": 7.631428241729736, + "learning_rate": 8.747621329317544e-05, + "loss": 0.9485, + "step": 9899 + }, + { + "epoch": 0.6707771529236398, + "grad_norm": 5.919754981994629, + "learning_rate": 8.747484427407763e-05, + "loss": 0.8696, + "step": 9900 + }, + { + "epoch": 0.6708449081916119, + "grad_norm": 6.412426948547363, + "learning_rate": 8.747347525497981e-05, + "loss": 0.8179, + "step": 9901 + }, + { + "epoch": 0.670912663459584, + "grad_norm": 5.753444671630859, + "learning_rate": 8.7472106235882e-05, + "loss": 0.7817, + "step": 9902 + }, + { + "epoch": 0.6709804187275561, + "grad_norm": 6.622903347015381, + "learning_rate": 8.747073721678419e-05, + "loss": 0.8184, + "step": 9903 + }, + { + "epoch": 0.6710481739955282, + "grad_norm": 8.03353500366211, + "learning_rate": 8.746936819768637e-05, + "loss": 0.9982, + "step": 9904 + }, + { + "epoch": 0.6711159292635003, + "grad_norm": 7.467156410217285, + "learning_rate": 8.746799917858855e-05, + "loss": 0.6797, + "step": 9905 + }, + { + "epoch": 0.6711836845314724, + "grad_norm": 5.091195106506348, + "learning_rate": 8.746663015949073e-05, + "loss": 0.6776, + "step": 9906 + }, + { + "epoch": 0.6712514397994445, + "grad_norm": 6.8875627517700195, + "learning_rate": 8.746526114039292e-05, + "loss": 0.926, + "step": 9907 + }, + { + "epoch": 0.6713191950674164, + "grad_norm": 6.183489799499512, + "learning_rate": 8.74638921212951e-05, + "loss": 0.9849, + "step": 9908 + }, + { + "epoch": 0.6713869503353885, + "grad_norm": 6.648115634918213, + "learning_rate": 8.746252310219728e-05, + "loss": 0.789, + "step": 9909 + }, + { + "epoch": 0.6714547056033606, + "grad_norm": 5.949080944061279, + "learning_rate": 8.746115408309946e-05, + "loss": 0.6008, + "step": 9910 + }, + { + "epoch": 0.6715224608713327, + "grad_norm": 6.88314151763916, + "learning_rate": 8.745978506400164e-05, + "loss": 0.758, + "step": 9911 + }, + { + "epoch": 0.6715902161393048, + "grad_norm": 7.288704872131348, + "learning_rate": 8.745841604490384e-05, + "loss": 0.8002, + "step": 9912 + }, + { + "epoch": 0.6716579714072769, + "grad_norm": 6.439271926879883, + "learning_rate": 8.745704702580602e-05, + "loss": 0.9295, + "step": 9913 + }, + { + "epoch": 0.671725726675249, + "grad_norm": 6.891064643859863, + "learning_rate": 8.74556780067082e-05, + "loss": 0.7798, + "step": 9914 + }, + { + "epoch": 0.6717934819432211, + "grad_norm": 7.882637977600098, + "learning_rate": 8.745430898761038e-05, + "loss": 1.0254, + "step": 9915 + }, + { + "epoch": 0.6718612372111932, + "grad_norm": 6.752426624298096, + "learning_rate": 8.745293996851256e-05, + "loss": 0.7995, + "step": 9916 + }, + { + "epoch": 0.6719289924791653, + "grad_norm": 5.643410682678223, + "learning_rate": 8.745157094941475e-05, + "loss": 0.7354, + "step": 9917 + }, + { + "epoch": 0.6719967477471374, + "grad_norm": 7.521575927734375, + "learning_rate": 8.745020193031693e-05, + "loss": 0.7256, + "step": 9918 + }, + { + "epoch": 0.6720645030151094, + "grad_norm": 7.460186004638672, + "learning_rate": 8.744883291121911e-05, + "loss": 1.063, + "step": 9919 + }, + { + "epoch": 0.6721322582830815, + "grad_norm": 4.6668477058410645, + "learning_rate": 8.74474638921213e-05, + "loss": 0.6914, + "step": 9920 + }, + { + "epoch": 0.6722000135510536, + "grad_norm": 7.0220160484313965, + "learning_rate": 8.744609487302349e-05, + "loss": 0.7925, + "step": 9921 + }, + { + "epoch": 0.6722677688190257, + "grad_norm": 7.594117641448975, + "learning_rate": 8.744472585392567e-05, + "loss": 0.9337, + "step": 9922 + }, + { + "epoch": 0.6723355240869978, + "grad_norm": 7.195069313049316, + "learning_rate": 8.744335683482785e-05, + "loss": 0.9651, + "step": 9923 + }, + { + "epoch": 0.6724032793549698, + "grad_norm": 5.2893853187561035, + "learning_rate": 8.744198781573003e-05, + "loss": 0.7974, + "step": 9924 + }, + { + "epoch": 0.6724710346229419, + "grad_norm": 7.59480619430542, + "learning_rate": 8.744061879663221e-05, + "loss": 0.9182, + "step": 9925 + }, + { + "epoch": 0.672538789890914, + "grad_norm": 5.582269668579102, + "learning_rate": 8.74392497775344e-05, + "loss": 0.8509, + "step": 9926 + }, + { + "epoch": 0.6726065451588861, + "grad_norm": 6.992897033691406, + "learning_rate": 8.743788075843658e-05, + "loss": 0.8779, + "step": 9927 + }, + { + "epoch": 0.6726743004268582, + "grad_norm": 7.686391353607178, + "learning_rate": 8.743651173933876e-05, + "loss": 1.0611, + "step": 9928 + }, + { + "epoch": 0.6727420556948303, + "grad_norm": 6.090497016906738, + "learning_rate": 8.743514272024095e-05, + "loss": 0.9857, + "step": 9929 + }, + { + "epoch": 0.6728098109628023, + "grad_norm": 7.6027936935424805, + "learning_rate": 8.743377370114314e-05, + "loss": 1.0939, + "step": 9930 + }, + { + "epoch": 0.6728775662307744, + "grad_norm": 6.2990593910217285, + "learning_rate": 8.743240468204532e-05, + "loss": 0.8225, + "step": 9931 + }, + { + "epoch": 0.6729453214987465, + "grad_norm": 6.154484272003174, + "learning_rate": 8.74310356629475e-05, + "loss": 1.2153, + "step": 9932 + }, + { + "epoch": 0.6730130767667186, + "grad_norm": 5.471065044403076, + "learning_rate": 8.742966664384968e-05, + "loss": 0.9146, + "step": 9933 + }, + { + "epoch": 0.6730808320346907, + "grad_norm": 7.006470680236816, + "learning_rate": 8.742829762475186e-05, + "loss": 0.8704, + "step": 9934 + }, + { + "epoch": 0.6731485873026628, + "grad_norm": 6.060577869415283, + "learning_rate": 8.742692860565405e-05, + "loss": 1.0097, + "step": 9935 + }, + { + "epoch": 0.6732163425706349, + "grad_norm": 6.05491304397583, + "learning_rate": 8.742555958655623e-05, + "loss": 0.7488, + "step": 9936 + }, + { + "epoch": 0.673284097838607, + "grad_norm": 4.715060710906982, + "learning_rate": 8.742419056745842e-05, + "loss": 0.6577, + "step": 9937 + }, + { + "epoch": 0.6733518531065791, + "grad_norm": 5.644432544708252, + "learning_rate": 8.74228215483606e-05, + "loss": 0.6954, + "step": 9938 + }, + { + "epoch": 0.6734196083745512, + "grad_norm": 6.377723217010498, + "learning_rate": 8.742145252926279e-05, + "loss": 0.7948, + "step": 9939 + }, + { + "epoch": 0.6734873636425232, + "grad_norm": 4.606221675872803, + "learning_rate": 8.742008351016497e-05, + "loss": 0.8975, + "step": 9940 + }, + { + "epoch": 0.6735551189104952, + "grad_norm": 7.170515537261963, + "learning_rate": 8.741871449106715e-05, + "loss": 0.8839, + "step": 9941 + }, + { + "epoch": 0.6736228741784673, + "grad_norm": 5.995856761932373, + "learning_rate": 8.741734547196933e-05, + "loss": 0.8133, + "step": 9942 + }, + { + "epoch": 0.6736906294464394, + "grad_norm": 5.5055036544799805, + "learning_rate": 8.741597645287152e-05, + "loss": 0.5468, + "step": 9943 + }, + { + "epoch": 0.6737583847144115, + "grad_norm": 8.358444213867188, + "learning_rate": 8.74146074337737e-05, + "loss": 0.9229, + "step": 9944 + }, + { + "epoch": 0.6738261399823836, + "grad_norm": 6.350693225860596, + "learning_rate": 8.741323841467588e-05, + "loss": 0.9045, + "step": 9945 + }, + { + "epoch": 0.6738938952503557, + "grad_norm": 8.909754753112793, + "learning_rate": 8.741186939557808e-05, + "loss": 0.8389, + "step": 9946 + }, + { + "epoch": 0.6739616505183278, + "grad_norm": 6.272004127502441, + "learning_rate": 8.741050037648026e-05, + "loss": 0.8733, + "step": 9947 + }, + { + "epoch": 0.6740294057862999, + "grad_norm": 7.538585662841797, + "learning_rate": 8.740913135738244e-05, + "loss": 0.7856, + "step": 9948 + }, + { + "epoch": 0.674097161054272, + "grad_norm": 8.257597923278809, + "learning_rate": 8.740776233828463e-05, + "loss": 0.9473, + "step": 9949 + }, + { + "epoch": 0.6741649163222441, + "grad_norm": 8.649487495422363, + "learning_rate": 8.740639331918681e-05, + "loss": 0.874, + "step": 9950 + }, + { + "epoch": 0.6742326715902162, + "grad_norm": 7.1884765625, + "learning_rate": 8.7405024300089e-05, + "loss": 0.8108, + "step": 9951 + }, + { + "epoch": 0.6743004268581883, + "grad_norm": 6.152512550354004, + "learning_rate": 8.740365528099117e-05, + "loss": 1.0727, + "step": 9952 + }, + { + "epoch": 0.6743681821261603, + "grad_norm": 6.419736385345459, + "learning_rate": 8.740228626189337e-05, + "loss": 0.6842, + "step": 9953 + }, + { + "epoch": 0.6744359373941324, + "grad_norm": 7.59826135635376, + "learning_rate": 8.740091724279555e-05, + "loss": 0.9312, + "step": 9954 + }, + { + "epoch": 0.6745036926621045, + "grad_norm": 6.706602573394775, + "learning_rate": 8.739954822369773e-05, + "loss": 0.9033, + "step": 9955 + }, + { + "epoch": 0.6745714479300766, + "grad_norm": 6.8712992668151855, + "learning_rate": 8.739817920459991e-05, + "loss": 0.8354, + "step": 9956 + }, + { + "epoch": 0.6746392031980486, + "grad_norm": 4.86088228225708, + "learning_rate": 8.739681018550209e-05, + "loss": 0.7645, + "step": 9957 + }, + { + "epoch": 0.6747069584660207, + "grad_norm": 4.672707557678223, + "learning_rate": 8.739544116640428e-05, + "loss": 0.5008, + "step": 9958 + }, + { + "epoch": 0.6747747137339928, + "grad_norm": 7.2205634117126465, + "learning_rate": 8.739407214730646e-05, + "loss": 0.6439, + "step": 9959 + }, + { + "epoch": 0.6748424690019649, + "grad_norm": 7.193275451660156, + "learning_rate": 8.739270312820864e-05, + "loss": 0.6493, + "step": 9960 + }, + { + "epoch": 0.674910224269937, + "grad_norm": 7.060335159301758, + "learning_rate": 8.739133410911082e-05, + "loss": 1.0086, + "step": 9961 + }, + { + "epoch": 0.6749779795379091, + "grad_norm": 8.672296524047852, + "learning_rate": 8.738996509001302e-05, + "loss": 0.8854, + "step": 9962 + }, + { + "epoch": 0.6750457348058811, + "grad_norm": 5.923908710479736, + "learning_rate": 8.73885960709152e-05, + "loss": 0.7789, + "step": 9963 + }, + { + "epoch": 0.6751134900738532, + "grad_norm": 5.861504554748535, + "learning_rate": 8.738722705181738e-05, + "loss": 0.7219, + "step": 9964 + }, + { + "epoch": 0.6751812453418253, + "grad_norm": 8.418654441833496, + "learning_rate": 8.738585803271956e-05, + "loss": 0.7766, + "step": 9965 + }, + { + "epoch": 0.6752490006097974, + "grad_norm": 6.130760669708252, + "learning_rate": 8.738448901362174e-05, + "loss": 0.6711, + "step": 9966 + }, + { + "epoch": 0.6753167558777695, + "grad_norm": 6.2231526374816895, + "learning_rate": 8.738311999452393e-05, + "loss": 0.764, + "step": 9967 + }, + { + "epoch": 0.6753845111457416, + "grad_norm": 6.3183417320251465, + "learning_rate": 8.738175097542611e-05, + "loss": 0.6871, + "step": 9968 + }, + { + "epoch": 0.6754522664137137, + "grad_norm": 7.590784549713135, + "learning_rate": 8.73803819563283e-05, + "loss": 0.8666, + "step": 9969 + }, + { + "epoch": 0.6755200216816858, + "grad_norm": 5.8160600662231445, + "learning_rate": 8.737901293723047e-05, + "loss": 1.0468, + "step": 9970 + }, + { + "epoch": 0.6755877769496579, + "grad_norm": 7.236125469207764, + "learning_rate": 8.737764391813266e-05, + "loss": 0.6594, + "step": 9971 + }, + { + "epoch": 0.67565553221763, + "grad_norm": 7.984396934509277, + "learning_rate": 8.737627489903485e-05, + "loss": 0.8579, + "step": 9972 + }, + { + "epoch": 0.675723287485602, + "grad_norm": 6.760086536407471, + "learning_rate": 8.737490587993703e-05, + "loss": 0.6263, + "step": 9973 + }, + { + "epoch": 0.675791042753574, + "grad_norm": 6.506689071655273, + "learning_rate": 8.737353686083921e-05, + "loss": 0.8551, + "step": 9974 + }, + { + "epoch": 0.6758587980215461, + "grad_norm": 6.718931674957275, + "learning_rate": 8.737216784174139e-05, + "loss": 0.8464, + "step": 9975 + }, + { + "epoch": 0.6759265532895182, + "grad_norm": 7.459061145782471, + "learning_rate": 8.737079882264358e-05, + "loss": 0.8845, + "step": 9976 + }, + { + "epoch": 0.6759943085574903, + "grad_norm": 6.885556221008301, + "learning_rate": 8.736942980354576e-05, + "loss": 0.963, + "step": 9977 + }, + { + "epoch": 0.6760620638254624, + "grad_norm": 6.1340861320495605, + "learning_rate": 8.736806078444794e-05, + "loss": 0.8624, + "step": 9978 + }, + { + "epoch": 0.6761298190934345, + "grad_norm": 6.667585372924805, + "learning_rate": 8.736669176535012e-05, + "loss": 0.9265, + "step": 9979 + }, + { + "epoch": 0.6761975743614066, + "grad_norm": 5.923494338989258, + "learning_rate": 8.73653227462523e-05, + "loss": 0.9644, + "step": 9980 + }, + { + "epoch": 0.6762653296293787, + "grad_norm": 6.134291648864746, + "learning_rate": 8.73639537271545e-05, + "loss": 0.821, + "step": 9981 + }, + { + "epoch": 0.6763330848973508, + "grad_norm": 8.043211936950684, + "learning_rate": 8.736258470805668e-05, + "loss": 1.1772, + "step": 9982 + }, + { + "epoch": 0.6764008401653229, + "grad_norm": 5.358396053314209, + "learning_rate": 8.736121568895886e-05, + "loss": 0.8436, + "step": 9983 + }, + { + "epoch": 0.676468595433295, + "grad_norm": 6.659102916717529, + "learning_rate": 8.735984666986104e-05, + "loss": 0.9908, + "step": 9984 + }, + { + "epoch": 0.676536350701267, + "grad_norm": 5.888321876525879, + "learning_rate": 8.735847765076323e-05, + "loss": 0.8068, + "step": 9985 + }, + { + "epoch": 0.6766041059692391, + "grad_norm": 6.159606456756592, + "learning_rate": 8.735710863166541e-05, + "loss": 0.7639, + "step": 9986 + }, + { + "epoch": 0.6766718612372112, + "grad_norm": 6.044656276702881, + "learning_rate": 8.73557396125676e-05, + "loss": 1.0384, + "step": 9987 + }, + { + "epoch": 0.6767396165051833, + "grad_norm": 6.538012504577637, + "learning_rate": 8.735437059346978e-05, + "loss": 0.9542, + "step": 9988 + }, + { + "epoch": 0.6768073717731553, + "grad_norm": 6.6876301765441895, + "learning_rate": 8.735300157437196e-05, + "loss": 0.8471, + "step": 9989 + }, + { + "epoch": 0.6768751270411274, + "grad_norm": 7.488297462463379, + "learning_rate": 8.735163255527415e-05, + "loss": 0.797, + "step": 9990 + }, + { + "epoch": 0.6769428823090995, + "grad_norm": 5.536472320556641, + "learning_rate": 8.735026353617633e-05, + "loss": 0.7013, + "step": 9991 + }, + { + "epoch": 0.6770106375770716, + "grad_norm": 6.263519763946533, + "learning_rate": 8.734889451707851e-05, + "loss": 0.991, + "step": 9992 + }, + { + "epoch": 0.6770783928450437, + "grad_norm": 5.862089157104492, + "learning_rate": 8.73475254979807e-05, + "loss": 0.7879, + "step": 9993 + }, + { + "epoch": 0.6771461481130158, + "grad_norm": 4.833024501800537, + "learning_rate": 8.734615647888288e-05, + "loss": 0.7024, + "step": 9994 + }, + { + "epoch": 0.6772139033809879, + "grad_norm": 6.159411430358887, + "learning_rate": 8.734478745978506e-05, + "loss": 0.7301, + "step": 9995 + }, + { + "epoch": 0.67728165864896, + "grad_norm": 5.821317672729492, + "learning_rate": 8.734341844068726e-05, + "loss": 0.7049, + "step": 9996 + }, + { + "epoch": 0.677349413916932, + "grad_norm": 6.7138872146606445, + "learning_rate": 8.734204942158944e-05, + "loss": 0.9978, + "step": 9997 + }, + { + "epoch": 0.6774171691849041, + "grad_norm": 6.190992832183838, + "learning_rate": 8.734068040249162e-05, + "loss": 0.8829, + "step": 9998 + }, + { + "epoch": 0.6774849244528762, + "grad_norm": 6.912006378173828, + "learning_rate": 8.733931138339381e-05, + "loss": 0.7127, + "step": 9999 + }, + { + "epoch": 0.6775526797208483, + "grad_norm": 7.386782169342041, + "learning_rate": 8.7337942364296e-05, + "loss": 0.7233, + "step": 10000 + }, + { + "epoch": 0.6776204349888204, + "grad_norm": 6.710719585418701, + "learning_rate": 8.733657334519817e-05, + "loss": 0.8965, + "step": 10001 + }, + { + "epoch": 0.6776881902567925, + "grad_norm": 5.841215133666992, + "learning_rate": 8.733520432610035e-05, + "loss": 1.0521, + "step": 10002 + }, + { + "epoch": 0.6777559455247646, + "grad_norm": 7.529298782348633, + "learning_rate": 8.733383530700253e-05, + "loss": 0.9908, + "step": 10003 + }, + { + "epoch": 0.6778237007927367, + "grad_norm": 6.102165699005127, + "learning_rate": 8.733246628790473e-05, + "loss": 0.7755, + "step": 10004 + }, + { + "epoch": 0.6778914560607088, + "grad_norm": 5.879965782165527, + "learning_rate": 8.733109726880691e-05, + "loss": 0.6404, + "step": 10005 + }, + { + "epoch": 0.6779592113286808, + "grad_norm": 5.979146480560303, + "learning_rate": 8.732972824970909e-05, + "loss": 0.6719, + "step": 10006 + }, + { + "epoch": 0.6780269665966528, + "grad_norm": 6.233981609344482, + "learning_rate": 8.732835923061127e-05, + "loss": 0.7691, + "step": 10007 + }, + { + "epoch": 0.6780947218646249, + "grad_norm": 5.87860107421875, + "learning_rate": 8.732699021151346e-05, + "loss": 0.611, + "step": 10008 + }, + { + "epoch": 0.678162477132597, + "grad_norm": 5.905231952667236, + "learning_rate": 8.732562119241564e-05, + "loss": 0.8869, + "step": 10009 + }, + { + "epoch": 0.6782302324005691, + "grad_norm": 5.827250003814697, + "learning_rate": 8.732425217331782e-05, + "loss": 0.8758, + "step": 10010 + }, + { + "epoch": 0.6782979876685412, + "grad_norm": 6.833671569824219, + "learning_rate": 8.732288315422e-05, + "loss": 0.653, + "step": 10011 + }, + { + "epoch": 0.6783657429365133, + "grad_norm": 6.360918045043945, + "learning_rate": 8.732151413512218e-05, + "loss": 0.8103, + "step": 10012 + }, + { + "epoch": 0.6784334982044854, + "grad_norm": 5.429161548614502, + "learning_rate": 8.732014511602438e-05, + "loss": 0.742, + "step": 10013 + }, + { + "epoch": 0.6785012534724575, + "grad_norm": 7.001778602600098, + "learning_rate": 8.731877609692656e-05, + "loss": 0.7378, + "step": 10014 + }, + { + "epoch": 0.6785690087404296, + "grad_norm": 6.182924270629883, + "learning_rate": 8.731740707782874e-05, + "loss": 0.9072, + "step": 10015 + }, + { + "epoch": 0.6786367640084017, + "grad_norm": 6.020389080047607, + "learning_rate": 8.731603805873092e-05, + "loss": 0.7117, + "step": 10016 + }, + { + "epoch": 0.6787045192763738, + "grad_norm": 6.754500865936279, + "learning_rate": 8.731466903963311e-05, + "loss": 1.0786, + "step": 10017 + }, + { + "epoch": 0.6787722745443459, + "grad_norm": 6.181823253631592, + "learning_rate": 8.73133000205353e-05, + "loss": 0.9011, + "step": 10018 + }, + { + "epoch": 0.678840029812318, + "grad_norm": 5.791422367095947, + "learning_rate": 8.731193100143747e-05, + "loss": 0.6102, + "step": 10019 + }, + { + "epoch": 0.67890778508029, + "grad_norm": 6.784679412841797, + "learning_rate": 8.731056198233965e-05, + "loss": 0.7262, + "step": 10020 + }, + { + "epoch": 0.6789755403482621, + "grad_norm": 6.07905387878418, + "learning_rate": 8.730919296324183e-05, + "loss": 1.0617, + "step": 10021 + }, + { + "epoch": 0.6790432956162341, + "grad_norm": 6.993971347808838, + "learning_rate": 8.730782394414403e-05, + "loss": 0.9212, + "step": 10022 + }, + { + "epoch": 0.6791110508842062, + "grad_norm": 6.363950729370117, + "learning_rate": 8.730645492504621e-05, + "loss": 1.0471, + "step": 10023 + }, + { + "epoch": 0.6791788061521783, + "grad_norm": 7.276193618774414, + "learning_rate": 8.730508590594839e-05, + "loss": 0.8397, + "step": 10024 + }, + { + "epoch": 0.6792465614201504, + "grad_norm": 6.6545491218566895, + "learning_rate": 8.730371688685057e-05, + "loss": 0.6311, + "step": 10025 + }, + { + "epoch": 0.6793143166881225, + "grad_norm": 4.888131141662598, + "learning_rate": 8.730234786775275e-05, + "loss": 0.5807, + "step": 10026 + }, + { + "epoch": 0.6793820719560946, + "grad_norm": 5.920746326446533, + "learning_rate": 8.730097884865494e-05, + "loss": 0.624, + "step": 10027 + }, + { + "epoch": 0.6794498272240667, + "grad_norm": 5.996628761291504, + "learning_rate": 8.729960982955712e-05, + "loss": 0.831, + "step": 10028 + }, + { + "epoch": 0.6795175824920388, + "grad_norm": 7.457335472106934, + "learning_rate": 8.72982408104593e-05, + "loss": 0.7734, + "step": 10029 + }, + { + "epoch": 0.6795853377600108, + "grad_norm": 5.239184856414795, + "learning_rate": 8.729687179136148e-05, + "loss": 0.6022, + "step": 10030 + }, + { + "epoch": 0.6796530930279829, + "grad_norm": 8.256555557250977, + "learning_rate": 8.729550277226368e-05, + "loss": 1.265, + "step": 10031 + }, + { + "epoch": 0.679720848295955, + "grad_norm": 5.370911121368408, + "learning_rate": 8.729413375316586e-05, + "loss": 0.847, + "step": 10032 + }, + { + "epoch": 0.6797886035639271, + "grad_norm": 6.657277584075928, + "learning_rate": 8.729276473406804e-05, + "loss": 1.0096, + "step": 10033 + }, + { + "epoch": 0.6798563588318992, + "grad_norm": 7.417891502380371, + "learning_rate": 8.729139571497022e-05, + "loss": 0.8401, + "step": 10034 + }, + { + "epoch": 0.6799241140998713, + "grad_norm": 6.959234237670898, + "learning_rate": 8.72900266958724e-05, + "loss": 0.8181, + "step": 10035 + }, + { + "epoch": 0.6799918693678434, + "grad_norm": 6.10018253326416, + "learning_rate": 8.72886576767746e-05, + "loss": 0.7132, + "step": 10036 + }, + { + "epoch": 0.6800596246358155, + "grad_norm": 6.869701385498047, + "learning_rate": 8.728728865767677e-05, + "loss": 0.6435, + "step": 10037 + }, + { + "epoch": 0.6801273799037875, + "grad_norm": 6.735001087188721, + "learning_rate": 8.728591963857895e-05, + "loss": 0.628, + "step": 10038 + }, + { + "epoch": 0.6801951351717596, + "grad_norm": 5.84961462020874, + "learning_rate": 8.728455061948115e-05, + "loss": 0.7503, + "step": 10039 + }, + { + "epoch": 0.6802628904397316, + "grad_norm": 6.914517879486084, + "learning_rate": 8.728318160038333e-05, + "loss": 0.8171, + "step": 10040 + }, + { + "epoch": 0.6803306457077037, + "grad_norm": 8.97480297088623, + "learning_rate": 8.728181258128551e-05, + "loss": 0.9635, + "step": 10041 + }, + { + "epoch": 0.6803984009756758, + "grad_norm": 5.28425407409668, + "learning_rate": 8.72804435621877e-05, + "loss": 0.656, + "step": 10042 + }, + { + "epoch": 0.6804661562436479, + "grad_norm": 6.638722896575928, + "learning_rate": 8.727907454308988e-05, + "loss": 0.797, + "step": 10043 + }, + { + "epoch": 0.68053391151162, + "grad_norm": 9.306808471679688, + "learning_rate": 8.727770552399206e-05, + "loss": 1.0276, + "step": 10044 + }, + { + "epoch": 0.6806016667795921, + "grad_norm": 5.549346446990967, + "learning_rate": 8.727633650489426e-05, + "loss": 0.7556, + "step": 10045 + }, + { + "epoch": 0.6806694220475642, + "grad_norm": 9.099546432495117, + "learning_rate": 8.727496748579644e-05, + "loss": 0.9813, + "step": 10046 + }, + { + "epoch": 0.6807371773155363, + "grad_norm": 6.115594863891602, + "learning_rate": 8.727359846669862e-05, + "loss": 0.7509, + "step": 10047 + }, + { + "epoch": 0.6808049325835084, + "grad_norm": 6.244608402252197, + "learning_rate": 8.72722294476008e-05, + "loss": 0.8274, + "step": 10048 + }, + { + "epoch": 0.6808726878514805, + "grad_norm": 5.933996677398682, + "learning_rate": 8.727086042850298e-05, + "loss": 0.9218, + "step": 10049 + }, + { + "epoch": 0.6809404431194526, + "grad_norm": 6.216856002807617, + "learning_rate": 8.726949140940517e-05, + "loss": 0.8434, + "step": 10050 + }, + { + "epoch": 0.6810081983874247, + "grad_norm": 8.29095458984375, + "learning_rate": 8.726812239030735e-05, + "loss": 0.9376, + "step": 10051 + }, + { + "epoch": 0.6810759536553967, + "grad_norm": 6.203293323516846, + "learning_rate": 8.726675337120953e-05, + "loss": 0.865, + "step": 10052 + }, + { + "epoch": 0.6811437089233688, + "grad_norm": 7.393670082092285, + "learning_rate": 8.726538435211171e-05, + "loss": 0.9512, + "step": 10053 + }, + { + "epoch": 0.6812114641913409, + "grad_norm": 5.229176044464111, + "learning_rate": 8.726401533301391e-05, + "loss": 0.6983, + "step": 10054 + }, + { + "epoch": 0.6812792194593129, + "grad_norm": 6.016887187957764, + "learning_rate": 8.726264631391609e-05, + "loss": 0.8935, + "step": 10055 + }, + { + "epoch": 0.681346974727285, + "grad_norm": 8.189292907714844, + "learning_rate": 8.726127729481827e-05, + "loss": 0.9643, + "step": 10056 + }, + { + "epoch": 0.6814147299952571, + "grad_norm": 7.380198001861572, + "learning_rate": 8.725990827572045e-05, + "loss": 0.9616, + "step": 10057 + }, + { + "epoch": 0.6814824852632292, + "grad_norm": 5.0216546058654785, + "learning_rate": 8.725853925662263e-05, + "loss": 0.9492, + "step": 10058 + }, + { + "epoch": 0.6815502405312013, + "grad_norm": 4.686762809753418, + "learning_rate": 8.725717023752482e-05, + "loss": 0.7308, + "step": 10059 + }, + { + "epoch": 0.6816179957991734, + "grad_norm": 7.183378219604492, + "learning_rate": 8.7255801218427e-05, + "loss": 0.7267, + "step": 10060 + }, + { + "epoch": 0.6816857510671455, + "grad_norm": 6.899569988250732, + "learning_rate": 8.725443219932918e-05, + "loss": 1.2609, + "step": 10061 + }, + { + "epoch": 0.6817535063351176, + "grad_norm": 6.386919021606445, + "learning_rate": 8.725306318023136e-05, + "loss": 0.8907, + "step": 10062 + }, + { + "epoch": 0.6818212616030896, + "grad_norm": 6.532101631164551, + "learning_rate": 8.725169416113356e-05, + "loss": 0.8595, + "step": 10063 + }, + { + "epoch": 0.6818890168710617, + "grad_norm": 6.360471725463867, + "learning_rate": 8.725032514203574e-05, + "loss": 0.8667, + "step": 10064 + }, + { + "epoch": 0.6819567721390338, + "grad_norm": 6.6123480796813965, + "learning_rate": 8.724895612293792e-05, + "loss": 0.9812, + "step": 10065 + }, + { + "epoch": 0.6820245274070059, + "grad_norm": 7.219352722167969, + "learning_rate": 8.72475871038401e-05, + "loss": 0.8871, + "step": 10066 + }, + { + "epoch": 0.682092282674978, + "grad_norm": 5.686796188354492, + "learning_rate": 8.724621808474228e-05, + "loss": 0.6971, + "step": 10067 + }, + { + "epoch": 0.6821600379429501, + "grad_norm": 5.568495273590088, + "learning_rate": 8.724484906564447e-05, + "loss": 0.6986, + "step": 10068 + }, + { + "epoch": 0.6822277932109222, + "grad_norm": 6.997779369354248, + "learning_rate": 8.724348004654665e-05, + "loss": 1.0181, + "step": 10069 + }, + { + "epoch": 0.6822955484788943, + "grad_norm": 6.177464962005615, + "learning_rate": 8.724211102744883e-05, + "loss": 0.8675, + "step": 10070 + }, + { + "epoch": 0.6823633037468663, + "grad_norm": 5.105398654937744, + "learning_rate": 8.724074200835101e-05, + "loss": 0.7752, + "step": 10071 + }, + { + "epoch": 0.6824310590148384, + "grad_norm": 5.883255481719971, + "learning_rate": 8.723937298925321e-05, + "loss": 0.8301, + "step": 10072 + }, + { + "epoch": 0.6824988142828105, + "grad_norm": 5.932136058807373, + "learning_rate": 8.723800397015539e-05, + "loss": 0.7768, + "step": 10073 + }, + { + "epoch": 0.6825665695507825, + "grad_norm": 6.31164026260376, + "learning_rate": 8.723663495105757e-05, + "loss": 1.0803, + "step": 10074 + }, + { + "epoch": 0.6826343248187546, + "grad_norm": 4.6186089515686035, + "learning_rate": 8.723526593195975e-05, + "loss": 0.7403, + "step": 10075 + }, + { + "epoch": 0.6827020800867267, + "grad_norm": 6.847713470458984, + "learning_rate": 8.723389691286193e-05, + "loss": 0.8813, + "step": 10076 + }, + { + "epoch": 0.6827698353546988, + "grad_norm": 6.186002254486084, + "learning_rate": 8.723252789376412e-05, + "loss": 0.7373, + "step": 10077 + }, + { + "epoch": 0.6828375906226709, + "grad_norm": 5.436232089996338, + "learning_rate": 8.72311588746663e-05, + "loss": 0.8996, + "step": 10078 + }, + { + "epoch": 0.682905345890643, + "grad_norm": 7.013981819152832, + "learning_rate": 8.722978985556848e-05, + "loss": 0.6978, + "step": 10079 + }, + { + "epoch": 0.6829731011586151, + "grad_norm": 6.762590408325195, + "learning_rate": 8.722842083647066e-05, + "loss": 0.8004, + "step": 10080 + }, + { + "epoch": 0.6830408564265872, + "grad_norm": 6.398361682891846, + "learning_rate": 8.722705181737284e-05, + "loss": 0.8928, + "step": 10081 + }, + { + "epoch": 0.6831086116945593, + "grad_norm": 5.5384111404418945, + "learning_rate": 8.722568279827504e-05, + "loss": 0.6761, + "step": 10082 + }, + { + "epoch": 0.6831763669625314, + "grad_norm": 6.1554179191589355, + "learning_rate": 8.722431377917722e-05, + "loss": 0.8894, + "step": 10083 + }, + { + "epoch": 0.6832441222305035, + "grad_norm": 7.378477096557617, + "learning_rate": 8.72229447600794e-05, + "loss": 0.776, + "step": 10084 + }, + { + "epoch": 0.6833118774984756, + "grad_norm": 5.925946235656738, + "learning_rate": 8.72215757409816e-05, + "loss": 0.978, + "step": 10085 + }, + { + "epoch": 0.6833796327664476, + "grad_norm": 9.17393684387207, + "learning_rate": 8.722020672188377e-05, + "loss": 0.9067, + "step": 10086 + }, + { + "epoch": 0.6834473880344196, + "grad_norm": 5.253411293029785, + "learning_rate": 8.721883770278595e-05, + "loss": 0.822, + "step": 10087 + }, + { + "epoch": 0.6835151433023917, + "grad_norm": 5.1418046951293945, + "learning_rate": 8.721746868368815e-05, + "loss": 0.7038, + "step": 10088 + }, + { + "epoch": 0.6835828985703638, + "grad_norm": 6.0158281326293945, + "learning_rate": 8.721609966459033e-05, + "loss": 1.0841, + "step": 10089 + }, + { + "epoch": 0.6836506538383359, + "grad_norm": 5.677688121795654, + "learning_rate": 8.721473064549251e-05, + "loss": 0.7498, + "step": 10090 + }, + { + "epoch": 0.683718409106308, + "grad_norm": 7.7817254066467285, + "learning_rate": 8.72133616263947e-05, + "loss": 0.5788, + "step": 10091 + }, + { + "epoch": 0.6837861643742801, + "grad_norm": 5.363152503967285, + "learning_rate": 8.721199260729688e-05, + "loss": 0.7424, + "step": 10092 + }, + { + "epoch": 0.6838539196422522, + "grad_norm": 6.539010524749756, + "learning_rate": 8.721062358819906e-05, + "loss": 0.8402, + "step": 10093 + }, + { + "epoch": 0.6839216749102243, + "grad_norm": 5.907912254333496, + "learning_rate": 8.720925456910124e-05, + "loss": 0.9475, + "step": 10094 + }, + { + "epoch": 0.6839894301781964, + "grad_norm": 6.317841529846191, + "learning_rate": 8.720788555000344e-05, + "loss": 0.7352, + "step": 10095 + }, + { + "epoch": 0.6840571854461684, + "grad_norm": 6.071649074554443, + "learning_rate": 8.720651653090562e-05, + "loss": 0.7539, + "step": 10096 + }, + { + "epoch": 0.6841249407141405, + "grad_norm": 7.052052974700928, + "learning_rate": 8.72051475118078e-05, + "loss": 0.788, + "step": 10097 + }, + { + "epoch": 0.6841926959821126, + "grad_norm": 5.975690841674805, + "learning_rate": 8.720377849270998e-05, + "loss": 0.8239, + "step": 10098 + }, + { + "epoch": 0.6842604512500847, + "grad_norm": 5.96523904800415, + "learning_rate": 8.720240947361216e-05, + "loss": 0.9525, + "step": 10099 + }, + { + "epoch": 0.6843282065180568, + "grad_norm": 6.296563148498535, + "learning_rate": 8.720104045451435e-05, + "loss": 0.691, + "step": 10100 + }, + { + "epoch": 0.6843959617860289, + "grad_norm": 5.338788986206055, + "learning_rate": 8.719967143541653e-05, + "loss": 0.7297, + "step": 10101 + }, + { + "epoch": 0.684463717054001, + "grad_norm": 6.04310417175293, + "learning_rate": 8.719830241631871e-05, + "loss": 0.8976, + "step": 10102 + }, + { + "epoch": 0.684531472321973, + "grad_norm": 7.260922431945801, + "learning_rate": 8.71969333972209e-05, + "loss": 0.804, + "step": 10103 + }, + { + "epoch": 0.6845992275899451, + "grad_norm": 7.797060489654541, + "learning_rate": 8.719556437812307e-05, + "loss": 1.1672, + "step": 10104 + }, + { + "epoch": 0.6846669828579172, + "grad_norm": 4.863615989685059, + "learning_rate": 8.719419535902527e-05, + "loss": 0.7793, + "step": 10105 + }, + { + "epoch": 0.6847347381258893, + "grad_norm": 6.105317115783691, + "learning_rate": 8.719282633992745e-05, + "loss": 0.7928, + "step": 10106 + }, + { + "epoch": 0.6848024933938613, + "grad_norm": 5.737043380737305, + "learning_rate": 8.719145732082963e-05, + "loss": 0.627, + "step": 10107 + }, + { + "epoch": 0.6848702486618334, + "grad_norm": 5.305082321166992, + "learning_rate": 8.719008830173181e-05, + "loss": 0.6404, + "step": 10108 + }, + { + "epoch": 0.6849380039298055, + "grad_norm": 6.310640335083008, + "learning_rate": 8.7188719282634e-05, + "loss": 1.0716, + "step": 10109 + }, + { + "epoch": 0.6850057591977776, + "grad_norm": 6.286160469055176, + "learning_rate": 8.718735026353618e-05, + "loss": 0.8632, + "step": 10110 + }, + { + "epoch": 0.6850735144657497, + "grad_norm": 6.600961208343506, + "learning_rate": 8.718598124443836e-05, + "loss": 0.7053, + "step": 10111 + }, + { + "epoch": 0.6851412697337218, + "grad_norm": 7.745927333831787, + "learning_rate": 8.718461222534054e-05, + "loss": 1.3105, + "step": 10112 + }, + { + "epoch": 0.6852090250016939, + "grad_norm": 7.073805332183838, + "learning_rate": 8.718324320624272e-05, + "loss": 1.0302, + "step": 10113 + }, + { + "epoch": 0.685276780269666, + "grad_norm": 6.142061233520508, + "learning_rate": 8.718187418714492e-05, + "loss": 0.7433, + "step": 10114 + }, + { + "epoch": 0.6853445355376381, + "grad_norm": 6.279247760772705, + "learning_rate": 8.71805051680471e-05, + "loss": 0.9076, + "step": 10115 + }, + { + "epoch": 0.6854122908056102, + "grad_norm": 6.235780239105225, + "learning_rate": 8.717913614894928e-05, + "loss": 1.0124, + "step": 10116 + }, + { + "epoch": 0.6854800460735823, + "grad_norm": 7.218427658081055, + "learning_rate": 8.717776712985146e-05, + "loss": 0.8519, + "step": 10117 + }, + { + "epoch": 0.6855478013415544, + "grad_norm": 5.481386661529541, + "learning_rate": 8.717639811075365e-05, + "loss": 0.6997, + "step": 10118 + }, + { + "epoch": 0.6856155566095264, + "grad_norm": 6.176963806152344, + "learning_rate": 8.717502909165583e-05, + "loss": 0.7735, + "step": 10119 + }, + { + "epoch": 0.6856833118774984, + "grad_norm": 5.452063083648682, + "learning_rate": 8.717366007255801e-05, + "loss": 0.6307, + "step": 10120 + }, + { + "epoch": 0.6857510671454705, + "grad_norm": 8.103320121765137, + "learning_rate": 8.71722910534602e-05, + "loss": 0.794, + "step": 10121 + }, + { + "epoch": 0.6858188224134426, + "grad_norm": 7.873292922973633, + "learning_rate": 8.717092203436237e-05, + "loss": 0.8941, + "step": 10122 + }, + { + "epoch": 0.6858865776814147, + "grad_norm": 6.97474479675293, + "learning_rate": 8.716955301526457e-05, + "loss": 0.627, + "step": 10123 + }, + { + "epoch": 0.6859543329493868, + "grad_norm": 7.643387794494629, + "learning_rate": 8.716818399616675e-05, + "loss": 0.8326, + "step": 10124 + }, + { + "epoch": 0.6860220882173589, + "grad_norm": 8.497008323669434, + "learning_rate": 8.716681497706893e-05, + "loss": 0.959, + "step": 10125 + }, + { + "epoch": 0.686089843485331, + "grad_norm": 6.50831413269043, + "learning_rate": 8.716544595797111e-05, + "loss": 0.6411, + "step": 10126 + }, + { + "epoch": 0.6861575987533031, + "grad_norm": 7.354664325714111, + "learning_rate": 8.71640769388733e-05, + "loss": 0.9987, + "step": 10127 + }, + { + "epoch": 0.6862253540212752, + "grad_norm": 7.188365936279297, + "learning_rate": 8.716270791977548e-05, + "loss": 0.9559, + "step": 10128 + }, + { + "epoch": 0.6862931092892472, + "grad_norm": 6.465545177459717, + "learning_rate": 8.716133890067766e-05, + "loss": 0.9607, + "step": 10129 + }, + { + "epoch": 0.6863608645572193, + "grad_norm": 5.860640048980713, + "learning_rate": 8.715996988157984e-05, + "loss": 1.0831, + "step": 10130 + }, + { + "epoch": 0.6864286198251914, + "grad_norm": 5.8733296394348145, + "learning_rate": 8.715860086248204e-05, + "loss": 0.94, + "step": 10131 + }, + { + "epoch": 0.6864963750931635, + "grad_norm": 6.498860836029053, + "learning_rate": 8.715723184338422e-05, + "loss": 0.9037, + "step": 10132 + }, + { + "epoch": 0.6865641303611356, + "grad_norm": 6.163070201873779, + "learning_rate": 8.71558628242864e-05, + "loss": 0.7879, + "step": 10133 + }, + { + "epoch": 0.6866318856291077, + "grad_norm": 5.94657564163208, + "learning_rate": 8.715449380518859e-05, + "loss": 0.8599, + "step": 10134 + }, + { + "epoch": 0.6866996408970798, + "grad_norm": 6.023927211761475, + "learning_rate": 8.715312478609077e-05, + "loss": 0.7475, + "step": 10135 + }, + { + "epoch": 0.6867673961650518, + "grad_norm": 6.270586967468262, + "learning_rate": 8.715175576699295e-05, + "loss": 0.754, + "step": 10136 + }, + { + "epoch": 0.6868351514330239, + "grad_norm": 6.190057277679443, + "learning_rate": 8.715038674789515e-05, + "loss": 0.6548, + "step": 10137 + }, + { + "epoch": 0.686902906700996, + "grad_norm": 5.793675899505615, + "learning_rate": 8.714901772879733e-05, + "loss": 0.5634, + "step": 10138 + }, + { + "epoch": 0.686970661968968, + "grad_norm": 6.0643310546875, + "learning_rate": 8.714764870969951e-05, + "loss": 1.0238, + "step": 10139 + }, + { + "epoch": 0.6870384172369401, + "grad_norm": 5.128119945526123, + "learning_rate": 8.714627969060169e-05, + "loss": 0.6071, + "step": 10140 + }, + { + "epoch": 0.6871061725049122, + "grad_norm": 6.435021877288818, + "learning_rate": 8.714491067150388e-05, + "loss": 0.7108, + "step": 10141 + }, + { + "epoch": 0.6871739277728843, + "grad_norm": 7.8082122802734375, + "learning_rate": 8.714354165240606e-05, + "loss": 1.0643, + "step": 10142 + }, + { + "epoch": 0.6872416830408564, + "grad_norm": 8.007335662841797, + "learning_rate": 8.714217263330824e-05, + "loss": 0.962, + "step": 10143 + }, + { + "epoch": 0.6873094383088285, + "grad_norm": 6.111318588256836, + "learning_rate": 8.714080361421042e-05, + "loss": 0.8697, + "step": 10144 + }, + { + "epoch": 0.6873771935768006, + "grad_norm": 5.64456844329834, + "learning_rate": 8.71394345951126e-05, + "loss": 1.128, + "step": 10145 + }, + { + "epoch": 0.6874449488447727, + "grad_norm": 7.482577323913574, + "learning_rate": 8.71380655760148e-05, + "loss": 0.7496, + "step": 10146 + }, + { + "epoch": 0.6875127041127448, + "grad_norm": 5.837367534637451, + "learning_rate": 8.713669655691698e-05, + "loss": 0.7851, + "step": 10147 + }, + { + "epoch": 0.6875804593807169, + "grad_norm": 5.765466213226318, + "learning_rate": 8.713532753781916e-05, + "loss": 0.718, + "step": 10148 + }, + { + "epoch": 0.687648214648689, + "grad_norm": 7.491219520568848, + "learning_rate": 8.713395851872134e-05, + "loss": 1.0697, + "step": 10149 + }, + { + "epoch": 0.6877159699166611, + "grad_norm": 6.724207401275635, + "learning_rate": 8.713258949962353e-05, + "loss": 0.7963, + "step": 10150 + }, + { + "epoch": 0.6877837251846332, + "grad_norm": 6.340019226074219, + "learning_rate": 8.713122048052571e-05, + "loss": 1.0246, + "step": 10151 + }, + { + "epoch": 0.6878514804526051, + "grad_norm": 4.966742038726807, + "learning_rate": 8.712985146142789e-05, + "loss": 0.9185, + "step": 10152 + }, + { + "epoch": 0.6879192357205772, + "grad_norm": 6.308014869689941, + "learning_rate": 8.712848244233007e-05, + "loss": 0.9262, + "step": 10153 + }, + { + "epoch": 0.6879869909885493, + "grad_norm": 7.576474189758301, + "learning_rate": 8.712711342323225e-05, + "loss": 0.7432, + "step": 10154 + }, + { + "epoch": 0.6880547462565214, + "grad_norm": 5.722362041473389, + "learning_rate": 8.712574440413445e-05, + "loss": 0.76, + "step": 10155 + }, + { + "epoch": 0.6881225015244935, + "grad_norm": 4.6947197914123535, + "learning_rate": 8.712437538503663e-05, + "loss": 0.7429, + "step": 10156 + }, + { + "epoch": 0.6881902567924656, + "grad_norm": 6.494797229766846, + "learning_rate": 8.712300636593881e-05, + "loss": 0.9554, + "step": 10157 + }, + { + "epoch": 0.6882580120604377, + "grad_norm": 7.0943403244018555, + "learning_rate": 8.712163734684099e-05, + "loss": 0.9698, + "step": 10158 + }, + { + "epoch": 0.6883257673284098, + "grad_norm": 5.764694690704346, + "learning_rate": 8.712026832774317e-05, + "loss": 0.5992, + "step": 10159 + }, + { + "epoch": 0.6883935225963819, + "grad_norm": 8.111281394958496, + "learning_rate": 8.711889930864536e-05, + "loss": 0.9348, + "step": 10160 + }, + { + "epoch": 0.688461277864354, + "grad_norm": 6.08704948425293, + "learning_rate": 8.711753028954754e-05, + "loss": 0.7621, + "step": 10161 + }, + { + "epoch": 0.688529033132326, + "grad_norm": 7.329418659210205, + "learning_rate": 8.711616127044972e-05, + "loss": 0.8404, + "step": 10162 + }, + { + "epoch": 0.6885967884002981, + "grad_norm": 5.368319988250732, + "learning_rate": 8.71147922513519e-05, + "loss": 0.6699, + "step": 10163 + }, + { + "epoch": 0.6886645436682702, + "grad_norm": 4.94074821472168, + "learning_rate": 8.71134232322541e-05, + "loss": 0.7722, + "step": 10164 + }, + { + "epoch": 0.6887322989362423, + "grad_norm": 7.2699408531188965, + "learning_rate": 8.711205421315628e-05, + "loss": 0.9826, + "step": 10165 + }, + { + "epoch": 0.6888000542042144, + "grad_norm": 7.965369701385498, + "learning_rate": 8.711068519405846e-05, + "loss": 1.0658, + "step": 10166 + }, + { + "epoch": 0.6888678094721865, + "grad_norm": 6.148140907287598, + "learning_rate": 8.710931617496064e-05, + "loss": 0.7169, + "step": 10167 + }, + { + "epoch": 0.6889355647401586, + "grad_norm": 7.394513130187988, + "learning_rate": 8.710794715586282e-05, + "loss": 1.0415, + "step": 10168 + }, + { + "epoch": 0.6890033200081306, + "grad_norm": 6.450402736663818, + "learning_rate": 8.710657813676501e-05, + "loss": 0.8588, + "step": 10169 + }, + { + "epoch": 0.6890710752761027, + "grad_norm": 7.909549236297607, + "learning_rate": 8.71052091176672e-05, + "loss": 0.8389, + "step": 10170 + }, + { + "epoch": 0.6891388305440748, + "grad_norm": 6.7112603187561035, + "learning_rate": 8.710384009856937e-05, + "loss": 0.8337, + "step": 10171 + }, + { + "epoch": 0.6892065858120469, + "grad_norm": 5.394613265991211, + "learning_rate": 8.710247107947155e-05, + "loss": 0.6975, + "step": 10172 + }, + { + "epoch": 0.689274341080019, + "grad_norm": 7.6131911277771, + "learning_rate": 8.710110206037375e-05, + "loss": 0.9802, + "step": 10173 + }, + { + "epoch": 0.689342096347991, + "grad_norm": 5.598437786102295, + "learning_rate": 8.709973304127593e-05, + "loss": 0.6925, + "step": 10174 + }, + { + "epoch": 0.6894098516159631, + "grad_norm": 7.252137660980225, + "learning_rate": 8.709836402217811e-05, + "loss": 0.9988, + "step": 10175 + }, + { + "epoch": 0.6894776068839352, + "grad_norm": 5.04716682434082, + "learning_rate": 8.709699500308029e-05, + "loss": 0.6327, + "step": 10176 + }, + { + "epoch": 0.6895453621519073, + "grad_norm": 5.729875564575195, + "learning_rate": 8.709562598398248e-05, + "loss": 0.7374, + "step": 10177 + }, + { + "epoch": 0.6896131174198794, + "grad_norm": 6.3525166511535645, + "learning_rate": 8.709425696488466e-05, + "loss": 0.9111, + "step": 10178 + }, + { + "epoch": 0.6896808726878515, + "grad_norm": 8.86587905883789, + "learning_rate": 8.709288794578684e-05, + "loss": 0.6059, + "step": 10179 + }, + { + "epoch": 0.6897486279558236, + "grad_norm": 6.739536762237549, + "learning_rate": 8.709151892668904e-05, + "loss": 0.6525, + "step": 10180 + }, + { + "epoch": 0.6898163832237957, + "grad_norm": 6.119062900543213, + "learning_rate": 8.709014990759122e-05, + "loss": 0.8521, + "step": 10181 + }, + { + "epoch": 0.6898841384917678, + "grad_norm": 6.746237754821777, + "learning_rate": 8.70887808884934e-05, + "loss": 0.7897, + "step": 10182 + }, + { + "epoch": 0.6899518937597399, + "grad_norm": 6.002828121185303, + "learning_rate": 8.708741186939559e-05, + "loss": 0.7147, + "step": 10183 + }, + { + "epoch": 0.690019649027712, + "grad_norm": 7.710058212280273, + "learning_rate": 8.708604285029777e-05, + "loss": 0.6674, + "step": 10184 + }, + { + "epoch": 0.6900874042956839, + "grad_norm": 7.451826572418213, + "learning_rate": 8.708467383119995e-05, + "loss": 0.8059, + "step": 10185 + }, + { + "epoch": 0.690155159563656, + "grad_norm": 7.2232465744018555, + "learning_rate": 8.708330481210213e-05, + "loss": 0.8831, + "step": 10186 + }, + { + "epoch": 0.6902229148316281, + "grad_norm": 6.742265701293945, + "learning_rate": 8.708193579300433e-05, + "loss": 0.851, + "step": 10187 + }, + { + "epoch": 0.6902906700996002, + "grad_norm": 6.47482967376709, + "learning_rate": 8.708056677390651e-05, + "loss": 0.7428, + "step": 10188 + }, + { + "epoch": 0.6903584253675723, + "grad_norm": 6.393016815185547, + "learning_rate": 8.707919775480869e-05, + "loss": 0.9357, + "step": 10189 + }, + { + "epoch": 0.6904261806355444, + "grad_norm": 5.898126602172852, + "learning_rate": 8.707782873571087e-05, + "loss": 0.7807, + "step": 10190 + }, + { + "epoch": 0.6904939359035165, + "grad_norm": 7.400199890136719, + "learning_rate": 8.707645971661305e-05, + "loss": 0.9482, + "step": 10191 + }, + { + "epoch": 0.6905616911714886, + "grad_norm": 8.556063652038574, + "learning_rate": 8.707509069751524e-05, + "loss": 0.7055, + "step": 10192 + }, + { + "epoch": 0.6906294464394607, + "grad_norm": 7.6027936935424805, + "learning_rate": 8.707372167841742e-05, + "loss": 0.9158, + "step": 10193 + }, + { + "epoch": 0.6906972017074328, + "grad_norm": 6.305631637573242, + "learning_rate": 8.70723526593196e-05, + "loss": 0.8744, + "step": 10194 + }, + { + "epoch": 0.6907649569754049, + "grad_norm": 5.370072841644287, + "learning_rate": 8.707098364022178e-05, + "loss": 0.7291, + "step": 10195 + }, + { + "epoch": 0.690832712243377, + "grad_norm": 6.723821640014648, + "learning_rate": 8.706961462112398e-05, + "loss": 0.9333, + "step": 10196 + }, + { + "epoch": 0.690900467511349, + "grad_norm": 6.01531982421875, + "learning_rate": 8.706824560202616e-05, + "loss": 0.6833, + "step": 10197 + }, + { + "epoch": 0.6909682227793211, + "grad_norm": 7.747717380523682, + "learning_rate": 8.706687658292834e-05, + "loss": 1.0669, + "step": 10198 + }, + { + "epoch": 0.6910359780472932, + "grad_norm": 6.549305438995361, + "learning_rate": 8.706550756383052e-05, + "loss": 0.6726, + "step": 10199 + }, + { + "epoch": 0.6911037333152653, + "grad_norm": 5.983778476715088, + "learning_rate": 8.70641385447327e-05, + "loss": 0.7065, + "step": 10200 + }, + { + "epoch": 0.6911714885832373, + "grad_norm": 6.709543704986572, + "learning_rate": 8.706276952563489e-05, + "loss": 0.7668, + "step": 10201 + }, + { + "epoch": 0.6912392438512094, + "grad_norm": 6.432425498962402, + "learning_rate": 8.706140050653707e-05, + "loss": 0.8235, + "step": 10202 + }, + { + "epoch": 0.6913069991191815, + "grad_norm": 6.770932197570801, + "learning_rate": 8.706003148743925e-05, + "loss": 0.9328, + "step": 10203 + }, + { + "epoch": 0.6913747543871536, + "grad_norm": 6.075129508972168, + "learning_rate": 8.705866246834143e-05, + "loss": 0.6302, + "step": 10204 + }, + { + "epoch": 0.6914425096551257, + "grad_norm": 5.651463985443115, + "learning_rate": 8.705729344924363e-05, + "loss": 0.8457, + "step": 10205 + }, + { + "epoch": 0.6915102649230978, + "grad_norm": 4.9870524406433105, + "learning_rate": 8.705592443014581e-05, + "loss": 0.685, + "step": 10206 + }, + { + "epoch": 0.6915780201910698, + "grad_norm": 5.86956787109375, + "learning_rate": 8.705455541104799e-05, + "loss": 0.9293, + "step": 10207 + }, + { + "epoch": 0.6916457754590419, + "grad_norm": 6.015864849090576, + "learning_rate": 8.705318639195017e-05, + "loss": 0.7802, + "step": 10208 + }, + { + "epoch": 0.691713530727014, + "grad_norm": 6.092733383178711, + "learning_rate": 8.705181737285235e-05, + "loss": 0.9689, + "step": 10209 + }, + { + "epoch": 0.6917812859949861, + "grad_norm": 5.461453437805176, + "learning_rate": 8.705044835375454e-05, + "loss": 0.8396, + "step": 10210 + }, + { + "epoch": 0.6918490412629582, + "grad_norm": 5.914142608642578, + "learning_rate": 8.704907933465672e-05, + "loss": 0.7936, + "step": 10211 + }, + { + "epoch": 0.6919167965309303, + "grad_norm": 7.034205436706543, + "learning_rate": 8.70477103155589e-05, + "loss": 0.9324, + "step": 10212 + }, + { + "epoch": 0.6919845517989024, + "grad_norm": 5.621762275695801, + "learning_rate": 8.704634129646108e-05, + "loss": 0.7763, + "step": 10213 + }, + { + "epoch": 0.6920523070668745, + "grad_norm": 6.53078031539917, + "learning_rate": 8.704497227736326e-05, + "loss": 0.9933, + "step": 10214 + }, + { + "epoch": 0.6921200623348466, + "grad_norm": 6.093494415283203, + "learning_rate": 8.704360325826546e-05, + "loss": 0.8164, + "step": 10215 + }, + { + "epoch": 0.6921878176028187, + "grad_norm": 5.476284503936768, + "learning_rate": 8.704223423916764e-05, + "loss": 0.8249, + "step": 10216 + }, + { + "epoch": 0.6922555728707908, + "grad_norm": 8.038809776306152, + "learning_rate": 8.704086522006982e-05, + "loss": 1.1032, + "step": 10217 + }, + { + "epoch": 0.6923233281387627, + "grad_norm": 6.303304672241211, + "learning_rate": 8.7039496200972e-05, + "loss": 0.8769, + "step": 10218 + }, + { + "epoch": 0.6923910834067348, + "grad_norm": 5.814499855041504, + "learning_rate": 8.703812718187419e-05, + "loss": 0.9121, + "step": 10219 + }, + { + "epoch": 0.6924588386747069, + "grad_norm": 6.704540729522705, + "learning_rate": 8.703675816277637e-05, + "loss": 1.005, + "step": 10220 + }, + { + "epoch": 0.692526593942679, + "grad_norm": 5.562582969665527, + "learning_rate": 8.703538914367855e-05, + "loss": 0.7873, + "step": 10221 + }, + { + "epoch": 0.6925943492106511, + "grad_norm": 6.583497524261475, + "learning_rate": 8.703402012458073e-05, + "loss": 0.9385, + "step": 10222 + }, + { + "epoch": 0.6926621044786232, + "grad_norm": 6.376049518585205, + "learning_rate": 8.703265110548291e-05, + "loss": 0.9021, + "step": 10223 + }, + { + "epoch": 0.6927298597465953, + "grad_norm": 5.302101135253906, + "learning_rate": 8.703128208638511e-05, + "loss": 0.917, + "step": 10224 + }, + { + "epoch": 0.6927976150145674, + "grad_norm": 7.336282730102539, + "learning_rate": 8.702991306728729e-05, + "loss": 0.6929, + "step": 10225 + }, + { + "epoch": 0.6928653702825395, + "grad_norm": 6.04905366897583, + "learning_rate": 8.702854404818948e-05, + "loss": 0.7451, + "step": 10226 + }, + { + "epoch": 0.6929331255505116, + "grad_norm": 5.474672317504883, + "learning_rate": 8.702717502909166e-05, + "loss": 0.7867, + "step": 10227 + }, + { + "epoch": 0.6930008808184837, + "grad_norm": 5.713932037353516, + "learning_rate": 8.702580600999384e-05, + "loss": 0.7569, + "step": 10228 + }, + { + "epoch": 0.6930686360864557, + "grad_norm": 7.172578811645508, + "learning_rate": 8.702443699089604e-05, + "loss": 0.7756, + "step": 10229 + }, + { + "epoch": 0.6931363913544278, + "grad_norm": 5.1826019287109375, + "learning_rate": 8.702306797179822e-05, + "loss": 0.8408, + "step": 10230 + }, + { + "epoch": 0.6932041466223999, + "grad_norm": 8.113396644592285, + "learning_rate": 8.70216989527004e-05, + "loss": 0.8555, + "step": 10231 + }, + { + "epoch": 0.693271901890372, + "grad_norm": 5.396690845489502, + "learning_rate": 8.702032993360258e-05, + "loss": 0.8125, + "step": 10232 + }, + { + "epoch": 0.6933396571583441, + "grad_norm": 5.951786518096924, + "learning_rate": 8.701896091450477e-05, + "loss": 0.8391, + "step": 10233 + }, + { + "epoch": 0.6934074124263161, + "grad_norm": 7.0163774490356445, + "learning_rate": 8.701759189540695e-05, + "loss": 0.853, + "step": 10234 + }, + { + "epoch": 0.6934751676942882, + "grad_norm": 7.563843727111816, + "learning_rate": 8.701622287630913e-05, + "loss": 0.9711, + "step": 10235 + }, + { + "epoch": 0.6935429229622603, + "grad_norm": 5.8124284744262695, + "learning_rate": 8.701485385721131e-05, + "loss": 0.7018, + "step": 10236 + }, + { + "epoch": 0.6936106782302324, + "grad_norm": 6.22074556350708, + "learning_rate": 8.701348483811349e-05, + "loss": 0.7756, + "step": 10237 + }, + { + "epoch": 0.6936784334982045, + "grad_norm": 5.601717472076416, + "learning_rate": 8.701211581901569e-05, + "loss": 0.7832, + "step": 10238 + }, + { + "epoch": 0.6937461887661766, + "grad_norm": 7.209207534790039, + "learning_rate": 8.701074679991787e-05, + "loss": 0.7866, + "step": 10239 + }, + { + "epoch": 0.6938139440341486, + "grad_norm": 5.176044940948486, + "learning_rate": 8.700937778082005e-05, + "loss": 0.6875, + "step": 10240 + }, + { + "epoch": 0.6938816993021207, + "grad_norm": 6.441755771636963, + "learning_rate": 8.700800876172223e-05, + "loss": 0.6563, + "step": 10241 + }, + { + "epoch": 0.6939494545700928, + "grad_norm": 5.935150146484375, + "learning_rate": 8.700663974262442e-05, + "loss": 0.881, + "step": 10242 + }, + { + "epoch": 0.6940172098380649, + "grad_norm": 6.082694053649902, + "learning_rate": 8.70052707235266e-05, + "loss": 0.7059, + "step": 10243 + }, + { + "epoch": 0.694084965106037, + "grad_norm": 7.9285383224487305, + "learning_rate": 8.700390170442878e-05, + "loss": 0.8529, + "step": 10244 + }, + { + "epoch": 0.6941527203740091, + "grad_norm": 6.027041435241699, + "learning_rate": 8.700253268533096e-05, + "loss": 0.7919, + "step": 10245 + }, + { + "epoch": 0.6942204756419812, + "grad_norm": 6.956554889678955, + "learning_rate": 8.700116366623314e-05, + "loss": 0.8655, + "step": 10246 + }, + { + "epoch": 0.6942882309099533, + "grad_norm": 6.508672714233398, + "learning_rate": 8.699979464713534e-05, + "loss": 0.7306, + "step": 10247 + }, + { + "epoch": 0.6943559861779254, + "grad_norm": 6.117376804351807, + "learning_rate": 8.699842562803752e-05, + "loss": 0.7016, + "step": 10248 + }, + { + "epoch": 0.6944237414458975, + "grad_norm": 5.537294864654541, + "learning_rate": 8.69970566089397e-05, + "loss": 0.8452, + "step": 10249 + }, + { + "epoch": 0.6944914967138694, + "grad_norm": 6.4718241691589355, + "learning_rate": 8.699568758984188e-05, + "loss": 0.9693, + "step": 10250 + }, + { + "epoch": 0.6945592519818415, + "grad_norm": 6.249986171722412, + "learning_rate": 8.699431857074407e-05, + "loss": 0.8, + "step": 10251 + }, + { + "epoch": 0.6946270072498136, + "grad_norm": 5.435842037200928, + "learning_rate": 8.699294955164625e-05, + "loss": 0.684, + "step": 10252 + }, + { + "epoch": 0.6946947625177857, + "grad_norm": 7.15748405456543, + "learning_rate": 8.699158053254843e-05, + "loss": 0.9773, + "step": 10253 + }, + { + "epoch": 0.6947625177857578, + "grad_norm": 6.881677150726318, + "learning_rate": 8.699021151345061e-05, + "loss": 0.8775, + "step": 10254 + }, + { + "epoch": 0.6948302730537299, + "grad_norm": 4.51616096496582, + "learning_rate": 8.69888424943528e-05, + "loss": 0.615, + "step": 10255 + }, + { + "epoch": 0.694898028321702, + "grad_norm": 6.824566841125488, + "learning_rate": 8.698747347525499e-05, + "loss": 0.7369, + "step": 10256 + }, + { + "epoch": 0.6949657835896741, + "grad_norm": 5.30488395690918, + "learning_rate": 8.698610445615717e-05, + "loss": 0.7246, + "step": 10257 + }, + { + "epoch": 0.6950335388576462, + "grad_norm": 7.59017276763916, + "learning_rate": 8.698473543705935e-05, + "loss": 0.6782, + "step": 10258 + }, + { + "epoch": 0.6951012941256183, + "grad_norm": 7.0920867919921875, + "learning_rate": 8.698336641796153e-05, + "loss": 0.8515, + "step": 10259 + }, + { + "epoch": 0.6951690493935904, + "grad_norm": 6.791457653045654, + "learning_rate": 8.698199739886372e-05, + "loss": 0.761, + "step": 10260 + }, + { + "epoch": 0.6952368046615625, + "grad_norm": 5.619175910949707, + "learning_rate": 8.69806283797659e-05, + "loss": 1.08, + "step": 10261 + }, + { + "epoch": 0.6953045599295345, + "grad_norm": 6.623924732208252, + "learning_rate": 8.697925936066808e-05, + "loss": 0.8813, + "step": 10262 + }, + { + "epoch": 0.6953723151975066, + "grad_norm": 5.582348823547363, + "learning_rate": 8.697789034157026e-05, + "loss": 0.8114, + "step": 10263 + }, + { + "epoch": 0.6954400704654787, + "grad_norm": 6.643357753753662, + "learning_rate": 8.697652132247244e-05, + "loss": 0.8595, + "step": 10264 + }, + { + "epoch": 0.6955078257334508, + "grad_norm": 5.2722296714782715, + "learning_rate": 8.697515230337464e-05, + "loss": 0.757, + "step": 10265 + }, + { + "epoch": 0.6955755810014229, + "grad_norm": 6.885534286499023, + "learning_rate": 8.697378328427682e-05, + "loss": 0.9969, + "step": 10266 + }, + { + "epoch": 0.6956433362693949, + "grad_norm": 7.001368522644043, + "learning_rate": 8.6972414265179e-05, + "loss": 0.9588, + "step": 10267 + }, + { + "epoch": 0.695711091537367, + "grad_norm": 7.451569557189941, + "learning_rate": 8.697104524608118e-05, + "loss": 0.7535, + "step": 10268 + }, + { + "epoch": 0.6957788468053391, + "grad_norm": 7.196292877197266, + "learning_rate": 8.696967622698336e-05, + "loss": 1.1084, + "step": 10269 + }, + { + "epoch": 0.6958466020733112, + "grad_norm": 7.165349006652832, + "learning_rate": 8.696830720788555e-05, + "loss": 1.0226, + "step": 10270 + }, + { + "epoch": 0.6959143573412833, + "grad_norm": 6.453275680541992, + "learning_rate": 8.696693818878773e-05, + "loss": 0.6691, + "step": 10271 + }, + { + "epoch": 0.6959821126092554, + "grad_norm": 7.4177069664001465, + "learning_rate": 8.696556916968991e-05, + "loss": 0.6886, + "step": 10272 + }, + { + "epoch": 0.6960498678772274, + "grad_norm": 5.137477874755859, + "learning_rate": 8.696420015059211e-05, + "loss": 0.9635, + "step": 10273 + }, + { + "epoch": 0.6961176231451995, + "grad_norm": 7.988467693328857, + "learning_rate": 8.696283113149429e-05, + "loss": 0.7967, + "step": 10274 + }, + { + "epoch": 0.6961853784131716, + "grad_norm": 7.050410747528076, + "learning_rate": 8.696146211239647e-05, + "loss": 1.0407, + "step": 10275 + }, + { + "epoch": 0.6962531336811437, + "grad_norm": 6.711641311645508, + "learning_rate": 8.696009309329866e-05, + "loss": 0.8607, + "step": 10276 + }, + { + "epoch": 0.6963208889491158, + "grad_norm": 4.641478061676025, + "learning_rate": 8.695872407420084e-05, + "loss": 0.6665, + "step": 10277 + }, + { + "epoch": 0.6963886442170879, + "grad_norm": 5.737218379974365, + "learning_rate": 8.695735505510302e-05, + "loss": 0.6503, + "step": 10278 + }, + { + "epoch": 0.69645639948506, + "grad_norm": 5.518801212310791, + "learning_rate": 8.695598603600522e-05, + "loss": 0.9215, + "step": 10279 + }, + { + "epoch": 0.6965241547530321, + "grad_norm": 6.198950290679932, + "learning_rate": 8.69546170169074e-05, + "loss": 0.6259, + "step": 10280 + }, + { + "epoch": 0.6965919100210042, + "grad_norm": 7.505566596984863, + "learning_rate": 8.695324799780958e-05, + "loss": 0.7902, + "step": 10281 + }, + { + "epoch": 0.6966596652889763, + "grad_norm": 6.407822132110596, + "learning_rate": 8.695187897871176e-05, + "loss": 0.7535, + "step": 10282 + }, + { + "epoch": 0.6967274205569483, + "grad_norm": 7.691595554351807, + "learning_rate": 8.695050995961395e-05, + "loss": 0.7941, + "step": 10283 + }, + { + "epoch": 0.6967951758249203, + "grad_norm": 5.803621292114258, + "learning_rate": 8.694914094051613e-05, + "loss": 0.6529, + "step": 10284 + }, + { + "epoch": 0.6968629310928924, + "grad_norm": 6.0364580154418945, + "learning_rate": 8.694777192141831e-05, + "loss": 0.6357, + "step": 10285 + }, + { + "epoch": 0.6969306863608645, + "grad_norm": 6.369047164916992, + "learning_rate": 8.694640290232049e-05, + "loss": 0.7526, + "step": 10286 + }, + { + "epoch": 0.6969984416288366, + "grad_norm": 5.736650466918945, + "learning_rate": 8.694503388322267e-05, + "loss": 0.945, + "step": 10287 + }, + { + "epoch": 0.6970661968968087, + "grad_norm": 5.924343109130859, + "learning_rate": 8.694366486412487e-05, + "loss": 0.7378, + "step": 10288 + }, + { + "epoch": 0.6971339521647808, + "grad_norm": 8.118910789489746, + "learning_rate": 8.694229584502705e-05, + "loss": 1.2927, + "step": 10289 + }, + { + "epoch": 0.6972017074327529, + "grad_norm": 6.7456464767456055, + "learning_rate": 8.694092682592923e-05, + "loss": 0.8051, + "step": 10290 + }, + { + "epoch": 0.697269462700725, + "grad_norm": 8.029818534851074, + "learning_rate": 8.693955780683141e-05, + "loss": 0.7865, + "step": 10291 + }, + { + "epoch": 0.6973372179686971, + "grad_norm": 8.77468204498291, + "learning_rate": 8.693818878773359e-05, + "loss": 0.8901, + "step": 10292 + }, + { + "epoch": 0.6974049732366692, + "grad_norm": 8.635099411010742, + "learning_rate": 8.693681976863578e-05, + "loss": 0.7982, + "step": 10293 + }, + { + "epoch": 0.6974727285046413, + "grad_norm": 6.938762187957764, + "learning_rate": 8.693545074953796e-05, + "loss": 0.7571, + "step": 10294 + }, + { + "epoch": 0.6975404837726134, + "grad_norm": 6.177728652954102, + "learning_rate": 8.693408173044014e-05, + "loss": 0.8976, + "step": 10295 + }, + { + "epoch": 0.6976082390405854, + "grad_norm": 7.001784324645996, + "learning_rate": 8.693271271134232e-05, + "loss": 0.8799, + "step": 10296 + }, + { + "epoch": 0.6976759943085575, + "grad_norm": 5.89376163482666, + "learning_rate": 8.693134369224452e-05, + "loss": 0.6843, + "step": 10297 + }, + { + "epoch": 0.6977437495765296, + "grad_norm": 6.412653923034668, + "learning_rate": 8.69299746731467e-05, + "loss": 0.8198, + "step": 10298 + }, + { + "epoch": 0.6978115048445016, + "grad_norm": 6.647368907928467, + "learning_rate": 8.692860565404888e-05, + "loss": 1.0389, + "step": 10299 + }, + { + "epoch": 0.6978792601124737, + "grad_norm": 7.058552265167236, + "learning_rate": 8.692723663495106e-05, + "loss": 0.9025, + "step": 10300 + }, + { + "epoch": 0.6979470153804458, + "grad_norm": 6.235774993896484, + "learning_rate": 8.692586761585324e-05, + "loss": 0.7788, + "step": 10301 + }, + { + "epoch": 0.6980147706484179, + "grad_norm": 6.167398929595947, + "learning_rate": 8.692449859675543e-05, + "loss": 0.6807, + "step": 10302 + }, + { + "epoch": 0.69808252591639, + "grad_norm": 5.845956802368164, + "learning_rate": 8.692312957765761e-05, + "loss": 0.8136, + "step": 10303 + }, + { + "epoch": 0.6981502811843621, + "grad_norm": 6.548614025115967, + "learning_rate": 8.692176055855979e-05, + "loss": 0.7352, + "step": 10304 + }, + { + "epoch": 0.6982180364523342, + "grad_norm": 6.432018756866455, + "learning_rate": 8.692039153946197e-05, + "loss": 0.9196, + "step": 10305 + }, + { + "epoch": 0.6982857917203062, + "grad_norm": 7.851593971252441, + "learning_rate": 8.691902252036417e-05, + "loss": 1.1077, + "step": 10306 + }, + { + "epoch": 0.6983535469882783, + "grad_norm": 6.037036895751953, + "learning_rate": 8.691765350126635e-05, + "loss": 0.6674, + "step": 10307 + }, + { + "epoch": 0.6984213022562504, + "grad_norm": 6.833919048309326, + "learning_rate": 8.691628448216853e-05, + "loss": 1.1161, + "step": 10308 + }, + { + "epoch": 0.6984890575242225, + "grad_norm": 7.268690586090088, + "learning_rate": 8.691491546307071e-05, + "loss": 0.8906, + "step": 10309 + }, + { + "epoch": 0.6985568127921946, + "grad_norm": 5.807277679443359, + "learning_rate": 8.691354644397289e-05, + "loss": 0.8622, + "step": 10310 + }, + { + "epoch": 0.6986245680601667, + "grad_norm": 6.888156414031982, + "learning_rate": 8.691217742487508e-05, + "loss": 0.9483, + "step": 10311 + }, + { + "epoch": 0.6986923233281388, + "grad_norm": 5.432703971862793, + "learning_rate": 8.691080840577726e-05, + "loss": 0.6413, + "step": 10312 + }, + { + "epoch": 0.6987600785961109, + "grad_norm": 6.416975021362305, + "learning_rate": 8.690943938667944e-05, + "loss": 0.862, + "step": 10313 + }, + { + "epoch": 0.698827833864083, + "grad_norm": 5.268738746643066, + "learning_rate": 8.690807036758162e-05, + "loss": 0.6074, + "step": 10314 + }, + { + "epoch": 0.698895589132055, + "grad_norm": 7.375731945037842, + "learning_rate": 8.690670134848382e-05, + "loss": 0.8446, + "step": 10315 + }, + { + "epoch": 0.698963344400027, + "grad_norm": 5.654892921447754, + "learning_rate": 8.6905332329386e-05, + "loss": 0.7604, + "step": 10316 + }, + { + "epoch": 0.6990310996679991, + "grad_norm": 6.280389308929443, + "learning_rate": 8.690396331028818e-05, + "loss": 0.7515, + "step": 10317 + }, + { + "epoch": 0.6990988549359712, + "grad_norm": 4.915734767913818, + "learning_rate": 8.690259429119036e-05, + "loss": 0.8976, + "step": 10318 + }, + { + "epoch": 0.6991666102039433, + "grad_norm": 6.705817222595215, + "learning_rate": 8.690122527209255e-05, + "loss": 0.7172, + "step": 10319 + }, + { + "epoch": 0.6992343654719154, + "grad_norm": 5.9727253913879395, + "learning_rate": 8.689985625299473e-05, + "loss": 0.9687, + "step": 10320 + }, + { + "epoch": 0.6993021207398875, + "grad_norm": 8.154400825500488, + "learning_rate": 8.689848723389691e-05, + "loss": 0.8834, + "step": 10321 + }, + { + "epoch": 0.6993698760078596, + "grad_norm": 5.356873512268066, + "learning_rate": 8.68971182147991e-05, + "loss": 0.608, + "step": 10322 + }, + { + "epoch": 0.6994376312758317, + "grad_norm": 7.385077476501465, + "learning_rate": 8.689574919570129e-05, + "loss": 0.7914, + "step": 10323 + }, + { + "epoch": 0.6995053865438038, + "grad_norm": 5.762533664703369, + "learning_rate": 8.689438017660347e-05, + "loss": 0.7171, + "step": 10324 + }, + { + "epoch": 0.6995731418117759, + "grad_norm": 6.16245698928833, + "learning_rate": 8.689301115750566e-05, + "loss": 0.7742, + "step": 10325 + }, + { + "epoch": 0.699640897079748, + "grad_norm": 5.776895523071289, + "learning_rate": 8.689164213840784e-05, + "loss": 0.8008, + "step": 10326 + }, + { + "epoch": 0.6997086523477201, + "grad_norm": 7.285096645355225, + "learning_rate": 8.689027311931002e-05, + "loss": 1.0003, + "step": 10327 + }, + { + "epoch": 0.6997764076156922, + "grad_norm": 6.187610149383545, + "learning_rate": 8.68889041002122e-05, + "loss": 0.9697, + "step": 10328 + }, + { + "epoch": 0.6998441628836642, + "grad_norm": 7.224822521209717, + "learning_rate": 8.68875350811144e-05, + "loss": 0.8965, + "step": 10329 + }, + { + "epoch": 0.6999119181516363, + "grad_norm": 7.907904624938965, + "learning_rate": 8.688616606201658e-05, + "loss": 0.9466, + "step": 10330 + }, + { + "epoch": 0.6999796734196084, + "grad_norm": 5.577702522277832, + "learning_rate": 8.688479704291876e-05, + "loss": 0.7779, + "step": 10331 + }, + { + "epoch": 0.7000474286875804, + "grad_norm": 6.485890865325928, + "learning_rate": 8.688342802382094e-05, + "loss": 0.9368, + "step": 10332 + }, + { + "epoch": 0.7001151839555525, + "grad_norm": 6.532778739929199, + "learning_rate": 8.688205900472312e-05, + "loss": 0.9044, + "step": 10333 + }, + { + "epoch": 0.7001829392235246, + "grad_norm": 9.568724632263184, + "learning_rate": 8.688068998562531e-05, + "loss": 1.1657, + "step": 10334 + }, + { + "epoch": 0.7002506944914967, + "grad_norm": 7.1607255935668945, + "learning_rate": 8.687932096652749e-05, + "loss": 0.8107, + "step": 10335 + }, + { + "epoch": 0.7003184497594688, + "grad_norm": 7.112110614776611, + "learning_rate": 8.687795194742967e-05, + "loss": 0.8092, + "step": 10336 + }, + { + "epoch": 0.7003862050274409, + "grad_norm": 6.201446056365967, + "learning_rate": 8.687658292833185e-05, + "loss": 0.8682, + "step": 10337 + }, + { + "epoch": 0.700453960295413, + "grad_norm": 5.587967395782471, + "learning_rate": 8.687521390923405e-05, + "loss": 0.7769, + "step": 10338 + }, + { + "epoch": 0.700521715563385, + "grad_norm": 4.441295623779297, + "learning_rate": 8.687384489013623e-05, + "loss": 0.5984, + "step": 10339 + }, + { + "epoch": 0.7005894708313571, + "grad_norm": 7.061400413513184, + "learning_rate": 8.687247587103841e-05, + "loss": 0.7871, + "step": 10340 + }, + { + "epoch": 0.7006572260993292, + "grad_norm": 6.004641532897949, + "learning_rate": 8.687110685194059e-05, + "loss": 0.8535, + "step": 10341 + }, + { + "epoch": 0.7007249813673013, + "grad_norm": 6.329019546508789, + "learning_rate": 8.686973783284277e-05, + "loss": 0.7604, + "step": 10342 + }, + { + "epoch": 0.7007927366352734, + "grad_norm": 5.995157718658447, + "learning_rate": 8.686836881374496e-05, + "loss": 0.8674, + "step": 10343 + }, + { + "epoch": 0.7008604919032455, + "grad_norm": 6.5860915184021, + "learning_rate": 8.686699979464714e-05, + "loss": 0.6908, + "step": 10344 + }, + { + "epoch": 0.7009282471712176, + "grad_norm": 7.01938009262085, + "learning_rate": 8.686563077554932e-05, + "loss": 0.7821, + "step": 10345 + }, + { + "epoch": 0.7009960024391897, + "grad_norm": 4.958036422729492, + "learning_rate": 8.68642617564515e-05, + "loss": 0.7112, + "step": 10346 + }, + { + "epoch": 0.7010637577071618, + "grad_norm": 5.658689022064209, + "learning_rate": 8.686289273735368e-05, + "loss": 0.768, + "step": 10347 + }, + { + "epoch": 0.7011315129751338, + "grad_norm": 7.060564994812012, + "learning_rate": 8.686152371825588e-05, + "loss": 1.1327, + "step": 10348 + }, + { + "epoch": 0.7011992682431059, + "grad_norm": 6.2527265548706055, + "learning_rate": 8.686015469915806e-05, + "loss": 0.8952, + "step": 10349 + }, + { + "epoch": 0.701267023511078, + "grad_norm": 7.9083452224731445, + "learning_rate": 8.685878568006024e-05, + "loss": 0.9505, + "step": 10350 + }, + { + "epoch": 0.70133477877905, + "grad_norm": 7.475040435791016, + "learning_rate": 8.685741666096242e-05, + "loss": 0.8404, + "step": 10351 + }, + { + "epoch": 0.7014025340470221, + "grad_norm": 7.27475643157959, + "learning_rate": 8.685604764186461e-05, + "loss": 0.9485, + "step": 10352 + }, + { + "epoch": 0.7014702893149942, + "grad_norm": 5.844339847564697, + "learning_rate": 8.685467862276679e-05, + "loss": 0.9179, + "step": 10353 + }, + { + "epoch": 0.7015380445829663, + "grad_norm": 6.823174953460693, + "learning_rate": 8.685330960366897e-05, + "loss": 0.7872, + "step": 10354 + }, + { + "epoch": 0.7016057998509384, + "grad_norm": 7.774914264678955, + "learning_rate": 8.685194058457115e-05, + "loss": 0.9215, + "step": 10355 + }, + { + "epoch": 0.7016735551189105, + "grad_norm": 6.16814661026001, + "learning_rate": 8.685057156547333e-05, + "loss": 0.9785, + "step": 10356 + }, + { + "epoch": 0.7017413103868826, + "grad_norm": 5.761654853820801, + "learning_rate": 8.684920254637553e-05, + "loss": 0.8459, + "step": 10357 + }, + { + "epoch": 0.7018090656548547, + "grad_norm": 5.926375865936279, + "learning_rate": 8.684783352727771e-05, + "loss": 0.7908, + "step": 10358 + }, + { + "epoch": 0.7018768209228268, + "grad_norm": 7.2848639488220215, + "learning_rate": 8.684646450817989e-05, + "loss": 0.8424, + "step": 10359 + }, + { + "epoch": 0.7019445761907989, + "grad_norm": 6.377554416656494, + "learning_rate": 8.684509548908207e-05, + "loss": 0.8161, + "step": 10360 + }, + { + "epoch": 0.702012331458771, + "grad_norm": 6.2031426429748535, + "learning_rate": 8.684372646998426e-05, + "loss": 0.9576, + "step": 10361 + }, + { + "epoch": 0.702080086726743, + "grad_norm": 7.374354362487793, + "learning_rate": 8.684235745088644e-05, + "loss": 0.8022, + "step": 10362 + }, + { + "epoch": 0.7021478419947151, + "grad_norm": 5.276646614074707, + "learning_rate": 8.684098843178862e-05, + "loss": 0.5603, + "step": 10363 + }, + { + "epoch": 0.7022155972626871, + "grad_norm": 5.207109451293945, + "learning_rate": 8.68396194126908e-05, + "loss": 0.5605, + "step": 10364 + }, + { + "epoch": 0.7022833525306592, + "grad_norm": 6.302850723266602, + "learning_rate": 8.6838250393593e-05, + "loss": 0.861, + "step": 10365 + }, + { + "epoch": 0.7023511077986313, + "grad_norm": 5.8094072341918945, + "learning_rate": 8.683688137449518e-05, + "loss": 0.8374, + "step": 10366 + }, + { + "epoch": 0.7024188630666034, + "grad_norm": 6.657436370849609, + "learning_rate": 8.683551235539736e-05, + "loss": 0.9263, + "step": 10367 + }, + { + "epoch": 0.7024866183345755, + "grad_norm": 5.042036533355713, + "learning_rate": 8.683414333629955e-05, + "loss": 0.6188, + "step": 10368 + }, + { + "epoch": 0.7025543736025476, + "grad_norm": 5.913759231567383, + "learning_rate": 8.683277431720173e-05, + "loss": 0.6699, + "step": 10369 + }, + { + "epoch": 0.7026221288705197, + "grad_norm": 6.477380752563477, + "learning_rate": 8.683140529810391e-05, + "loss": 0.8353, + "step": 10370 + }, + { + "epoch": 0.7026898841384918, + "grad_norm": 5.284722805023193, + "learning_rate": 8.68300362790061e-05, + "loss": 0.8633, + "step": 10371 + }, + { + "epoch": 0.7027576394064639, + "grad_norm": 5.480528354644775, + "learning_rate": 8.682866725990829e-05, + "loss": 0.7068, + "step": 10372 + }, + { + "epoch": 0.7028253946744359, + "grad_norm": 5.857044696807861, + "learning_rate": 8.682729824081047e-05, + "loss": 0.8696, + "step": 10373 + }, + { + "epoch": 0.702893149942408, + "grad_norm": 6.4731764793396, + "learning_rate": 8.682592922171265e-05, + "loss": 0.7464, + "step": 10374 + }, + { + "epoch": 0.7029609052103801, + "grad_norm": 7.0602827072143555, + "learning_rate": 8.682456020261484e-05, + "loss": 0.6128, + "step": 10375 + }, + { + "epoch": 0.7030286604783522, + "grad_norm": 5.9556474685668945, + "learning_rate": 8.682319118351702e-05, + "loss": 0.7004, + "step": 10376 + }, + { + "epoch": 0.7030964157463243, + "grad_norm": 7.673183917999268, + "learning_rate": 8.68218221644192e-05, + "loss": 0.7612, + "step": 10377 + }, + { + "epoch": 0.7031641710142964, + "grad_norm": 7.043504238128662, + "learning_rate": 8.682045314532138e-05, + "loss": 1.1159, + "step": 10378 + }, + { + "epoch": 0.7032319262822685, + "grad_norm": 6.224184513092041, + "learning_rate": 8.681908412622356e-05, + "loss": 0.7247, + "step": 10379 + }, + { + "epoch": 0.7032996815502406, + "grad_norm": 7.104019641876221, + "learning_rate": 8.681771510712576e-05, + "loss": 0.8121, + "step": 10380 + }, + { + "epoch": 0.7033674368182126, + "grad_norm": 6.4362263679504395, + "learning_rate": 8.681634608802794e-05, + "loss": 0.8133, + "step": 10381 + }, + { + "epoch": 0.7034351920861847, + "grad_norm": 5.4112067222595215, + "learning_rate": 8.681497706893012e-05, + "loss": 0.8037, + "step": 10382 + }, + { + "epoch": 0.7035029473541567, + "grad_norm": 8.056005477905273, + "learning_rate": 8.68136080498323e-05, + "loss": 1.0531, + "step": 10383 + }, + { + "epoch": 0.7035707026221288, + "grad_norm": 6.620260715484619, + "learning_rate": 8.681223903073449e-05, + "loss": 0.8135, + "step": 10384 + }, + { + "epoch": 0.7036384578901009, + "grad_norm": 5.953632354736328, + "learning_rate": 8.681087001163667e-05, + "loss": 0.9362, + "step": 10385 + }, + { + "epoch": 0.703706213158073, + "grad_norm": 4.4729719161987305, + "learning_rate": 8.680950099253885e-05, + "loss": 0.8507, + "step": 10386 + }, + { + "epoch": 0.7037739684260451, + "grad_norm": 6.998383522033691, + "learning_rate": 8.680813197344103e-05, + "loss": 0.8018, + "step": 10387 + }, + { + "epoch": 0.7038417236940172, + "grad_norm": 5.445269584655762, + "learning_rate": 8.680676295434321e-05, + "loss": 0.7241, + "step": 10388 + }, + { + "epoch": 0.7039094789619893, + "grad_norm": 7.320235729217529, + "learning_rate": 8.68053939352454e-05, + "loss": 1.0455, + "step": 10389 + }, + { + "epoch": 0.7039772342299614, + "grad_norm": 7.40581750869751, + "learning_rate": 8.680402491614759e-05, + "loss": 0.8128, + "step": 10390 + }, + { + "epoch": 0.7040449894979335, + "grad_norm": 6.813145637512207, + "learning_rate": 8.680265589704977e-05, + "loss": 0.9455, + "step": 10391 + }, + { + "epoch": 0.7041127447659056, + "grad_norm": 5.903909683227539, + "learning_rate": 8.680128687795195e-05, + "loss": 0.6496, + "step": 10392 + }, + { + "epoch": 0.7041805000338777, + "grad_norm": 4.9222846031188965, + "learning_rate": 8.679991785885414e-05, + "loss": 0.6605, + "step": 10393 + }, + { + "epoch": 0.7042482553018498, + "grad_norm": 6.948107719421387, + "learning_rate": 8.679854883975632e-05, + "loss": 0.9015, + "step": 10394 + }, + { + "epoch": 0.7043160105698218, + "grad_norm": 6.005917072296143, + "learning_rate": 8.67971798206585e-05, + "loss": 0.9129, + "step": 10395 + }, + { + "epoch": 0.7043837658377939, + "grad_norm": 5.235043048858643, + "learning_rate": 8.679581080156068e-05, + "loss": 0.9046, + "step": 10396 + }, + { + "epoch": 0.7044515211057659, + "grad_norm": 6.271544456481934, + "learning_rate": 8.679444178246286e-05, + "loss": 1.0159, + "step": 10397 + }, + { + "epoch": 0.704519276373738, + "grad_norm": 5.432122707366943, + "learning_rate": 8.679307276336506e-05, + "loss": 0.8806, + "step": 10398 + }, + { + "epoch": 0.7045870316417101, + "grad_norm": 5.534310817718506, + "learning_rate": 8.679170374426724e-05, + "loss": 0.8565, + "step": 10399 + }, + { + "epoch": 0.7046547869096822, + "grad_norm": 6.202160835266113, + "learning_rate": 8.679033472516942e-05, + "loss": 1.1205, + "step": 10400 + }, + { + "epoch": 0.7047225421776543, + "grad_norm": 7.187075614929199, + "learning_rate": 8.67889657060716e-05, + "loss": 0.8384, + "step": 10401 + }, + { + "epoch": 0.7047902974456264, + "grad_norm": 4.62196159362793, + "learning_rate": 8.678759668697378e-05, + "loss": 0.6643, + "step": 10402 + }, + { + "epoch": 0.7048580527135985, + "grad_norm": 10.150405883789062, + "learning_rate": 8.678622766787597e-05, + "loss": 0.6745, + "step": 10403 + }, + { + "epoch": 0.7049258079815706, + "grad_norm": 6.843104362487793, + "learning_rate": 8.678485864877815e-05, + "loss": 0.7891, + "step": 10404 + }, + { + "epoch": 0.7049935632495427, + "grad_norm": 6.199191570281982, + "learning_rate": 8.678348962968033e-05, + "loss": 0.7102, + "step": 10405 + }, + { + "epoch": 0.7050613185175147, + "grad_norm": 5.368592739105225, + "learning_rate": 8.678212061058251e-05, + "loss": 0.8577, + "step": 10406 + }, + { + "epoch": 0.7051290737854868, + "grad_norm": 4.696331977844238, + "learning_rate": 8.67807515914847e-05, + "loss": 0.6958, + "step": 10407 + }, + { + "epoch": 0.7051968290534589, + "grad_norm": 6.961827754974365, + "learning_rate": 8.677938257238689e-05, + "loss": 1.0023, + "step": 10408 + }, + { + "epoch": 0.705264584321431, + "grad_norm": 6.114429473876953, + "learning_rate": 8.677801355328907e-05, + "loss": 0.6351, + "step": 10409 + }, + { + "epoch": 0.7053323395894031, + "grad_norm": 7.005643844604492, + "learning_rate": 8.677664453419125e-05, + "loss": 1.039, + "step": 10410 + }, + { + "epoch": 0.7054000948573752, + "grad_norm": 5.262114524841309, + "learning_rate": 8.677527551509344e-05, + "loss": 0.7621, + "step": 10411 + }, + { + "epoch": 0.7054678501253473, + "grad_norm": 6.364197731018066, + "learning_rate": 8.677390649599562e-05, + "loss": 0.8989, + "step": 10412 + }, + { + "epoch": 0.7055356053933193, + "grad_norm": 5.497344970703125, + "learning_rate": 8.67725374768978e-05, + "loss": 0.6066, + "step": 10413 + }, + { + "epoch": 0.7056033606612914, + "grad_norm": 6.382382869720459, + "learning_rate": 8.67711684578e-05, + "loss": 0.7266, + "step": 10414 + }, + { + "epoch": 0.7056711159292635, + "grad_norm": 7.423126220703125, + "learning_rate": 8.676979943870218e-05, + "loss": 0.7454, + "step": 10415 + }, + { + "epoch": 0.7057388711972356, + "grad_norm": 7.46668004989624, + "learning_rate": 8.676843041960436e-05, + "loss": 0.845, + "step": 10416 + }, + { + "epoch": 0.7058066264652076, + "grad_norm": 5.152261734008789, + "learning_rate": 8.676706140050655e-05, + "loss": 0.8531, + "step": 10417 + }, + { + "epoch": 0.7058743817331797, + "grad_norm": 8.402978897094727, + "learning_rate": 8.676569238140873e-05, + "loss": 1.0256, + "step": 10418 + }, + { + "epoch": 0.7059421370011518, + "grad_norm": 5.3230299949646, + "learning_rate": 8.676432336231091e-05, + "loss": 0.7647, + "step": 10419 + }, + { + "epoch": 0.7060098922691239, + "grad_norm": 7.257562160491943, + "learning_rate": 8.676295434321309e-05, + "loss": 0.8413, + "step": 10420 + }, + { + "epoch": 0.706077647537096, + "grad_norm": 5.904243469238281, + "learning_rate": 8.676158532411529e-05, + "loss": 0.8503, + "step": 10421 + }, + { + "epoch": 0.7061454028050681, + "grad_norm": 6.7053141593933105, + "learning_rate": 8.676021630501747e-05, + "loss": 0.8836, + "step": 10422 + }, + { + "epoch": 0.7062131580730402, + "grad_norm": 7.1715407371521, + "learning_rate": 8.675884728591965e-05, + "loss": 0.8266, + "step": 10423 + }, + { + "epoch": 0.7062809133410123, + "grad_norm": 6.313091278076172, + "learning_rate": 8.675747826682183e-05, + "loss": 0.6628, + "step": 10424 + }, + { + "epoch": 0.7063486686089844, + "grad_norm": 5.576920986175537, + "learning_rate": 8.675610924772401e-05, + "loss": 0.7227, + "step": 10425 + }, + { + "epoch": 0.7064164238769565, + "grad_norm": 6.882504463195801, + "learning_rate": 8.67547402286262e-05, + "loss": 0.8294, + "step": 10426 + }, + { + "epoch": 0.7064841791449286, + "grad_norm": 8.857022285461426, + "learning_rate": 8.675337120952838e-05, + "loss": 0.5265, + "step": 10427 + }, + { + "epoch": 0.7065519344129007, + "grad_norm": 6.785702228546143, + "learning_rate": 8.675200219043056e-05, + "loss": 0.8107, + "step": 10428 + }, + { + "epoch": 0.7066196896808727, + "grad_norm": 7.2406415939331055, + "learning_rate": 8.675063317133274e-05, + "loss": 0.8089, + "step": 10429 + }, + { + "epoch": 0.7066874449488447, + "grad_norm": 5.409148216247559, + "learning_rate": 8.674926415223494e-05, + "loss": 0.7191, + "step": 10430 + }, + { + "epoch": 0.7067552002168168, + "grad_norm": 6.049896717071533, + "learning_rate": 8.674789513313712e-05, + "loss": 0.8811, + "step": 10431 + }, + { + "epoch": 0.7068229554847889, + "grad_norm": 8.478447914123535, + "learning_rate": 8.67465261140393e-05, + "loss": 0.8117, + "step": 10432 + }, + { + "epoch": 0.706890710752761, + "grad_norm": 6.473963260650635, + "learning_rate": 8.674515709494148e-05, + "loss": 1.0078, + "step": 10433 + }, + { + "epoch": 0.7069584660207331, + "grad_norm": 5.57169771194458, + "learning_rate": 8.674378807584366e-05, + "loss": 0.6526, + "step": 10434 + }, + { + "epoch": 0.7070262212887052, + "grad_norm": 5.3196492195129395, + "learning_rate": 8.674241905674585e-05, + "loss": 0.6312, + "step": 10435 + }, + { + "epoch": 0.7070939765566773, + "grad_norm": 5.9507293701171875, + "learning_rate": 8.674105003764803e-05, + "loss": 0.7502, + "step": 10436 + }, + { + "epoch": 0.7071617318246494, + "grad_norm": 5.272159099578857, + "learning_rate": 8.673968101855021e-05, + "loss": 0.7445, + "step": 10437 + }, + { + "epoch": 0.7072294870926215, + "grad_norm": 8.225152969360352, + "learning_rate": 8.673831199945239e-05, + "loss": 0.9625, + "step": 10438 + }, + { + "epoch": 0.7072972423605935, + "grad_norm": 5.791821479797363, + "learning_rate": 8.673694298035459e-05, + "loss": 0.7997, + "step": 10439 + }, + { + "epoch": 0.7073649976285656, + "grad_norm": 6.391631126403809, + "learning_rate": 8.673557396125677e-05, + "loss": 0.8449, + "step": 10440 + }, + { + "epoch": 0.7074327528965377, + "grad_norm": 6.157900333404541, + "learning_rate": 8.673420494215895e-05, + "loss": 0.8001, + "step": 10441 + }, + { + "epoch": 0.7075005081645098, + "grad_norm": 5.64890193939209, + "learning_rate": 8.673283592306113e-05, + "loss": 0.6663, + "step": 10442 + }, + { + "epoch": 0.7075682634324819, + "grad_norm": 7.436509132385254, + "learning_rate": 8.673146690396331e-05, + "loss": 0.8742, + "step": 10443 + }, + { + "epoch": 0.707636018700454, + "grad_norm": 4.78845739364624, + "learning_rate": 8.67300978848655e-05, + "loss": 0.5905, + "step": 10444 + }, + { + "epoch": 0.7077037739684261, + "grad_norm": 7.130674362182617, + "learning_rate": 8.672872886576768e-05, + "loss": 1.2396, + "step": 10445 + }, + { + "epoch": 0.7077715292363981, + "grad_norm": 7.3212761878967285, + "learning_rate": 8.672735984666986e-05, + "loss": 0.8896, + "step": 10446 + }, + { + "epoch": 0.7078392845043702, + "grad_norm": 7.6907548904418945, + "learning_rate": 8.672599082757204e-05, + "loss": 1.1194, + "step": 10447 + }, + { + "epoch": 0.7079070397723423, + "grad_norm": 6.078713417053223, + "learning_rate": 8.672462180847424e-05, + "loss": 0.8578, + "step": 10448 + }, + { + "epoch": 0.7079747950403144, + "grad_norm": 6.047597408294678, + "learning_rate": 8.672325278937642e-05, + "loss": 0.6988, + "step": 10449 + }, + { + "epoch": 0.7080425503082864, + "grad_norm": 6.882000923156738, + "learning_rate": 8.67218837702786e-05, + "loss": 1.0003, + "step": 10450 + }, + { + "epoch": 0.7081103055762585, + "grad_norm": 7.0581560134887695, + "learning_rate": 8.672051475118078e-05, + "loss": 0.8585, + "step": 10451 + }, + { + "epoch": 0.7081780608442306, + "grad_norm": 5.636070728302002, + "learning_rate": 8.671914573208296e-05, + "loss": 0.9355, + "step": 10452 + }, + { + "epoch": 0.7082458161122027, + "grad_norm": 7.167375564575195, + "learning_rate": 8.671777671298515e-05, + "loss": 0.8098, + "step": 10453 + }, + { + "epoch": 0.7083135713801748, + "grad_norm": 6.989759922027588, + "learning_rate": 8.671640769388733e-05, + "loss": 1.1052, + "step": 10454 + }, + { + "epoch": 0.7083813266481469, + "grad_norm": 5.774247646331787, + "learning_rate": 8.671503867478951e-05, + "loss": 0.8362, + "step": 10455 + }, + { + "epoch": 0.708449081916119, + "grad_norm": 5.6326518058776855, + "learning_rate": 8.671366965569169e-05, + "loss": 0.644, + "step": 10456 + }, + { + "epoch": 0.7085168371840911, + "grad_norm": 6.38750696182251, + "learning_rate": 8.671230063659389e-05, + "loss": 0.7756, + "step": 10457 + }, + { + "epoch": 0.7085845924520632, + "grad_norm": 6.129147529602051, + "learning_rate": 8.671093161749607e-05, + "loss": 0.8983, + "step": 10458 + }, + { + "epoch": 0.7086523477200353, + "grad_norm": 7.424493789672852, + "learning_rate": 8.670956259839825e-05, + "loss": 0.9429, + "step": 10459 + }, + { + "epoch": 0.7087201029880074, + "grad_norm": 6.838191509246826, + "learning_rate": 8.670819357930044e-05, + "loss": 0.7546, + "step": 10460 + }, + { + "epoch": 0.7087878582559795, + "grad_norm": 5.380428791046143, + "learning_rate": 8.670682456020262e-05, + "loss": 0.6884, + "step": 10461 + }, + { + "epoch": 0.7088556135239514, + "grad_norm": 5.4953203201293945, + "learning_rate": 8.67054555411048e-05, + "loss": 1.0417, + "step": 10462 + }, + { + "epoch": 0.7089233687919235, + "grad_norm": 6.5481133460998535, + "learning_rate": 8.6704086522007e-05, + "loss": 0.7382, + "step": 10463 + }, + { + "epoch": 0.7089911240598956, + "grad_norm": 7.717205047607422, + "learning_rate": 8.670271750290918e-05, + "loss": 0.9389, + "step": 10464 + }, + { + "epoch": 0.7090588793278677, + "grad_norm": 6.287739276885986, + "learning_rate": 8.670134848381136e-05, + "loss": 0.9393, + "step": 10465 + }, + { + "epoch": 0.7091266345958398, + "grad_norm": 5.565641403198242, + "learning_rate": 8.669997946471354e-05, + "loss": 0.758, + "step": 10466 + }, + { + "epoch": 0.7091943898638119, + "grad_norm": 5.262805938720703, + "learning_rate": 8.669861044561573e-05, + "loss": 0.792, + "step": 10467 + }, + { + "epoch": 0.709262145131784, + "grad_norm": 5.002110004425049, + "learning_rate": 8.669724142651791e-05, + "loss": 0.7365, + "step": 10468 + }, + { + "epoch": 0.7093299003997561, + "grad_norm": 6.84413480758667, + "learning_rate": 8.669587240742009e-05, + "loss": 1.0521, + "step": 10469 + }, + { + "epoch": 0.7093976556677282, + "grad_norm": 6.899505138397217, + "learning_rate": 8.669450338832227e-05, + "loss": 0.795, + "step": 10470 + }, + { + "epoch": 0.7094654109357003, + "grad_norm": 5.376099109649658, + "learning_rate": 8.669313436922447e-05, + "loss": 0.51, + "step": 10471 + }, + { + "epoch": 0.7095331662036723, + "grad_norm": 6.934320449829102, + "learning_rate": 8.669176535012665e-05, + "loss": 0.9405, + "step": 10472 + }, + { + "epoch": 0.7096009214716444, + "grad_norm": 5.896731376647949, + "learning_rate": 8.669039633102883e-05, + "loss": 0.6869, + "step": 10473 + }, + { + "epoch": 0.7096686767396165, + "grad_norm": 5.4463887214660645, + "learning_rate": 8.6689027311931e-05, + "loss": 0.8513, + "step": 10474 + }, + { + "epoch": 0.7097364320075886, + "grad_norm": 6.024421215057373, + "learning_rate": 8.668765829283319e-05, + "loss": 0.7993, + "step": 10475 + }, + { + "epoch": 0.7098041872755607, + "grad_norm": 7.861370086669922, + "learning_rate": 8.668628927373538e-05, + "loss": 0.7246, + "step": 10476 + }, + { + "epoch": 0.7098719425435328, + "grad_norm": 5.0704779624938965, + "learning_rate": 8.668492025463756e-05, + "loss": 0.6759, + "step": 10477 + }, + { + "epoch": 0.7099396978115049, + "grad_norm": 6.787322998046875, + "learning_rate": 8.668355123553974e-05, + "loss": 0.8075, + "step": 10478 + }, + { + "epoch": 0.7100074530794769, + "grad_norm": 5.564799785614014, + "learning_rate": 8.668218221644192e-05, + "loss": 0.7596, + "step": 10479 + }, + { + "epoch": 0.710075208347449, + "grad_norm": 6.072136402130127, + "learning_rate": 8.66808131973441e-05, + "loss": 0.6902, + "step": 10480 + }, + { + "epoch": 0.7101429636154211, + "grad_norm": 6.825998783111572, + "learning_rate": 8.66794441782463e-05, + "loss": 0.7987, + "step": 10481 + }, + { + "epoch": 0.7102107188833932, + "grad_norm": 6.803398609161377, + "learning_rate": 8.667807515914848e-05, + "loss": 0.8946, + "step": 10482 + }, + { + "epoch": 0.7102784741513652, + "grad_norm": 5.5623250007629395, + "learning_rate": 8.667670614005066e-05, + "loss": 0.9588, + "step": 10483 + }, + { + "epoch": 0.7103462294193373, + "grad_norm": 6.420827865600586, + "learning_rate": 8.667533712095284e-05, + "loss": 0.9871, + "step": 10484 + }, + { + "epoch": 0.7104139846873094, + "grad_norm": 5.774916172027588, + "learning_rate": 8.667396810185503e-05, + "loss": 0.7826, + "step": 10485 + }, + { + "epoch": 0.7104817399552815, + "grad_norm": 6.701958656311035, + "learning_rate": 8.667259908275721e-05, + "loss": 0.9528, + "step": 10486 + }, + { + "epoch": 0.7105494952232536, + "grad_norm": 6.663124084472656, + "learning_rate": 8.667123006365939e-05, + "loss": 0.7279, + "step": 10487 + }, + { + "epoch": 0.7106172504912257, + "grad_norm": 6.165869235992432, + "learning_rate": 8.666986104456157e-05, + "loss": 0.7835, + "step": 10488 + }, + { + "epoch": 0.7106850057591978, + "grad_norm": 5.795663356781006, + "learning_rate": 8.666849202546375e-05, + "loss": 0.6844, + "step": 10489 + }, + { + "epoch": 0.7107527610271699, + "grad_norm": 5.601436138153076, + "learning_rate": 8.666712300636595e-05, + "loss": 0.7844, + "step": 10490 + }, + { + "epoch": 0.710820516295142, + "grad_norm": 6.733765125274658, + "learning_rate": 8.666575398726813e-05, + "loss": 0.7687, + "step": 10491 + }, + { + "epoch": 0.7108882715631141, + "grad_norm": 6.032510757446289, + "learning_rate": 8.66643849681703e-05, + "loss": 0.7252, + "step": 10492 + }, + { + "epoch": 0.7109560268310862, + "grad_norm": 4.691253662109375, + "learning_rate": 8.666301594907249e-05, + "loss": 0.7354, + "step": 10493 + }, + { + "epoch": 0.7110237820990583, + "grad_norm": 6.740907669067383, + "learning_rate": 8.666164692997468e-05, + "loss": 0.854, + "step": 10494 + }, + { + "epoch": 0.7110915373670302, + "grad_norm": 6.258440971374512, + "learning_rate": 8.666027791087686e-05, + "loss": 0.7787, + "step": 10495 + }, + { + "epoch": 0.7111592926350023, + "grad_norm": 5.37103271484375, + "learning_rate": 8.665890889177904e-05, + "loss": 0.8159, + "step": 10496 + }, + { + "epoch": 0.7112270479029744, + "grad_norm": 5.726749420166016, + "learning_rate": 8.665753987268122e-05, + "loss": 1.0529, + "step": 10497 + }, + { + "epoch": 0.7112948031709465, + "grad_norm": 6.467258930206299, + "learning_rate": 8.66561708535834e-05, + "loss": 0.7445, + "step": 10498 + }, + { + "epoch": 0.7113625584389186, + "grad_norm": 5.931604385375977, + "learning_rate": 8.66548018344856e-05, + "loss": 0.6146, + "step": 10499 + }, + { + "epoch": 0.7114303137068907, + "grad_norm": 7.676519393920898, + "learning_rate": 8.665343281538778e-05, + "loss": 0.9234, + "step": 10500 + }, + { + "epoch": 0.7114980689748628, + "grad_norm": 6.444290637969971, + "learning_rate": 8.665206379628996e-05, + "loss": 0.8529, + "step": 10501 + }, + { + "epoch": 0.7115658242428349, + "grad_norm": 6.420405864715576, + "learning_rate": 8.665069477719214e-05, + "loss": 0.8439, + "step": 10502 + }, + { + "epoch": 0.711633579510807, + "grad_norm": 5.584212779998779, + "learning_rate": 8.664932575809432e-05, + "loss": 0.6583, + "step": 10503 + }, + { + "epoch": 0.7117013347787791, + "grad_norm": 6.1522746086120605, + "learning_rate": 8.664795673899651e-05, + "loss": 0.7773, + "step": 10504 + }, + { + "epoch": 0.7117690900467512, + "grad_norm": 6.573955535888672, + "learning_rate": 8.664658771989869e-05, + "loss": 0.7553, + "step": 10505 + }, + { + "epoch": 0.7118368453147232, + "grad_norm": 7.660068988800049, + "learning_rate": 8.664521870080087e-05, + "loss": 0.7807, + "step": 10506 + }, + { + "epoch": 0.7119046005826953, + "grad_norm": 6.398780822753906, + "learning_rate": 8.664384968170307e-05, + "loss": 0.7814, + "step": 10507 + }, + { + "epoch": 0.7119723558506674, + "grad_norm": 6.873563766479492, + "learning_rate": 8.664248066260525e-05, + "loss": 0.8067, + "step": 10508 + }, + { + "epoch": 0.7120401111186395, + "grad_norm": 6.932216644287109, + "learning_rate": 8.664111164350743e-05, + "loss": 0.9248, + "step": 10509 + }, + { + "epoch": 0.7121078663866116, + "grad_norm": 6.539022445678711, + "learning_rate": 8.663974262440962e-05, + "loss": 0.8059, + "step": 10510 + }, + { + "epoch": 0.7121756216545836, + "grad_norm": 6.882415771484375, + "learning_rate": 8.66383736053118e-05, + "loss": 0.853, + "step": 10511 + }, + { + "epoch": 0.7122433769225557, + "grad_norm": 7.576079368591309, + "learning_rate": 8.663700458621398e-05, + "loss": 1.1152, + "step": 10512 + }, + { + "epoch": 0.7123111321905278, + "grad_norm": 5.934848785400391, + "learning_rate": 8.663563556711617e-05, + "loss": 0.7804, + "step": 10513 + }, + { + "epoch": 0.7123788874584999, + "grad_norm": 5.297085762023926, + "learning_rate": 8.663426654801836e-05, + "loss": 0.6543, + "step": 10514 + }, + { + "epoch": 0.712446642726472, + "grad_norm": 5.618426322937012, + "learning_rate": 8.663289752892054e-05, + "loss": 0.5286, + "step": 10515 + }, + { + "epoch": 0.712514397994444, + "grad_norm": 5.978342533111572, + "learning_rate": 8.663152850982272e-05, + "loss": 0.6195, + "step": 10516 + }, + { + "epoch": 0.7125821532624161, + "grad_norm": 7.146844863891602, + "learning_rate": 8.663015949072491e-05, + "loss": 0.9239, + "step": 10517 + }, + { + "epoch": 0.7126499085303882, + "grad_norm": 7.71320915222168, + "learning_rate": 8.662879047162709e-05, + "loss": 0.8047, + "step": 10518 + }, + { + "epoch": 0.7127176637983603, + "grad_norm": 5.022526741027832, + "learning_rate": 8.662742145252927e-05, + "loss": 0.5567, + "step": 10519 + }, + { + "epoch": 0.7127854190663324, + "grad_norm": 6.884553909301758, + "learning_rate": 8.662605243343145e-05, + "loss": 0.9116, + "step": 10520 + }, + { + "epoch": 0.7128531743343045, + "grad_norm": 9.86026668548584, + "learning_rate": 8.662468341433363e-05, + "loss": 1.1616, + "step": 10521 + }, + { + "epoch": 0.7129209296022766, + "grad_norm": 6.307768821716309, + "learning_rate": 8.662331439523583e-05, + "loss": 0.8304, + "step": 10522 + }, + { + "epoch": 0.7129886848702487, + "grad_norm": 8.609663009643555, + "learning_rate": 8.6621945376138e-05, + "loss": 0.6635, + "step": 10523 + }, + { + "epoch": 0.7130564401382208, + "grad_norm": 5.718436241149902, + "learning_rate": 8.662057635704019e-05, + "loss": 0.889, + "step": 10524 + }, + { + "epoch": 0.7131241954061929, + "grad_norm": 6.126955986022949, + "learning_rate": 8.661920733794237e-05, + "loss": 0.6973, + "step": 10525 + }, + { + "epoch": 0.713191950674165, + "grad_norm": 6.689998149871826, + "learning_rate": 8.661783831884456e-05, + "loss": 0.6399, + "step": 10526 + }, + { + "epoch": 0.713259705942137, + "grad_norm": 6.500828742980957, + "learning_rate": 8.661646929974674e-05, + "loss": 1.0992, + "step": 10527 + }, + { + "epoch": 0.713327461210109, + "grad_norm": 7.04468297958374, + "learning_rate": 8.661510028064892e-05, + "loss": 1.0635, + "step": 10528 + }, + { + "epoch": 0.7133952164780811, + "grad_norm": 6.968896865844727, + "learning_rate": 8.66137312615511e-05, + "loss": 0.8095, + "step": 10529 + }, + { + "epoch": 0.7134629717460532, + "grad_norm": 7.557732105255127, + "learning_rate": 8.661236224245328e-05, + "loss": 0.6779, + "step": 10530 + }, + { + "epoch": 0.7135307270140253, + "grad_norm": 4.746248245239258, + "learning_rate": 8.661099322335548e-05, + "loss": 0.7275, + "step": 10531 + }, + { + "epoch": 0.7135984822819974, + "grad_norm": 7.140705108642578, + "learning_rate": 8.660962420425766e-05, + "loss": 0.9274, + "step": 10532 + }, + { + "epoch": 0.7136662375499695, + "grad_norm": 6.661166191101074, + "learning_rate": 8.660825518515984e-05, + "loss": 0.8924, + "step": 10533 + }, + { + "epoch": 0.7137339928179416, + "grad_norm": 6.4814653396606445, + "learning_rate": 8.660688616606202e-05, + "loss": 0.7357, + "step": 10534 + }, + { + "epoch": 0.7138017480859137, + "grad_norm": 9.411799430847168, + "learning_rate": 8.66055171469642e-05, + "loss": 0.5209, + "step": 10535 + }, + { + "epoch": 0.7138695033538858, + "grad_norm": 5.223617076873779, + "learning_rate": 8.660414812786639e-05, + "loss": 0.7623, + "step": 10536 + }, + { + "epoch": 0.7139372586218579, + "grad_norm": 8.094182014465332, + "learning_rate": 8.660277910876857e-05, + "loss": 1.0194, + "step": 10537 + }, + { + "epoch": 0.71400501388983, + "grad_norm": 5.444286823272705, + "learning_rate": 8.660141008967075e-05, + "loss": 0.8126, + "step": 10538 + }, + { + "epoch": 0.714072769157802, + "grad_norm": 4.902561664581299, + "learning_rate": 8.660004107057293e-05, + "loss": 0.6524, + "step": 10539 + }, + { + "epoch": 0.7141405244257741, + "grad_norm": 7.155951023101807, + "learning_rate": 8.659867205147513e-05, + "loss": 0.9928, + "step": 10540 + }, + { + "epoch": 0.7142082796937462, + "grad_norm": 6.7633538246154785, + "learning_rate": 8.65973030323773e-05, + "loss": 1.0564, + "step": 10541 + }, + { + "epoch": 0.7142760349617183, + "grad_norm": 6.050258636474609, + "learning_rate": 8.659593401327949e-05, + "loss": 0.8051, + "step": 10542 + }, + { + "epoch": 0.7143437902296904, + "grad_norm": 4.88824987411499, + "learning_rate": 8.659456499418167e-05, + "loss": 0.6135, + "step": 10543 + }, + { + "epoch": 0.7144115454976624, + "grad_norm": 5.773684501647949, + "learning_rate": 8.659319597508385e-05, + "loss": 0.5467, + "step": 10544 + }, + { + "epoch": 0.7144793007656345, + "grad_norm": 7.082754611968994, + "learning_rate": 8.659182695598604e-05, + "loss": 0.7677, + "step": 10545 + }, + { + "epoch": 0.7145470560336066, + "grad_norm": 5.4242424964904785, + "learning_rate": 8.659045793688822e-05, + "loss": 0.7915, + "step": 10546 + }, + { + "epoch": 0.7146148113015787, + "grad_norm": 5.280063152313232, + "learning_rate": 8.65890889177904e-05, + "loss": 0.5614, + "step": 10547 + }, + { + "epoch": 0.7146825665695508, + "grad_norm": 6.720800876617432, + "learning_rate": 8.658771989869258e-05, + "loss": 1.064, + "step": 10548 + }, + { + "epoch": 0.7147503218375229, + "grad_norm": 7.908580303192139, + "learning_rate": 8.658635087959478e-05, + "loss": 0.7564, + "step": 10549 + }, + { + "epoch": 0.7148180771054949, + "grad_norm": 6.164776802062988, + "learning_rate": 8.658498186049696e-05, + "loss": 0.8359, + "step": 10550 + }, + { + "epoch": 0.714885832373467, + "grad_norm": 5.345958709716797, + "learning_rate": 8.658361284139914e-05, + "loss": 0.9018, + "step": 10551 + }, + { + "epoch": 0.7149535876414391, + "grad_norm": 6.751514911651611, + "learning_rate": 8.658224382230132e-05, + "loss": 0.9795, + "step": 10552 + }, + { + "epoch": 0.7150213429094112, + "grad_norm": 5.850390911102295, + "learning_rate": 8.658087480320351e-05, + "loss": 0.8055, + "step": 10553 + }, + { + "epoch": 0.7150890981773833, + "grad_norm": 6.081165790557861, + "learning_rate": 8.657950578410569e-05, + "loss": 0.7952, + "step": 10554 + }, + { + "epoch": 0.7151568534453554, + "grad_norm": 5.211760997772217, + "learning_rate": 8.657813676500787e-05, + "loss": 0.6467, + "step": 10555 + }, + { + "epoch": 0.7152246087133275, + "grad_norm": 7.24871826171875, + "learning_rate": 8.657676774591007e-05, + "loss": 0.8434, + "step": 10556 + }, + { + "epoch": 0.7152923639812996, + "grad_norm": 6.2204413414001465, + "learning_rate": 8.657539872681225e-05, + "loss": 0.7948, + "step": 10557 + }, + { + "epoch": 0.7153601192492717, + "grad_norm": 6.038403511047363, + "learning_rate": 8.657402970771443e-05, + "loss": 0.69, + "step": 10558 + }, + { + "epoch": 0.7154278745172438, + "grad_norm": 6.616792678833008, + "learning_rate": 8.657266068861662e-05, + "loss": 0.9007, + "step": 10559 + }, + { + "epoch": 0.7154956297852157, + "grad_norm": 6.901274681091309, + "learning_rate": 8.65712916695188e-05, + "loss": 0.8176, + "step": 10560 + }, + { + "epoch": 0.7155633850531878, + "grad_norm": 6.145236015319824, + "learning_rate": 8.656992265042098e-05, + "loss": 0.6697, + "step": 10561 + }, + { + "epoch": 0.7156311403211599, + "grad_norm": 6.30226993560791, + "learning_rate": 8.656855363132316e-05, + "loss": 0.7585, + "step": 10562 + }, + { + "epoch": 0.715698895589132, + "grad_norm": 5.349961757659912, + "learning_rate": 8.656718461222535e-05, + "loss": 0.6811, + "step": 10563 + }, + { + "epoch": 0.7157666508571041, + "grad_norm": 6.9230170249938965, + "learning_rate": 8.656581559312753e-05, + "loss": 0.7924, + "step": 10564 + }, + { + "epoch": 0.7158344061250762, + "grad_norm": 6.30393123626709, + "learning_rate": 8.656444657402972e-05, + "loss": 0.8558, + "step": 10565 + }, + { + "epoch": 0.7159021613930483, + "grad_norm": 7.642063617706299, + "learning_rate": 8.65630775549319e-05, + "loss": 0.8416, + "step": 10566 + }, + { + "epoch": 0.7159699166610204, + "grad_norm": 6.944372653961182, + "learning_rate": 8.656170853583408e-05, + "loss": 1.0213, + "step": 10567 + }, + { + "epoch": 0.7160376719289925, + "grad_norm": 6.925499439239502, + "learning_rate": 8.656033951673627e-05, + "loss": 0.8486, + "step": 10568 + }, + { + "epoch": 0.7161054271969646, + "grad_norm": 5.875875949859619, + "learning_rate": 8.655897049763845e-05, + "loss": 0.8515, + "step": 10569 + }, + { + "epoch": 0.7161731824649367, + "grad_norm": 9.030150413513184, + "learning_rate": 8.655760147854063e-05, + "loss": 0.9014, + "step": 10570 + }, + { + "epoch": 0.7162409377329088, + "grad_norm": 5.825559139251709, + "learning_rate": 8.655623245944281e-05, + "loss": 0.8333, + "step": 10571 + }, + { + "epoch": 0.7163086930008808, + "grad_norm": 5.936555862426758, + "learning_rate": 8.6554863440345e-05, + "loss": 0.6806, + "step": 10572 + }, + { + "epoch": 0.7163764482688529, + "grad_norm": 5.446226596832275, + "learning_rate": 8.655349442124719e-05, + "loss": 0.687, + "step": 10573 + }, + { + "epoch": 0.716444203536825, + "grad_norm": 7.467900276184082, + "learning_rate": 8.655212540214937e-05, + "loss": 1.1386, + "step": 10574 + }, + { + "epoch": 0.7165119588047971, + "grad_norm": 6.0190534591674805, + "learning_rate": 8.655075638305155e-05, + "loss": 0.9846, + "step": 10575 + }, + { + "epoch": 0.7165797140727691, + "grad_norm": 6.0063252449035645, + "learning_rate": 8.654938736395373e-05, + "loss": 0.7755, + "step": 10576 + }, + { + "epoch": 0.7166474693407412, + "grad_norm": 7.271022796630859, + "learning_rate": 8.654801834485592e-05, + "loss": 0.8129, + "step": 10577 + }, + { + "epoch": 0.7167152246087133, + "grad_norm": 5.204225063323975, + "learning_rate": 8.65466493257581e-05, + "loss": 0.707, + "step": 10578 + }, + { + "epoch": 0.7167829798766854, + "grad_norm": 6.814970016479492, + "learning_rate": 8.654528030666028e-05, + "loss": 0.9433, + "step": 10579 + }, + { + "epoch": 0.7168507351446575, + "grad_norm": 9.155210494995117, + "learning_rate": 8.654391128756246e-05, + "loss": 0.8058, + "step": 10580 + }, + { + "epoch": 0.7169184904126296, + "grad_norm": 6.666374683380127, + "learning_rate": 8.654254226846465e-05, + "loss": 0.9789, + "step": 10581 + }, + { + "epoch": 0.7169862456806017, + "grad_norm": 5.498271465301514, + "learning_rate": 8.654117324936684e-05, + "loss": 0.7266, + "step": 10582 + }, + { + "epoch": 0.7170540009485737, + "grad_norm": 6.632149696350098, + "learning_rate": 8.653980423026902e-05, + "loss": 0.8929, + "step": 10583 + }, + { + "epoch": 0.7171217562165458, + "grad_norm": 6.820444107055664, + "learning_rate": 8.65384352111712e-05, + "loss": 0.7493, + "step": 10584 + }, + { + "epoch": 0.7171895114845179, + "grad_norm": 9.759723663330078, + "learning_rate": 8.653706619207338e-05, + "loss": 0.8465, + "step": 10585 + }, + { + "epoch": 0.71725726675249, + "grad_norm": 6.131860256195068, + "learning_rate": 8.653569717297557e-05, + "loss": 0.8678, + "step": 10586 + }, + { + "epoch": 0.7173250220204621, + "grad_norm": 5.567459583282471, + "learning_rate": 8.653432815387775e-05, + "loss": 0.7342, + "step": 10587 + }, + { + "epoch": 0.7173927772884342, + "grad_norm": 4.433963775634766, + "learning_rate": 8.653295913477993e-05, + "loss": 0.7901, + "step": 10588 + }, + { + "epoch": 0.7174605325564063, + "grad_norm": 5.557954788208008, + "learning_rate": 8.653159011568211e-05, + "loss": 0.6189, + "step": 10589 + }, + { + "epoch": 0.7175282878243784, + "grad_norm": 6.555310249328613, + "learning_rate": 8.653022109658429e-05, + "loss": 0.8554, + "step": 10590 + }, + { + "epoch": 0.7175960430923505, + "grad_norm": 7.396895408630371, + "learning_rate": 8.652885207748649e-05, + "loss": 0.9261, + "step": 10591 + }, + { + "epoch": 0.7176637983603226, + "grad_norm": 5.6380181312561035, + "learning_rate": 8.652748305838867e-05, + "loss": 0.7968, + "step": 10592 + }, + { + "epoch": 0.7177315536282945, + "grad_norm": 5.689277648925781, + "learning_rate": 8.652611403929085e-05, + "loss": 0.7951, + "step": 10593 + }, + { + "epoch": 0.7177993088962666, + "grad_norm": 5.647032737731934, + "learning_rate": 8.652474502019303e-05, + "loss": 0.8273, + "step": 10594 + }, + { + "epoch": 0.7178670641642387, + "grad_norm": 6.165719985961914, + "learning_rate": 8.652337600109522e-05, + "loss": 0.7203, + "step": 10595 + }, + { + "epoch": 0.7179348194322108, + "grad_norm": 5.114332675933838, + "learning_rate": 8.65220069819974e-05, + "loss": 0.6582, + "step": 10596 + }, + { + "epoch": 0.7180025747001829, + "grad_norm": 6.3832879066467285, + "learning_rate": 8.652063796289958e-05, + "loss": 0.8028, + "step": 10597 + }, + { + "epoch": 0.718070329968155, + "grad_norm": 5.82213020324707, + "learning_rate": 8.651926894380176e-05, + "loss": 0.5738, + "step": 10598 + }, + { + "epoch": 0.7181380852361271, + "grad_norm": 6.337172031402588, + "learning_rate": 8.651789992470396e-05, + "loss": 0.8797, + "step": 10599 + }, + { + "epoch": 0.7182058405040992, + "grad_norm": 5.312211513519287, + "learning_rate": 8.651653090560614e-05, + "loss": 0.732, + "step": 10600 + }, + { + "epoch": 0.7182735957720713, + "grad_norm": 8.132328033447266, + "learning_rate": 8.651516188650832e-05, + "loss": 0.9214, + "step": 10601 + }, + { + "epoch": 0.7183413510400434, + "grad_norm": 6.073488235473633, + "learning_rate": 8.651379286741051e-05, + "loss": 0.7914, + "step": 10602 + }, + { + "epoch": 0.7184091063080155, + "grad_norm": 4.74514102935791, + "learning_rate": 8.651242384831269e-05, + "loss": 0.7396, + "step": 10603 + }, + { + "epoch": 0.7184768615759876, + "grad_norm": 6.970630645751953, + "learning_rate": 8.651105482921487e-05, + "loss": 0.6759, + "step": 10604 + }, + { + "epoch": 0.7185446168439596, + "grad_norm": 5.5301408767700195, + "learning_rate": 8.650968581011706e-05, + "loss": 0.7496, + "step": 10605 + }, + { + "epoch": 0.7186123721119317, + "grad_norm": 9.243334770202637, + "learning_rate": 8.650831679101924e-05, + "loss": 0.9962, + "step": 10606 + }, + { + "epoch": 0.7186801273799038, + "grad_norm": 4.740606784820557, + "learning_rate": 8.650694777192143e-05, + "loss": 0.5528, + "step": 10607 + }, + { + "epoch": 0.7187478826478759, + "grad_norm": 6.146499156951904, + "learning_rate": 8.65055787528236e-05, + "loss": 0.8339, + "step": 10608 + }, + { + "epoch": 0.7188156379158479, + "grad_norm": 6.534127235412598, + "learning_rate": 8.65042097337258e-05, + "loss": 1.1163, + "step": 10609 + }, + { + "epoch": 0.71888339318382, + "grad_norm": 8.349403381347656, + "learning_rate": 8.650284071462798e-05, + "loss": 0.8872, + "step": 10610 + }, + { + "epoch": 0.7189511484517921, + "grad_norm": 4.883057117462158, + "learning_rate": 8.650147169553016e-05, + "loss": 0.7221, + "step": 10611 + }, + { + "epoch": 0.7190189037197642, + "grad_norm": 9.08403491973877, + "learning_rate": 8.650010267643234e-05, + "loss": 0.9264, + "step": 10612 + }, + { + "epoch": 0.7190866589877363, + "grad_norm": 8.753477096557617, + "learning_rate": 8.649873365733452e-05, + "loss": 0.715, + "step": 10613 + }, + { + "epoch": 0.7191544142557084, + "grad_norm": 6.945448875427246, + "learning_rate": 8.649736463823671e-05, + "loss": 0.7706, + "step": 10614 + }, + { + "epoch": 0.7192221695236805, + "grad_norm": 6.655423164367676, + "learning_rate": 8.64959956191389e-05, + "loss": 0.7832, + "step": 10615 + }, + { + "epoch": 0.7192899247916525, + "grad_norm": 6.322832107543945, + "learning_rate": 8.649462660004108e-05, + "loss": 0.8017, + "step": 10616 + }, + { + "epoch": 0.7193576800596246, + "grad_norm": 6.454827785491943, + "learning_rate": 8.649325758094326e-05, + "loss": 0.7492, + "step": 10617 + }, + { + "epoch": 0.7194254353275967, + "grad_norm": 7.011631011962891, + "learning_rate": 8.649188856184545e-05, + "loss": 0.7562, + "step": 10618 + }, + { + "epoch": 0.7194931905955688, + "grad_norm": 6.621539115905762, + "learning_rate": 8.649051954274763e-05, + "loss": 0.76, + "step": 10619 + }, + { + "epoch": 0.7195609458635409, + "grad_norm": 8.84100341796875, + "learning_rate": 8.648915052364981e-05, + "loss": 0.9858, + "step": 10620 + }, + { + "epoch": 0.719628701131513, + "grad_norm": 6.356812000274658, + "learning_rate": 8.648778150455199e-05, + "loss": 0.7553, + "step": 10621 + }, + { + "epoch": 0.7196964563994851, + "grad_norm": 6.760133743286133, + "learning_rate": 8.648641248545417e-05, + "loss": 0.8642, + "step": 10622 + }, + { + "epoch": 0.7197642116674572, + "grad_norm": 6.104750633239746, + "learning_rate": 8.648504346635636e-05, + "loss": 0.6905, + "step": 10623 + }, + { + "epoch": 0.7198319669354293, + "grad_norm": 6.207709312438965, + "learning_rate": 8.648367444725855e-05, + "loss": 0.8549, + "step": 10624 + }, + { + "epoch": 0.7198997222034013, + "grad_norm": 6.436330795288086, + "learning_rate": 8.648230542816073e-05, + "loss": 0.7064, + "step": 10625 + }, + { + "epoch": 0.7199674774713734, + "grad_norm": 5.475677967071533, + "learning_rate": 8.64809364090629e-05, + "loss": 0.6146, + "step": 10626 + }, + { + "epoch": 0.7200352327393454, + "grad_norm": 9.833735466003418, + "learning_rate": 8.64795673899651e-05, + "loss": 0.7222, + "step": 10627 + }, + { + "epoch": 0.7201029880073175, + "grad_norm": 8.227372169494629, + "learning_rate": 8.647819837086728e-05, + "loss": 0.9564, + "step": 10628 + }, + { + "epoch": 0.7201707432752896, + "grad_norm": 7.26641321182251, + "learning_rate": 8.647682935176946e-05, + "loss": 0.796, + "step": 10629 + }, + { + "epoch": 0.7202384985432617, + "grad_norm": 6.712799549102783, + "learning_rate": 8.647546033267164e-05, + "loss": 0.9736, + "step": 10630 + }, + { + "epoch": 0.7203062538112338, + "grad_norm": 6.906972885131836, + "learning_rate": 8.647409131357382e-05, + "loss": 0.8028, + "step": 10631 + }, + { + "epoch": 0.7203740090792059, + "grad_norm": 6.211350440979004, + "learning_rate": 8.647272229447601e-05, + "loss": 0.7953, + "step": 10632 + }, + { + "epoch": 0.720441764347178, + "grad_norm": 7.281525611877441, + "learning_rate": 8.64713532753782e-05, + "loss": 0.7667, + "step": 10633 + }, + { + "epoch": 0.7205095196151501, + "grad_norm": 6.922200679779053, + "learning_rate": 8.646998425628038e-05, + "loss": 0.9156, + "step": 10634 + }, + { + "epoch": 0.7205772748831222, + "grad_norm": 4.672682762145996, + "learning_rate": 8.646861523718256e-05, + "loss": 0.6602, + "step": 10635 + }, + { + "epoch": 0.7206450301510943, + "grad_norm": 6.199947834014893, + "learning_rate": 8.646724621808474e-05, + "loss": 1.058, + "step": 10636 + }, + { + "epoch": 0.7207127854190664, + "grad_norm": 6.395276069641113, + "learning_rate": 8.646587719898693e-05, + "loss": 0.834, + "step": 10637 + }, + { + "epoch": 0.7207805406870385, + "grad_norm": 5.92854118347168, + "learning_rate": 8.646450817988911e-05, + "loss": 0.9097, + "step": 10638 + }, + { + "epoch": 0.7208482959550105, + "grad_norm": 9.13015079498291, + "learning_rate": 8.646313916079129e-05, + "loss": 0.7069, + "step": 10639 + }, + { + "epoch": 0.7209160512229826, + "grad_norm": 5.72170877456665, + "learning_rate": 8.646177014169347e-05, + "loss": 0.9316, + "step": 10640 + }, + { + "epoch": 0.7209838064909547, + "grad_norm": 7.367129325866699, + "learning_rate": 8.646040112259567e-05, + "loss": 0.7785, + "step": 10641 + }, + { + "epoch": 0.7210515617589267, + "grad_norm": 5.120598316192627, + "learning_rate": 8.645903210349785e-05, + "loss": 0.6656, + "step": 10642 + }, + { + "epoch": 0.7211193170268988, + "grad_norm": 6.610129356384277, + "learning_rate": 8.645766308440003e-05, + "loss": 0.9131, + "step": 10643 + }, + { + "epoch": 0.7211870722948709, + "grad_norm": 6.49082612991333, + "learning_rate": 8.64562940653022e-05, + "loss": 0.8923, + "step": 10644 + }, + { + "epoch": 0.721254827562843, + "grad_norm": 7.010980129241943, + "learning_rate": 8.64549250462044e-05, + "loss": 0.794, + "step": 10645 + }, + { + "epoch": 0.7213225828308151, + "grad_norm": 7.219010829925537, + "learning_rate": 8.645355602710658e-05, + "loss": 0.9228, + "step": 10646 + }, + { + "epoch": 0.7213903380987872, + "grad_norm": 5.3610053062438965, + "learning_rate": 8.645218700800876e-05, + "loss": 0.7479, + "step": 10647 + }, + { + "epoch": 0.7214580933667593, + "grad_norm": 6.72417688369751, + "learning_rate": 8.645081798891095e-05, + "loss": 0.7015, + "step": 10648 + }, + { + "epoch": 0.7215258486347313, + "grad_norm": 6.321094989776611, + "learning_rate": 8.644944896981313e-05, + "loss": 0.7247, + "step": 10649 + }, + { + "epoch": 0.7215936039027034, + "grad_norm": 6.939053058624268, + "learning_rate": 8.644807995071532e-05, + "loss": 0.9851, + "step": 10650 + }, + { + "epoch": 0.7216613591706755, + "grad_norm": 7.304567337036133, + "learning_rate": 8.644671093161751e-05, + "loss": 0.9053, + "step": 10651 + }, + { + "epoch": 0.7217291144386476, + "grad_norm": 7.707671165466309, + "learning_rate": 8.644534191251969e-05, + "loss": 0.903, + "step": 10652 + }, + { + "epoch": 0.7217968697066197, + "grad_norm": 8.089873313903809, + "learning_rate": 8.644397289342187e-05, + "loss": 0.8698, + "step": 10653 + }, + { + "epoch": 0.7218646249745918, + "grad_norm": 7.8891119956970215, + "learning_rate": 8.644260387432405e-05, + "loss": 0.9856, + "step": 10654 + }, + { + "epoch": 0.7219323802425639, + "grad_norm": 5.457139015197754, + "learning_rate": 8.644123485522624e-05, + "loss": 0.6027, + "step": 10655 + }, + { + "epoch": 0.722000135510536, + "grad_norm": 5.993939399719238, + "learning_rate": 8.643986583612842e-05, + "loss": 0.8584, + "step": 10656 + }, + { + "epoch": 0.7220678907785081, + "grad_norm": 5.376394271850586, + "learning_rate": 8.64384968170306e-05, + "loss": 0.6551, + "step": 10657 + }, + { + "epoch": 0.7221356460464801, + "grad_norm": 7.0075249671936035, + "learning_rate": 8.643712779793279e-05, + "loss": 0.6486, + "step": 10658 + }, + { + "epoch": 0.7222034013144522, + "grad_norm": 6.753172397613525, + "learning_rate": 8.643575877883498e-05, + "loss": 1.0099, + "step": 10659 + }, + { + "epoch": 0.7222711565824242, + "grad_norm": 8.42198371887207, + "learning_rate": 8.643438975973716e-05, + "loss": 0.8545, + "step": 10660 + }, + { + "epoch": 0.7223389118503963, + "grad_norm": 9.268589973449707, + "learning_rate": 8.643302074063934e-05, + "loss": 1.0352, + "step": 10661 + }, + { + "epoch": 0.7224066671183684, + "grad_norm": 6.209371566772461, + "learning_rate": 8.643165172154152e-05, + "loss": 0.8474, + "step": 10662 + }, + { + "epoch": 0.7224744223863405, + "grad_norm": 10.753402709960938, + "learning_rate": 8.64302827024437e-05, + "loss": 0.8922, + "step": 10663 + }, + { + "epoch": 0.7225421776543126, + "grad_norm": 7.065412998199463, + "learning_rate": 8.64289136833459e-05, + "loss": 0.7056, + "step": 10664 + }, + { + "epoch": 0.7226099329222847, + "grad_norm": 5.867188930511475, + "learning_rate": 8.642754466424807e-05, + "loss": 0.6052, + "step": 10665 + }, + { + "epoch": 0.7226776881902568, + "grad_norm": 7.415475368499756, + "learning_rate": 8.642617564515025e-05, + "loss": 0.8593, + "step": 10666 + }, + { + "epoch": 0.7227454434582289, + "grad_norm": 6.486458778381348, + "learning_rate": 8.642480662605244e-05, + "loss": 0.8857, + "step": 10667 + }, + { + "epoch": 0.722813198726201, + "grad_norm": 8.942933082580566, + "learning_rate": 8.642343760695462e-05, + "loss": 0.8, + "step": 10668 + }, + { + "epoch": 0.7228809539941731, + "grad_norm": 4.676167011260986, + "learning_rate": 8.642206858785681e-05, + "loss": 0.8456, + "step": 10669 + }, + { + "epoch": 0.7229487092621452, + "grad_norm": 6.750422477722168, + "learning_rate": 8.642069956875899e-05, + "loss": 0.9438, + "step": 10670 + }, + { + "epoch": 0.7230164645301173, + "grad_norm": 8.17405891418457, + "learning_rate": 8.641933054966117e-05, + "loss": 0.9946, + "step": 10671 + }, + { + "epoch": 0.7230842197980893, + "grad_norm": 7.05765438079834, + "learning_rate": 8.641796153056335e-05, + "loss": 1.0896, + "step": 10672 + }, + { + "epoch": 0.7231519750660614, + "grad_norm": 5.9634857177734375, + "learning_rate": 8.641659251146554e-05, + "loss": 0.7975, + "step": 10673 + }, + { + "epoch": 0.7232197303340334, + "grad_norm": 5.722130298614502, + "learning_rate": 8.641522349236772e-05, + "loss": 0.7891, + "step": 10674 + }, + { + "epoch": 0.7232874856020055, + "grad_norm": 6.058647155761719, + "learning_rate": 8.64138544732699e-05, + "loss": 0.9987, + "step": 10675 + }, + { + "epoch": 0.7233552408699776, + "grad_norm": 8.651153564453125, + "learning_rate": 8.641248545417209e-05, + "loss": 0.8333, + "step": 10676 + }, + { + "epoch": 0.7234229961379497, + "grad_norm": 5.720202445983887, + "learning_rate": 8.641111643507427e-05, + "loss": 1.0346, + "step": 10677 + }, + { + "epoch": 0.7234907514059218, + "grad_norm": 5.499077796936035, + "learning_rate": 8.640974741597646e-05, + "loss": 0.6993, + "step": 10678 + }, + { + "epoch": 0.7235585066738939, + "grad_norm": 4.752992153167725, + "learning_rate": 8.640837839687864e-05, + "loss": 0.6134, + "step": 10679 + }, + { + "epoch": 0.723626261941866, + "grad_norm": 5.855991363525391, + "learning_rate": 8.640700937778082e-05, + "loss": 0.7177, + "step": 10680 + }, + { + "epoch": 0.7236940172098381, + "grad_norm": 6.163865566253662, + "learning_rate": 8.6405640358683e-05, + "loss": 0.8502, + "step": 10681 + }, + { + "epoch": 0.7237617724778102, + "grad_norm": 9.418116569519043, + "learning_rate": 8.64042713395852e-05, + "loss": 1.1688, + "step": 10682 + }, + { + "epoch": 0.7238295277457822, + "grad_norm": 6.628981113433838, + "learning_rate": 8.640290232048737e-05, + "loss": 0.6387, + "step": 10683 + }, + { + "epoch": 0.7238972830137543, + "grad_norm": 5.6346659660339355, + "learning_rate": 8.640153330138956e-05, + "loss": 1.1043, + "step": 10684 + }, + { + "epoch": 0.7239650382817264, + "grad_norm": 6.523744583129883, + "learning_rate": 8.640016428229174e-05, + "loss": 0.8915, + "step": 10685 + }, + { + "epoch": 0.7240327935496985, + "grad_norm": 5.4516167640686035, + "learning_rate": 8.639879526319392e-05, + "loss": 0.6427, + "step": 10686 + }, + { + "epoch": 0.7241005488176706, + "grad_norm": 6.612290382385254, + "learning_rate": 8.639742624409611e-05, + "loss": 0.8633, + "step": 10687 + }, + { + "epoch": 0.7241683040856427, + "grad_norm": 5.145784854888916, + "learning_rate": 8.639605722499829e-05, + "loss": 0.7764, + "step": 10688 + }, + { + "epoch": 0.7242360593536148, + "grad_norm": 5.991262912750244, + "learning_rate": 8.639468820590047e-05, + "loss": 0.723, + "step": 10689 + }, + { + "epoch": 0.7243038146215869, + "grad_norm": 5.2909955978393555, + "learning_rate": 8.639331918680265e-05, + "loss": 0.6494, + "step": 10690 + }, + { + "epoch": 0.7243715698895589, + "grad_norm": 5.02228307723999, + "learning_rate": 8.639195016770484e-05, + "loss": 0.5259, + "step": 10691 + }, + { + "epoch": 0.724439325157531, + "grad_norm": 7.383895397186279, + "learning_rate": 8.639058114860703e-05, + "loss": 0.8099, + "step": 10692 + }, + { + "epoch": 0.724507080425503, + "grad_norm": 7.651692867279053, + "learning_rate": 8.63892121295092e-05, + "loss": 0.9502, + "step": 10693 + }, + { + "epoch": 0.7245748356934751, + "grad_norm": 7.732839107513428, + "learning_rate": 8.63878431104114e-05, + "loss": 0.6534, + "step": 10694 + }, + { + "epoch": 0.7246425909614472, + "grad_norm": 6.229733467102051, + "learning_rate": 8.638647409131358e-05, + "loss": 0.9293, + "step": 10695 + }, + { + "epoch": 0.7247103462294193, + "grad_norm": 6.323513507843018, + "learning_rate": 8.638510507221576e-05, + "loss": 0.7275, + "step": 10696 + }, + { + "epoch": 0.7247781014973914, + "grad_norm": 4.998154163360596, + "learning_rate": 8.638373605311795e-05, + "loss": 0.7756, + "step": 10697 + }, + { + "epoch": 0.7248458567653635, + "grad_norm": 5.609971046447754, + "learning_rate": 8.638236703402013e-05, + "loss": 0.9864, + "step": 10698 + }, + { + "epoch": 0.7249136120333356, + "grad_norm": 8.138459205627441, + "learning_rate": 8.638099801492231e-05, + "loss": 0.8453, + "step": 10699 + }, + { + "epoch": 0.7249813673013077, + "grad_norm": 6.91035795211792, + "learning_rate": 8.63796289958245e-05, + "loss": 0.7852, + "step": 10700 + }, + { + "epoch": 0.7250491225692798, + "grad_norm": 5.772835731506348, + "learning_rate": 8.637825997672669e-05, + "loss": 0.7951, + "step": 10701 + }, + { + "epoch": 0.7251168778372519, + "grad_norm": 7.034023761749268, + "learning_rate": 8.637689095762887e-05, + "loss": 0.9835, + "step": 10702 + }, + { + "epoch": 0.725184633105224, + "grad_norm": 6.605203628540039, + "learning_rate": 8.637552193853105e-05, + "loss": 0.6946, + "step": 10703 + }, + { + "epoch": 0.725252388373196, + "grad_norm": 6.754047870635986, + "learning_rate": 8.637415291943323e-05, + "loss": 0.605, + "step": 10704 + }, + { + "epoch": 0.7253201436411681, + "grad_norm": 5.961748123168945, + "learning_rate": 8.637278390033542e-05, + "loss": 0.8438, + "step": 10705 + }, + { + "epoch": 0.7253878989091402, + "grad_norm": 5.660187721252441, + "learning_rate": 8.63714148812376e-05, + "loss": 0.7882, + "step": 10706 + }, + { + "epoch": 0.7254556541771122, + "grad_norm": 6.974256992340088, + "learning_rate": 8.637004586213978e-05, + "loss": 0.9528, + "step": 10707 + }, + { + "epoch": 0.7255234094450843, + "grad_norm": 6.0205183029174805, + "learning_rate": 8.636867684304196e-05, + "loss": 0.8064, + "step": 10708 + }, + { + "epoch": 0.7255911647130564, + "grad_norm": 5.7911057472229, + "learning_rate": 8.636730782394415e-05, + "loss": 0.7973, + "step": 10709 + }, + { + "epoch": 0.7256589199810285, + "grad_norm": 6.384799480438232, + "learning_rate": 8.636593880484634e-05, + "loss": 0.8016, + "step": 10710 + }, + { + "epoch": 0.7257266752490006, + "grad_norm": 5.134740352630615, + "learning_rate": 8.636456978574852e-05, + "loss": 0.7034, + "step": 10711 + }, + { + "epoch": 0.7257944305169727, + "grad_norm": 6.371201992034912, + "learning_rate": 8.63632007666507e-05, + "loss": 0.8934, + "step": 10712 + }, + { + "epoch": 0.7258621857849448, + "grad_norm": 9.626450538635254, + "learning_rate": 8.636183174755288e-05, + "loss": 0.9802, + "step": 10713 + }, + { + "epoch": 0.7259299410529169, + "grad_norm": 7.453325271606445, + "learning_rate": 8.636046272845507e-05, + "loss": 0.7881, + "step": 10714 + }, + { + "epoch": 0.725997696320889, + "grad_norm": 6.08189582824707, + "learning_rate": 8.635909370935725e-05, + "loss": 0.8223, + "step": 10715 + }, + { + "epoch": 0.726065451588861, + "grad_norm": 6.120866298675537, + "learning_rate": 8.635772469025943e-05, + "loss": 0.7251, + "step": 10716 + }, + { + "epoch": 0.7261332068568331, + "grad_norm": 5.628901481628418, + "learning_rate": 8.635635567116161e-05, + "loss": 0.7254, + "step": 10717 + }, + { + "epoch": 0.7262009621248052, + "grad_norm": 5.944281101226807, + "learning_rate": 8.63549866520638e-05, + "loss": 0.9566, + "step": 10718 + }, + { + "epoch": 0.7262687173927773, + "grad_norm": 5.806936740875244, + "learning_rate": 8.635361763296599e-05, + "loss": 0.7517, + "step": 10719 + }, + { + "epoch": 0.7263364726607494, + "grad_norm": 6.54838228225708, + "learning_rate": 8.635224861386817e-05, + "loss": 0.6679, + "step": 10720 + }, + { + "epoch": 0.7264042279287215, + "grad_norm": 6.255834102630615, + "learning_rate": 8.635087959477035e-05, + "loss": 0.5863, + "step": 10721 + }, + { + "epoch": 0.7264719831966936, + "grad_norm": 6.097255706787109, + "learning_rate": 8.634951057567253e-05, + "loss": 0.7183, + "step": 10722 + }, + { + "epoch": 0.7265397384646656, + "grad_norm": 8.153336524963379, + "learning_rate": 8.634814155657471e-05, + "loss": 0.89, + "step": 10723 + }, + { + "epoch": 0.7266074937326377, + "grad_norm": 5.64036226272583, + "learning_rate": 8.63467725374769e-05, + "loss": 0.7711, + "step": 10724 + }, + { + "epoch": 0.7266752490006098, + "grad_norm": 5.449916362762451, + "learning_rate": 8.634540351837908e-05, + "loss": 0.7613, + "step": 10725 + }, + { + "epoch": 0.7267430042685818, + "grad_norm": 5.611260890960693, + "learning_rate": 8.634403449928127e-05, + "loss": 0.7947, + "step": 10726 + }, + { + "epoch": 0.7268107595365539, + "grad_norm": 6.064743518829346, + "learning_rate": 8.634266548018345e-05, + "loss": 0.7733, + "step": 10727 + }, + { + "epoch": 0.726878514804526, + "grad_norm": 6.760382175445557, + "learning_rate": 8.634129646108564e-05, + "loss": 0.7532, + "step": 10728 + }, + { + "epoch": 0.7269462700724981, + "grad_norm": 6.390462398529053, + "learning_rate": 8.633992744198782e-05, + "loss": 0.827, + "step": 10729 + }, + { + "epoch": 0.7270140253404702, + "grad_norm": 5.5772881507873535, + "learning_rate": 8.633855842289e-05, + "loss": 0.6548, + "step": 10730 + }, + { + "epoch": 0.7270817806084423, + "grad_norm": 6.615449905395508, + "learning_rate": 8.633718940379218e-05, + "loss": 0.7181, + "step": 10731 + }, + { + "epoch": 0.7271495358764144, + "grad_norm": 5.905831336975098, + "learning_rate": 8.633582038469436e-05, + "loss": 0.8118, + "step": 10732 + }, + { + "epoch": 0.7272172911443865, + "grad_norm": 8.367280006408691, + "learning_rate": 8.633445136559655e-05, + "loss": 0.895, + "step": 10733 + }, + { + "epoch": 0.7272850464123586, + "grad_norm": 5.006227493286133, + "learning_rate": 8.633308234649873e-05, + "loss": 0.7743, + "step": 10734 + }, + { + "epoch": 0.7273528016803307, + "grad_norm": 5.736496448516846, + "learning_rate": 8.633171332740092e-05, + "loss": 0.7135, + "step": 10735 + }, + { + "epoch": 0.7274205569483028, + "grad_norm": 6.69479513168335, + "learning_rate": 8.63303443083031e-05, + "loss": 0.9781, + "step": 10736 + }, + { + "epoch": 0.7274883122162749, + "grad_norm": 5.120262622833252, + "learning_rate": 8.632897528920529e-05, + "loss": 0.703, + "step": 10737 + }, + { + "epoch": 0.727556067484247, + "grad_norm": 5.343075275421143, + "learning_rate": 8.632760627010747e-05, + "loss": 0.7495, + "step": 10738 + }, + { + "epoch": 0.727623822752219, + "grad_norm": 5.489655494689941, + "learning_rate": 8.632623725100965e-05, + "loss": 0.7753, + "step": 10739 + }, + { + "epoch": 0.727691578020191, + "grad_norm": 5.322958469390869, + "learning_rate": 8.632486823191184e-05, + "loss": 0.7566, + "step": 10740 + }, + { + "epoch": 0.7277593332881631, + "grad_norm": 8.018611907958984, + "learning_rate": 8.632349921281402e-05, + "loss": 0.8898, + "step": 10741 + }, + { + "epoch": 0.7278270885561352, + "grad_norm": 5.888221740722656, + "learning_rate": 8.63221301937162e-05, + "loss": 0.7483, + "step": 10742 + }, + { + "epoch": 0.7278948438241073, + "grad_norm": 7.260030746459961, + "learning_rate": 8.63207611746184e-05, + "loss": 0.826, + "step": 10743 + }, + { + "epoch": 0.7279625990920794, + "grad_norm": 4.39701509475708, + "learning_rate": 8.631939215552058e-05, + "loss": 0.7154, + "step": 10744 + }, + { + "epoch": 0.7280303543600515, + "grad_norm": 7.766528606414795, + "learning_rate": 8.631802313642276e-05, + "loss": 0.9013, + "step": 10745 + }, + { + "epoch": 0.7280981096280236, + "grad_norm": 6.7016167640686035, + "learning_rate": 8.631665411732494e-05, + "loss": 0.6431, + "step": 10746 + }, + { + "epoch": 0.7281658648959957, + "grad_norm": 7.331559181213379, + "learning_rate": 8.631528509822713e-05, + "loss": 0.8061, + "step": 10747 + }, + { + "epoch": 0.7282336201639678, + "grad_norm": 7.0098114013671875, + "learning_rate": 8.631391607912931e-05, + "loss": 0.9915, + "step": 10748 + }, + { + "epoch": 0.7283013754319398, + "grad_norm": 5.108738899230957, + "learning_rate": 8.63125470600315e-05, + "loss": 0.66, + "step": 10749 + }, + { + "epoch": 0.7283691306999119, + "grad_norm": 5.508449554443359, + "learning_rate": 8.631117804093367e-05, + "loss": 0.6366, + "step": 10750 + }, + { + "epoch": 0.728436885967884, + "grad_norm": 5.624075889587402, + "learning_rate": 8.630980902183587e-05, + "loss": 0.8407, + "step": 10751 + }, + { + "epoch": 0.7285046412358561, + "grad_norm": 6.403767108917236, + "learning_rate": 8.630844000273805e-05, + "loss": 0.7612, + "step": 10752 + }, + { + "epoch": 0.7285723965038282, + "grad_norm": 5.629929542541504, + "learning_rate": 8.630707098364023e-05, + "loss": 1.0102, + "step": 10753 + }, + { + "epoch": 0.7286401517718003, + "grad_norm": 8.822092056274414, + "learning_rate": 8.630570196454241e-05, + "loss": 0.8066, + "step": 10754 + }, + { + "epoch": 0.7287079070397724, + "grad_norm": 8.513496398925781, + "learning_rate": 8.630433294544459e-05, + "loss": 1.061, + "step": 10755 + }, + { + "epoch": 0.7287756623077444, + "grad_norm": 5.223329067230225, + "learning_rate": 8.630296392634678e-05, + "loss": 0.7784, + "step": 10756 + }, + { + "epoch": 0.7288434175757165, + "grad_norm": 7.098320960998535, + "learning_rate": 8.630159490724896e-05, + "loss": 0.9385, + "step": 10757 + }, + { + "epoch": 0.7289111728436886, + "grad_norm": 5.522270679473877, + "learning_rate": 8.630022588815114e-05, + "loss": 0.6976, + "step": 10758 + }, + { + "epoch": 0.7289789281116607, + "grad_norm": 6.411101341247559, + "learning_rate": 8.629885686905332e-05, + "loss": 0.7332, + "step": 10759 + }, + { + "epoch": 0.7290466833796327, + "grad_norm": 5.263405799865723, + "learning_rate": 8.629748784995552e-05, + "loss": 0.699, + "step": 10760 + }, + { + "epoch": 0.7291144386476048, + "grad_norm": 8.649581909179688, + "learning_rate": 8.62961188308577e-05, + "loss": 0.818, + "step": 10761 + }, + { + "epoch": 0.7291821939155769, + "grad_norm": 5.901177883148193, + "learning_rate": 8.629474981175988e-05, + "loss": 0.8351, + "step": 10762 + }, + { + "epoch": 0.729249949183549, + "grad_norm": 7.491204738616943, + "learning_rate": 8.629338079266206e-05, + "loss": 0.8642, + "step": 10763 + }, + { + "epoch": 0.7293177044515211, + "grad_norm": 7.189452171325684, + "learning_rate": 8.629201177356424e-05, + "loss": 0.8524, + "step": 10764 + }, + { + "epoch": 0.7293854597194932, + "grad_norm": 5.667553901672363, + "learning_rate": 8.629064275446643e-05, + "loss": 0.8197, + "step": 10765 + }, + { + "epoch": 0.7294532149874653, + "grad_norm": 6.584259510040283, + "learning_rate": 8.628927373536861e-05, + "loss": 0.629, + "step": 10766 + }, + { + "epoch": 0.7295209702554374, + "grad_norm": 7.937713146209717, + "learning_rate": 8.62879047162708e-05, + "loss": 0.8158, + "step": 10767 + }, + { + "epoch": 0.7295887255234095, + "grad_norm": 9.484813690185547, + "learning_rate": 8.628653569717297e-05, + "loss": 1.0614, + "step": 10768 + }, + { + "epoch": 0.7296564807913816, + "grad_norm": 5.351037502288818, + "learning_rate": 8.628516667807516e-05, + "loss": 0.6763, + "step": 10769 + }, + { + "epoch": 0.7297242360593537, + "grad_norm": 7.94450569152832, + "learning_rate": 8.628379765897735e-05, + "loss": 1.1605, + "step": 10770 + }, + { + "epoch": 0.7297919913273258, + "grad_norm": 8.101015090942383, + "learning_rate": 8.628242863987953e-05, + "loss": 0.8566, + "step": 10771 + }, + { + "epoch": 0.7298597465952977, + "grad_norm": 5.217764854431152, + "learning_rate": 8.628105962078171e-05, + "loss": 0.7402, + "step": 10772 + }, + { + "epoch": 0.7299275018632698, + "grad_norm": 6.194571495056152, + "learning_rate": 8.627969060168389e-05, + "loss": 0.7202, + "step": 10773 + }, + { + "epoch": 0.7299952571312419, + "grad_norm": 5.476653575897217, + "learning_rate": 8.627832158258608e-05, + "loss": 0.7069, + "step": 10774 + }, + { + "epoch": 0.730063012399214, + "grad_norm": 6.67211389541626, + "learning_rate": 8.627695256348826e-05, + "loss": 0.7778, + "step": 10775 + }, + { + "epoch": 0.7301307676671861, + "grad_norm": 5.744596004486084, + "learning_rate": 8.627558354439044e-05, + "loss": 0.877, + "step": 10776 + }, + { + "epoch": 0.7301985229351582, + "grad_norm": 7.225982666015625, + "learning_rate": 8.627421452529263e-05, + "loss": 0.8411, + "step": 10777 + }, + { + "epoch": 0.7302662782031303, + "grad_norm": 5.470047473907471, + "learning_rate": 8.62728455061948e-05, + "loss": 0.6319, + "step": 10778 + }, + { + "epoch": 0.7303340334711024, + "grad_norm": 6.065576553344727, + "learning_rate": 8.6271476487097e-05, + "loss": 0.744, + "step": 10779 + }, + { + "epoch": 0.7304017887390745, + "grad_norm": 6.549447536468506, + "learning_rate": 8.627010746799918e-05, + "loss": 0.7161, + "step": 10780 + }, + { + "epoch": 0.7304695440070466, + "grad_norm": 4.89664363861084, + "learning_rate": 8.626873844890136e-05, + "loss": 0.7585, + "step": 10781 + }, + { + "epoch": 0.7305372992750186, + "grad_norm": 5.463417053222656, + "learning_rate": 8.626736942980354e-05, + "loss": 0.866, + "step": 10782 + }, + { + "epoch": 0.7306050545429907, + "grad_norm": 7.537135601043701, + "learning_rate": 8.626600041070573e-05, + "loss": 0.8118, + "step": 10783 + }, + { + "epoch": 0.7306728098109628, + "grad_norm": 6.079577445983887, + "learning_rate": 8.626463139160791e-05, + "loss": 0.846, + "step": 10784 + }, + { + "epoch": 0.7307405650789349, + "grad_norm": 8.43422794342041, + "learning_rate": 8.62632623725101e-05, + "loss": 0.4532, + "step": 10785 + }, + { + "epoch": 0.730808320346907, + "grad_norm": 9.411410331726074, + "learning_rate": 8.626189335341228e-05, + "loss": 0.808, + "step": 10786 + }, + { + "epoch": 0.7308760756148791, + "grad_norm": 6.041145324707031, + "learning_rate": 8.626052433431447e-05, + "loss": 0.7452, + "step": 10787 + }, + { + "epoch": 0.7309438308828511, + "grad_norm": 5.750189304351807, + "learning_rate": 8.625915531521665e-05, + "loss": 0.5868, + "step": 10788 + }, + { + "epoch": 0.7310115861508232, + "grad_norm": 6.068814754486084, + "learning_rate": 8.625778629611883e-05, + "loss": 0.8551, + "step": 10789 + }, + { + "epoch": 0.7310793414187953, + "grad_norm": 4.875567436218262, + "learning_rate": 8.625641727702102e-05, + "loss": 0.7231, + "step": 10790 + }, + { + "epoch": 0.7311470966867674, + "grad_norm": 6.8060712814331055, + "learning_rate": 8.62550482579232e-05, + "loss": 0.7692, + "step": 10791 + }, + { + "epoch": 0.7312148519547395, + "grad_norm": 5.561190605163574, + "learning_rate": 8.625367923882538e-05, + "loss": 0.7405, + "step": 10792 + }, + { + "epoch": 0.7312826072227115, + "grad_norm": 5.9766130447387695, + "learning_rate": 8.625231021972758e-05, + "loss": 0.8625, + "step": 10793 + }, + { + "epoch": 0.7313503624906836, + "grad_norm": 7.188475131988525, + "learning_rate": 8.625094120062976e-05, + "loss": 1.0192, + "step": 10794 + }, + { + "epoch": 0.7314181177586557, + "grad_norm": 7.752885341644287, + "learning_rate": 8.624957218153194e-05, + "loss": 0.7109, + "step": 10795 + }, + { + "epoch": 0.7314858730266278, + "grad_norm": 6.262071132659912, + "learning_rate": 8.624820316243412e-05, + "loss": 0.7859, + "step": 10796 + }, + { + "epoch": 0.7315536282945999, + "grad_norm": 8.485372543334961, + "learning_rate": 8.624683414333631e-05, + "loss": 0.585, + "step": 10797 + }, + { + "epoch": 0.731621383562572, + "grad_norm": 8.141338348388672, + "learning_rate": 8.62454651242385e-05, + "loss": 1.1567, + "step": 10798 + }, + { + "epoch": 0.7316891388305441, + "grad_norm": 7.912255764007568, + "learning_rate": 8.624409610514067e-05, + "loss": 0.7, + "step": 10799 + }, + { + "epoch": 0.7317568940985162, + "grad_norm": 5.951817035675049, + "learning_rate": 8.624272708604285e-05, + "loss": 0.9155, + "step": 10800 + }, + { + "epoch": 0.7318246493664883, + "grad_norm": 6.594394683837891, + "learning_rate": 8.624135806694503e-05, + "loss": 0.9209, + "step": 10801 + }, + { + "epoch": 0.7318924046344604, + "grad_norm": 6.647227764129639, + "learning_rate": 8.623998904784723e-05, + "loss": 0.7095, + "step": 10802 + }, + { + "epoch": 0.7319601599024325, + "grad_norm": 5.953242778778076, + "learning_rate": 8.623862002874941e-05, + "loss": 0.6739, + "step": 10803 + }, + { + "epoch": 0.7320279151704046, + "grad_norm": 6.691287517547607, + "learning_rate": 8.623725100965159e-05, + "loss": 0.8772, + "step": 10804 + }, + { + "epoch": 0.7320956704383765, + "grad_norm": 8.432060241699219, + "learning_rate": 8.623588199055377e-05, + "loss": 0.676, + "step": 10805 + }, + { + "epoch": 0.7321634257063486, + "grad_norm": 7.356803894042969, + "learning_rate": 8.623451297145596e-05, + "loss": 0.866, + "step": 10806 + }, + { + "epoch": 0.7322311809743207, + "grad_norm": 6.421758651733398, + "learning_rate": 8.623314395235814e-05, + "loss": 0.9497, + "step": 10807 + }, + { + "epoch": 0.7322989362422928, + "grad_norm": 4.597375869750977, + "learning_rate": 8.623177493326032e-05, + "loss": 0.788, + "step": 10808 + }, + { + "epoch": 0.7323666915102649, + "grad_norm": 5.468592166900635, + "learning_rate": 8.62304059141625e-05, + "loss": 0.7864, + "step": 10809 + }, + { + "epoch": 0.732434446778237, + "grad_norm": 6.1158833503723145, + "learning_rate": 8.622903689506468e-05, + "loss": 0.7756, + "step": 10810 + }, + { + "epoch": 0.7325022020462091, + "grad_norm": 6.584497451782227, + "learning_rate": 8.622766787596688e-05, + "loss": 0.806, + "step": 10811 + }, + { + "epoch": 0.7325699573141812, + "grad_norm": 4.517557621002197, + "learning_rate": 8.622629885686906e-05, + "loss": 0.5618, + "step": 10812 + }, + { + "epoch": 0.7326377125821533, + "grad_norm": 9.059310913085938, + "learning_rate": 8.622492983777124e-05, + "loss": 0.8049, + "step": 10813 + }, + { + "epoch": 0.7327054678501254, + "grad_norm": 5.33046293258667, + "learning_rate": 8.622356081867342e-05, + "loss": 0.7975, + "step": 10814 + }, + { + "epoch": 0.7327732231180974, + "grad_norm": 6.890832424163818, + "learning_rate": 8.622219179957561e-05, + "loss": 0.781, + "step": 10815 + }, + { + "epoch": 0.7328409783860695, + "grad_norm": 5.804647922515869, + "learning_rate": 8.62208227804778e-05, + "loss": 0.6286, + "step": 10816 + }, + { + "epoch": 0.7329087336540416, + "grad_norm": 6.476672649383545, + "learning_rate": 8.621945376137997e-05, + "loss": 0.7815, + "step": 10817 + }, + { + "epoch": 0.7329764889220137, + "grad_norm": 6.72651481628418, + "learning_rate": 8.621808474228215e-05, + "loss": 0.9064, + "step": 10818 + }, + { + "epoch": 0.7330442441899858, + "grad_norm": 7.354333877563477, + "learning_rate": 8.621671572318433e-05, + "loss": 0.8214, + "step": 10819 + }, + { + "epoch": 0.7331119994579579, + "grad_norm": 7.4680609703063965, + "learning_rate": 8.621534670408653e-05, + "loss": 0.9942, + "step": 10820 + }, + { + "epoch": 0.7331797547259299, + "grad_norm": 6.3199968338012695, + "learning_rate": 8.621397768498871e-05, + "loss": 0.9796, + "step": 10821 + }, + { + "epoch": 0.733247509993902, + "grad_norm": 5.8508453369140625, + "learning_rate": 8.621260866589089e-05, + "loss": 0.8393, + "step": 10822 + }, + { + "epoch": 0.7333152652618741, + "grad_norm": 7.697128772735596, + "learning_rate": 8.621123964679307e-05, + "loss": 1.1706, + "step": 10823 + }, + { + "epoch": 0.7333830205298462, + "grad_norm": 6.382595539093018, + "learning_rate": 8.620987062769525e-05, + "loss": 0.9019, + "step": 10824 + }, + { + "epoch": 0.7334507757978183, + "grad_norm": 6.121464729309082, + "learning_rate": 8.620850160859744e-05, + "loss": 0.7314, + "step": 10825 + }, + { + "epoch": 0.7335185310657903, + "grad_norm": 5.412440299987793, + "learning_rate": 8.620713258949962e-05, + "loss": 0.904, + "step": 10826 + }, + { + "epoch": 0.7335862863337624, + "grad_norm": 5.761203765869141, + "learning_rate": 8.62057635704018e-05, + "loss": 0.7746, + "step": 10827 + }, + { + "epoch": 0.7336540416017345, + "grad_norm": 5.339764595031738, + "learning_rate": 8.620439455130399e-05, + "loss": 0.8274, + "step": 10828 + }, + { + "epoch": 0.7337217968697066, + "grad_norm": 4.942664623260498, + "learning_rate": 8.620302553220618e-05, + "loss": 0.7429, + "step": 10829 + }, + { + "epoch": 0.7337895521376787, + "grad_norm": 6.011295318603516, + "learning_rate": 8.620165651310836e-05, + "loss": 0.6289, + "step": 10830 + }, + { + "epoch": 0.7338573074056508, + "grad_norm": 5.268429279327393, + "learning_rate": 8.620028749401054e-05, + "loss": 0.7213, + "step": 10831 + }, + { + "epoch": 0.7339250626736229, + "grad_norm": 5.994687080383301, + "learning_rate": 8.619891847491272e-05, + "loss": 0.7625, + "step": 10832 + }, + { + "epoch": 0.733992817941595, + "grad_norm": 5.908527374267578, + "learning_rate": 8.619754945581491e-05, + "loss": 0.6557, + "step": 10833 + }, + { + "epoch": 0.7340605732095671, + "grad_norm": 6.297107696533203, + "learning_rate": 8.61961804367171e-05, + "loss": 0.7886, + "step": 10834 + }, + { + "epoch": 0.7341283284775392, + "grad_norm": 7.262679576873779, + "learning_rate": 8.619481141761927e-05, + "loss": 0.703, + "step": 10835 + }, + { + "epoch": 0.7341960837455113, + "grad_norm": 6.179491996765137, + "learning_rate": 8.619344239852147e-05, + "loss": 0.646, + "step": 10836 + }, + { + "epoch": 0.7342638390134832, + "grad_norm": 5.197315216064453, + "learning_rate": 8.619207337942365e-05, + "loss": 0.8495, + "step": 10837 + }, + { + "epoch": 0.7343315942814553, + "grad_norm": 5.623149394989014, + "learning_rate": 8.619070436032583e-05, + "loss": 0.7513, + "step": 10838 + }, + { + "epoch": 0.7343993495494274, + "grad_norm": 6.872591018676758, + "learning_rate": 8.618933534122802e-05, + "loss": 0.756, + "step": 10839 + }, + { + "epoch": 0.7344671048173995, + "grad_norm": 6.851233005523682, + "learning_rate": 8.61879663221302e-05, + "loss": 0.9182, + "step": 10840 + }, + { + "epoch": 0.7345348600853716, + "grad_norm": 6.495956897735596, + "learning_rate": 8.618659730303238e-05, + "loss": 0.9298, + "step": 10841 + }, + { + "epoch": 0.7346026153533437, + "grad_norm": 5.927680492401123, + "learning_rate": 8.618522828393456e-05, + "loss": 0.7563, + "step": 10842 + }, + { + "epoch": 0.7346703706213158, + "grad_norm": 6.39896821975708, + "learning_rate": 8.618385926483676e-05, + "loss": 0.993, + "step": 10843 + }, + { + "epoch": 0.7347381258892879, + "grad_norm": 5.756770133972168, + "learning_rate": 8.618249024573894e-05, + "loss": 0.6887, + "step": 10844 + }, + { + "epoch": 0.73480588115726, + "grad_norm": 4.987828731536865, + "learning_rate": 8.618112122664112e-05, + "loss": 0.5908, + "step": 10845 + }, + { + "epoch": 0.7348736364252321, + "grad_norm": 7.487382888793945, + "learning_rate": 8.61797522075433e-05, + "loss": 0.6574, + "step": 10846 + }, + { + "epoch": 0.7349413916932042, + "grad_norm": 5.079615592956543, + "learning_rate": 8.617838318844549e-05, + "loss": 0.9038, + "step": 10847 + }, + { + "epoch": 0.7350091469611763, + "grad_norm": 7.286651134490967, + "learning_rate": 8.617701416934767e-05, + "loss": 0.9635, + "step": 10848 + }, + { + "epoch": 0.7350769022291483, + "grad_norm": 6.60608434677124, + "learning_rate": 8.617564515024985e-05, + "loss": 0.8811, + "step": 10849 + }, + { + "epoch": 0.7351446574971204, + "grad_norm": 5.821297645568848, + "learning_rate": 8.617427613115203e-05, + "loss": 0.6802, + "step": 10850 + }, + { + "epoch": 0.7352124127650925, + "grad_norm": 5.659653663635254, + "learning_rate": 8.617290711205421e-05, + "loss": 0.9145, + "step": 10851 + }, + { + "epoch": 0.7352801680330646, + "grad_norm": 6.8696980476379395, + "learning_rate": 8.617153809295641e-05, + "loss": 1.0301, + "step": 10852 + }, + { + "epoch": 0.7353479233010367, + "grad_norm": 5.688724040985107, + "learning_rate": 8.617016907385859e-05, + "loss": 0.7185, + "step": 10853 + }, + { + "epoch": 0.7354156785690087, + "grad_norm": 6.016847610473633, + "learning_rate": 8.616880005476077e-05, + "loss": 0.7597, + "step": 10854 + }, + { + "epoch": 0.7354834338369808, + "grad_norm": 6.0264692306518555, + "learning_rate": 8.616743103566295e-05, + "loss": 0.6665, + "step": 10855 + }, + { + "epoch": 0.7355511891049529, + "grad_norm": 5.6370415687561035, + "learning_rate": 8.616606201656513e-05, + "loss": 0.7088, + "step": 10856 + }, + { + "epoch": 0.735618944372925, + "grad_norm": 6.686164379119873, + "learning_rate": 8.616469299746732e-05, + "loss": 0.7139, + "step": 10857 + }, + { + "epoch": 0.7356866996408971, + "grad_norm": 6.4555134773254395, + "learning_rate": 8.61633239783695e-05, + "loss": 0.7969, + "step": 10858 + }, + { + "epoch": 0.7357544549088691, + "grad_norm": 5.4364752769470215, + "learning_rate": 8.616195495927168e-05, + "loss": 0.6922, + "step": 10859 + }, + { + "epoch": 0.7358222101768412, + "grad_norm": 7.089079856872559, + "learning_rate": 8.616058594017386e-05, + "loss": 1.0301, + "step": 10860 + }, + { + "epoch": 0.7358899654448133, + "grad_norm": 5.888257026672363, + "learning_rate": 8.615921692107606e-05, + "loss": 0.5942, + "step": 10861 + }, + { + "epoch": 0.7359577207127854, + "grad_norm": 5.762383460998535, + "learning_rate": 8.615784790197824e-05, + "loss": 0.6695, + "step": 10862 + }, + { + "epoch": 0.7360254759807575, + "grad_norm": 5.797264575958252, + "learning_rate": 8.615647888288042e-05, + "loss": 0.705, + "step": 10863 + }, + { + "epoch": 0.7360932312487296, + "grad_norm": 4.282798767089844, + "learning_rate": 8.61551098637826e-05, + "loss": 0.6221, + "step": 10864 + }, + { + "epoch": 0.7361609865167017, + "grad_norm": 5.539671897888184, + "learning_rate": 8.615374084468478e-05, + "loss": 0.739, + "step": 10865 + }, + { + "epoch": 0.7362287417846738, + "grad_norm": 5.675551891326904, + "learning_rate": 8.615237182558697e-05, + "loss": 0.9089, + "step": 10866 + }, + { + "epoch": 0.7362964970526459, + "grad_norm": 5.868411064147949, + "learning_rate": 8.615100280648915e-05, + "loss": 0.8312, + "step": 10867 + }, + { + "epoch": 0.736364252320618, + "grad_norm": 5.487252712249756, + "learning_rate": 8.614963378739133e-05, + "loss": 1.0099, + "step": 10868 + }, + { + "epoch": 0.7364320075885901, + "grad_norm": 6.392849445343018, + "learning_rate": 8.614826476829351e-05, + "loss": 0.6691, + "step": 10869 + }, + { + "epoch": 0.736499762856562, + "grad_norm": 6.58278226852417, + "learning_rate": 8.614689574919571e-05, + "loss": 0.8738, + "step": 10870 + }, + { + "epoch": 0.7365675181245341, + "grad_norm": 8.645120620727539, + "learning_rate": 8.614552673009789e-05, + "loss": 0.7185, + "step": 10871 + }, + { + "epoch": 0.7366352733925062, + "grad_norm": 4.876821041107178, + "learning_rate": 8.614415771100007e-05, + "loss": 0.6354, + "step": 10872 + }, + { + "epoch": 0.7367030286604783, + "grad_norm": 5.1632232666015625, + "learning_rate": 8.614278869190225e-05, + "loss": 0.6593, + "step": 10873 + }, + { + "epoch": 0.7367707839284504, + "grad_norm": 7.005191326141357, + "learning_rate": 8.614141967280443e-05, + "loss": 0.9814, + "step": 10874 + }, + { + "epoch": 0.7368385391964225, + "grad_norm": 5.898367881774902, + "learning_rate": 8.614005065370662e-05, + "loss": 0.7328, + "step": 10875 + }, + { + "epoch": 0.7369062944643946, + "grad_norm": 6.076502799987793, + "learning_rate": 8.61386816346088e-05, + "loss": 0.8477, + "step": 10876 + }, + { + "epoch": 0.7369740497323667, + "grad_norm": 7.934567928314209, + "learning_rate": 8.613731261551098e-05, + "loss": 0.7914, + "step": 10877 + }, + { + "epoch": 0.7370418050003388, + "grad_norm": 6.465484619140625, + "learning_rate": 8.613594359641316e-05, + "loss": 0.768, + "step": 10878 + }, + { + "epoch": 0.7371095602683109, + "grad_norm": 5.772295951843262, + "learning_rate": 8.613457457731536e-05, + "loss": 0.7722, + "step": 10879 + }, + { + "epoch": 0.737177315536283, + "grad_norm": 5.704565525054932, + "learning_rate": 8.613320555821754e-05, + "loss": 0.6609, + "step": 10880 + }, + { + "epoch": 0.737245070804255, + "grad_norm": 6.054666042327881, + "learning_rate": 8.613183653911972e-05, + "loss": 0.8513, + "step": 10881 + }, + { + "epoch": 0.7373128260722271, + "grad_norm": 6.808941841125488, + "learning_rate": 8.613046752002191e-05, + "loss": 0.8188, + "step": 10882 + }, + { + "epoch": 0.7373805813401992, + "grad_norm": 5.95088005065918, + "learning_rate": 8.61290985009241e-05, + "loss": 0.8952, + "step": 10883 + }, + { + "epoch": 0.7374483366081713, + "grad_norm": 6.34380578994751, + "learning_rate": 8.612772948182627e-05, + "loss": 0.9835, + "step": 10884 + }, + { + "epoch": 0.7375160918761434, + "grad_norm": 6.720582008361816, + "learning_rate": 8.612636046272847e-05, + "loss": 1.0013, + "step": 10885 + }, + { + "epoch": 0.7375838471441154, + "grad_norm": 7.400576591491699, + "learning_rate": 8.612499144363065e-05, + "loss": 0.906, + "step": 10886 + }, + { + "epoch": 0.7376516024120875, + "grad_norm": 7.385793209075928, + "learning_rate": 8.612362242453283e-05, + "loss": 0.9448, + "step": 10887 + }, + { + "epoch": 0.7377193576800596, + "grad_norm": 4.82860803604126, + "learning_rate": 8.612225340543501e-05, + "loss": 0.724, + "step": 10888 + }, + { + "epoch": 0.7377871129480317, + "grad_norm": 7.288435935974121, + "learning_rate": 8.61208843863372e-05, + "loss": 1.0472, + "step": 10889 + }, + { + "epoch": 0.7378548682160038, + "grad_norm": 5.416788578033447, + "learning_rate": 8.611951536723938e-05, + "loss": 0.6952, + "step": 10890 + }, + { + "epoch": 0.7379226234839759, + "grad_norm": 6.170418739318848, + "learning_rate": 8.611814634814156e-05, + "loss": 0.8321, + "step": 10891 + }, + { + "epoch": 0.737990378751948, + "grad_norm": 6.4705119132995605, + "learning_rate": 8.611677732904374e-05, + "loss": 0.9207, + "step": 10892 + }, + { + "epoch": 0.73805813401992, + "grad_norm": 5.7486138343811035, + "learning_rate": 8.611540830994594e-05, + "loss": 0.8527, + "step": 10893 + }, + { + "epoch": 0.7381258892878921, + "grad_norm": 6.1065568923950195, + "learning_rate": 8.611403929084812e-05, + "loss": 0.7026, + "step": 10894 + }, + { + "epoch": 0.7381936445558642, + "grad_norm": 5.706049919128418, + "learning_rate": 8.61126702717503e-05, + "loss": 0.8366, + "step": 10895 + }, + { + "epoch": 0.7382613998238363, + "grad_norm": 4.653761863708496, + "learning_rate": 8.611130125265248e-05, + "loss": 0.538, + "step": 10896 + }, + { + "epoch": 0.7383291550918084, + "grad_norm": 7.086725234985352, + "learning_rate": 8.610993223355466e-05, + "loss": 0.906, + "step": 10897 + }, + { + "epoch": 0.7383969103597805, + "grad_norm": 8.890185356140137, + "learning_rate": 8.610856321445685e-05, + "loss": 1.0786, + "step": 10898 + }, + { + "epoch": 0.7384646656277526, + "grad_norm": 6.072719097137451, + "learning_rate": 8.610719419535903e-05, + "loss": 0.8558, + "step": 10899 + }, + { + "epoch": 0.7385324208957247, + "grad_norm": 6.266420364379883, + "learning_rate": 8.610582517626121e-05, + "loss": 0.9337, + "step": 10900 + }, + { + "epoch": 0.7386001761636968, + "grad_norm": 5.14294958114624, + "learning_rate": 8.61044561571634e-05, + "loss": 0.5262, + "step": 10901 + }, + { + "epoch": 0.7386679314316689, + "grad_norm": 8.386168479919434, + "learning_rate": 8.610308713806559e-05, + "loss": 1.3214, + "step": 10902 + }, + { + "epoch": 0.7387356866996408, + "grad_norm": 4.496156692504883, + "learning_rate": 8.610171811896777e-05, + "loss": 0.81, + "step": 10903 + }, + { + "epoch": 0.7388034419676129, + "grad_norm": 8.191973686218262, + "learning_rate": 8.610034909986995e-05, + "loss": 0.7648, + "step": 10904 + }, + { + "epoch": 0.738871197235585, + "grad_norm": 6.122156620025635, + "learning_rate": 8.609898008077213e-05, + "loss": 0.8471, + "step": 10905 + }, + { + "epoch": 0.7389389525035571, + "grad_norm": 6.742517471313477, + "learning_rate": 8.609761106167431e-05, + "loss": 0.8456, + "step": 10906 + }, + { + "epoch": 0.7390067077715292, + "grad_norm": 4.137988090515137, + "learning_rate": 8.60962420425765e-05, + "loss": 0.7035, + "step": 10907 + }, + { + "epoch": 0.7390744630395013, + "grad_norm": 6.304561138153076, + "learning_rate": 8.609487302347868e-05, + "loss": 0.8491, + "step": 10908 + }, + { + "epoch": 0.7391422183074734, + "grad_norm": 7.52929162979126, + "learning_rate": 8.609350400438086e-05, + "loss": 0.6814, + "step": 10909 + }, + { + "epoch": 0.7392099735754455, + "grad_norm": 5.229283809661865, + "learning_rate": 8.609213498528304e-05, + "loss": 0.8091, + "step": 10910 + }, + { + "epoch": 0.7392777288434176, + "grad_norm": 7.810683250427246, + "learning_rate": 8.609076596618522e-05, + "loss": 1.0311, + "step": 10911 + }, + { + "epoch": 0.7393454841113897, + "grad_norm": 4.896294116973877, + "learning_rate": 8.608939694708742e-05, + "loss": 0.7789, + "step": 10912 + }, + { + "epoch": 0.7394132393793618, + "grad_norm": 5.661660194396973, + "learning_rate": 8.60880279279896e-05, + "loss": 0.8267, + "step": 10913 + }, + { + "epoch": 0.7394809946473339, + "grad_norm": 6.691354274749756, + "learning_rate": 8.608665890889178e-05, + "loss": 0.9455, + "step": 10914 + }, + { + "epoch": 0.739548749915306, + "grad_norm": 4.900321960449219, + "learning_rate": 8.608528988979396e-05, + "loss": 0.7102, + "step": 10915 + }, + { + "epoch": 0.739616505183278, + "grad_norm": 6.105568885803223, + "learning_rate": 8.608392087069615e-05, + "loss": 0.825, + "step": 10916 + }, + { + "epoch": 0.7396842604512501, + "grad_norm": 6.132299900054932, + "learning_rate": 8.608255185159833e-05, + "loss": 0.754, + "step": 10917 + }, + { + "epoch": 0.7397520157192222, + "grad_norm": 6.12472677230835, + "learning_rate": 8.608118283250051e-05, + "loss": 0.9008, + "step": 10918 + }, + { + "epoch": 0.7398197709871942, + "grad_norm": 6.3657002449035645, + "learning_rate": 8.60798138134027e-05, + "loss": 1.0276, + "step": 10919 + }, + { + "epoch": 0.7398875262551663, + "grad_norm": 5.224886894226074, + "learning_rate": 8.607844479430487e-05, + "loss": 0.8636, + "step": 10920 + }, + { + "epoch": 0.7399552815231384, + "grad_norm": 12.790693283081055, + "learning_rate": 8.607707577520707e-05, + "loss": 0.9617, + "step": 10921 + }, + { + "epoch": 0.7400230367911105, + "grad_norm": 5.787003040313721, + "learning_rate": 8.607570675610925e-05, + "loss": 0.6847, + "step": 10922 + }, + { + "epoch": 0.7400907920590826, + "grad_norm": 5.556890964508057, + "learning_rate": 8.607433773701143e-05, + "loss": 0.7855, + "step": 10923 + }, + { + "epoch": 0.7401585473270547, + "grad_norm": 4.655983924865723, + "learning_rate": 8.607296871791361e-05, + "loss": 0.7021, + "step": 10924 + }, + { + "epoch": 0.7402263025950268, + "grad_norm": 6.317244052886963, + "learning_rate": 8.60715996988158e-05, + "loss": 0.7709, + "step": 10925 + }, + { + "epoch": 0.7402940578629988, + "grad_norm": 5.975430965423584, + "learning_rate": 8.607023067971798e-05, + "loss": 0.8145, + "step": 10926 + }, + { + "epoch": 0.7403618131309709, + "grad_norm": 5.794954776763916, + "learning_rate": 8.606886166062016e-05, + "loss": 0.8564, + "step": 10927 + }, + { + "epoch": 0.740429568398943, + "grad_norm": 5.966963291168213, + "learning_rate": 8.606749264152236e-05, + "loss": 0.7833, + "step": 10928 + }, + { + "epoch": 0.7404973236669151, + "grad_norm": 8.969901084899902, + "learning_rate": 8.606612362242454e-05, + "loss": 0.9768, + "step": 10929 + }, + { + "epoch": 0.7405650789348872, + "grad_norm": 7.2957353591918945, + "learning_rate": 8.606475460332672e-05, + "loss": 1.0157, + "step": 10930 + }, + { + "epoch": 0.7406328342028593, + "grad_norm": 7.000049591064453, + "learning_rate": 8.606338558422891e-05, + "loss": 0.7907, + "step": 10931 + }, + { + "epoch": 0.7407005894708314, + "grad_norm": 7.120476722717285, + "learning_rate": 8.606201656513109e-05, + "loss": 0.7708, + "step": 10932 + }, + { + "epoch": 0.7407683447388035, + "grad_norm": 6.740073204040527, + "learning_rate": 8.606064754603327e-05, + "loss": 0.8841, + "step": 10933 + }, + { + "epoch": 0.7408361000067756, + "grad_norm": 4.036641597747803, + "learning_rate": 8.605927852693545e-05, + "loss": 0.6445, + "step": 10934 + }, + { + "epoch": 0.7409038552747476, + "grad_norm": 6.200973033905029, + "learning_rate": 8.605790950783765e-05, + "loss": 0.6976, + "step": 10935 + }, + { + "epoch": 0.7409716105427196, + "grad_norm": 6.047748565673828, + "learning_rate": 8.605654048873983e-05, + "loss": 0.8174, + "step": 10936 + }, + { + "epoch": 0.7410393658106917, + "grad_norm": 6.8715081214904785, + "learning_rate": 8.605517146964201e-05, + "loss": 0.9817, + "step": 10937 + }, + { + "epoch": 0.7411071210786638, + "grad_norm": 5.555269241333008, + "learning_rate": 8.605380245054419e-05, + "loss": 0.6318, + "step": 10938 + }, + { + "epoch": 0.7411748763466359, + "grad_norm": 8.042158126831055, + "learning_rate": 8.605243343144638e-05, + "loss": 1.2133, + "step": 10939 + }, + { + "epoch": 0.741242631614608, + "grad_norm": 6.014042854309082, + "learning_rate": 8.605106441234856e-05, + "loss": 0.7934, + "step": 10940 + }, + { + "epoch": 0.7413103868825801, + "grad_norm": 5.317357063293457, + "learning_rate": 8.604969539325074e-05, + "loss": 0.7135, + "step": 10941 + }, + { + "epoch": 0.7413781421505522, + "grad_norm": 5.755384922027588, + "learning_rate": 8.604832637415292e-05, + "loss": 0.4912, + "step": 10942 + }, + { + "epoch": 0.7414458974185243, + "grad_norm": 5.812741279602051, + "learning_rate": 8.60469573550551e-05, + "loss": 1.029, + "step": 10943 + }, + { + "epoch": 0.7415136526864964, + "grad_norm": 6.497066020965576, + "learning_rate": 8.60455883359573e-05, + "loss": 0.8707, + "step": 10944 + }, + { + "epoch": 0.7415814079544685, + "grad_norm": 5.14995002746582, + "learning_rate": 8.604421931685948e-05, + "loss": 0.6412, + "step": 10945 + }, + { + "epoch": 0.7416491632224406, + "grad_norm": 6.122746467590332, + "learning_rate": 8.604285029776166e-05, + "loss": 0.7705, + "step": 10946 + }, + { + "epoch": 0.7417169184904127, + "grad_norm": 5.844207763671875, + "learning_rate": 8.604148127866384e-05, + "loss": 0.9171, + "step": 10947 + }, + { + "epoch": 0.7417846737583847, + "grad_norm": 4.956113815307617, + "learning_rate": 8.604011225956603e-05, + "loss": 0.6854, + "step": 10948 + }, + { + "epoch": 0.7418524290263568, + "grad_norm": 6.357004165649414, + "learning_rate": 8.603874324046821e-05, + "loss": 0.9097, + "step": 10949 + }, + { + "epoch": 0.7419201842943289, + "grad_norm": 8.953754425048828, + "learning_rate": 8.60373742213704e-05, + "loss": 0.7142, + "step": 10950 + }, + { + "epoch": 0.741987939562301, + "grad_norm": 6.2500691413879395, + "learning_rate": 8.603600520227257e-05, + "loss": 0.5626, + "step": 10951 + }, + { + "epoch": 0.742055694830273, + "grad_norm": 6.902349472045898, + "learning_rate": 8.603463618317475e-05, + "loss": 1.0085, + "step": 10952 + }, + { + "epoch": 0.7421234500982451, + "grad_norm": 5.850625991821289, + "learning_rate": 8.603326716407695e-05, + "loss": 0.691, + "step": 10953 + }, + { + "epoch": 0.7421912053662172, + "grad_norm": 6.940263748168945, + "learning_rate": 8.603189814497913e-05, + "loss": 0.9006, + "step": 10954 + }, + { + "epoch": 0.7422589606341893, + "grad_norm": 6.827815532684326, + "learning_rate": 8.603052912588131e-05, + "loss": 0.7597, + "step": 10955 + }, + { + "epoch": 0.7423267159021614, + "grad_norm": 7.243155479431152, + "learning_rate": 8.602916010678349e-05, + "loss": 0.7973, + "step": 10956 + }, + { + "epoch": 0.7423944711701335, + "grad_norm": 5.524760723114014, + "learning_rate": 8.602779108768567e-05, + "loss": 0.8164, + "step": 10957 + }, + { + "epoch": 0.7424622264381056, + "grad_norm": 6.662420749664307, + "learning_rate": 8.602642206858786e-05, + "loss": 0.6328, + "step": 10958 + }, + { + "epoch": 0.7425299817060776, + "grad_norm": 8.47179889678955, + "learning_rate": 8.602505304949004e-05, + "loss": 1.1036, + "step": 10959 + }, + { + "epoch": 0.7425977369740497, + "grad_norm": 6.377445220947266, + "learning_rate": 8.602368403039222e-05, + "loss": 0.7944, + "step": 10960 + }, + { + "epoch": 0.7426654922420218, + "grad_norm": 6.111810207366943, + "learning_rate": 8.60223150112944e-05, + "loss": 1.008, + "step": 10961 + }, + { + "epoch": 0.7427332475099939, + "grad_norm": 3.763913154602051, + "learning_rate": 8.60209459921966e-05, + "loss": 0.6067, + "step": 10962 + }, + { + "epoch": 0.742801002777966, + "grad_norm": 7.272477626800537, + "learning_rate": 8.601957697309878e-05, + "loss": 0.813, + "step": 10963 + }, + { + "epoch": 0.7428687580459381, + "grad_norm": 6.8635406494140625, + "learning_rate": 8.601820795400096e-05, + "loss": 0.6991, + "step": 10964 + }, + { + "epoch": 0.7429365133139102, + "grad_norm": 5.985293388366699, + "learning_rate": 8.601683893490314e-05, + "loss": 0.8404, + "step": 10965 + }, + { + "epoch": 0.7430042685818823, + "grad_norm": 7.298139572143555, + "learning_rate": 8.601546991580532e-05, + "loss": 0.7766, + "step": 10966 + }, + { + "epoch": 0.7430720238498544, + "grad_norm": 6.489261150360107, + "learning_rate": 8.601410089670751e-05, + "loss": 0.8351, + "step": 10967 + }, + { + "epoch": 0.7431397791178264, + "grad_norm": 6.494144439697266, + "learning_rate": 8.60127318776097e-05, + "loss": 0.7456, + "step": 10968 + }, + { + "epoch": 0.7432075343857985, + "grad_norm": 7.2324090003967285, + "learning_rate": 8.601136285851187e-05, + "loss": 0.7192, + "step": 10969 + }, + { + "epoch": 0.7432752896537705, + "grad_norm": 5.349514007568359, + "learning_rate": 8.600999383941405e-05, + "loss": 0.8994, + "step": 10970 + }, + { + "epoch": 0.7433430449217426, + "grad_norm": 5.59208869934082, + "learning_rate": 8.600862482031625e-05, + "loss": 0.6534, + "step": 10971 + }, + { + "epoch": 0.7434108001897147, + "grad_norm": 6.849013328552246, + "learning_rate": 8.600725580121843e-05, + "loss": 0.7908, + "step": 10972 + }, + { + "epoch": 0.7434785554576868, + "grad_norm": 7.590834617614746, + "learning_rate": 8.600588678212061e-05, + "loss": 0.7773, + "step": 10973 + }, + { + "epoch": 0.7435463107256589, + "grad_norm": 5.68290901184082, + "learning_rate": 8.60045177630228e-05, + "loss": 0.9227, + "step": 10974 + }, + { + "epoch": 0.743614065993631, + "grad_norm": 7.747159004211426, + "learning_rate": 8.600314874392498e-05, + "loss": 0.8515, + "step": 10975 + }, + { + "epoch": 0.7436818212616031, + "grad_norm": 6.546794891357422, + "learning_rate": 8.600177972482716e-05, + "loss": 0.7743, + "step": 10976 + }, + { + "epoch": 0.7437495765295752, + "grad_norm": 6.478428363800049, + "learning_rate": 8.600041070572936e-05, + "loss": 0.844, + "step": 10977 + }, + { + "epoch": 0.7438173317975473, + "grad_norm": 6.902507781982422, + "learning_rate": 8.599904168663154e-05, + "loss": 0.9503, + "step": 10978 + }, + { + "epoch": 0.7438850870655194, + "grad_norm": 5.328883171081543, + "learning_rate": 8.599767266753372e-05, + "loss": 0.9925, + "step": 10979 + }, + { + "epoch": 0.7439528423334915, + "grad_norm": 7.376800537109375, + "learning_rate": 8.599630364843591e-05, + "loss": 1.0425, + "step": 10980 + }, + { + "epoch": 0.7440205976014636, + "grad_norm": 5.4552693367004395, + "learning_rate": 8.599493462933809e-05, + "loss": 0.7362, + "step": 10981 + }, + { + "epoch": 0.7440883528694356, + "grad_norm": 6.60410213470459, + "learning_rate": 8.599356561024027e-05, + "loss": 0.5882, + "step": 10982 + }, + { + "epoch": 0.7441561081374077, + "grad_norm": 5.760132789611816, + "learning_rate": 8.599219659114245e-05, + "loss": 0.8001, + "step": 10983 + }, + { + "epoch": 0.7442238634053797, + "grad_norm": 6.064749717712402, + "learning_rate": 8.599082757204463e-05, + "loss": 0.6922, + "step": 10984 + }, + { + "epoch": 0.7442916186733518, + "grad_norm": 5.042727470397949, + "learning_rate": 8.598945855294683e-05, + "loss": 0.7558, + "step": 10985 + }, + { + "epoch": 0.7443593739413239, + "grad_norm": 5.222632884979248, + "learning_rate": 8.598808953384901e-05, + "loss": 0.6729, + "step": 10986 + }, + { + "epoch": 0.744427129209296, + "grad_norm": 6.379947185516357, + "learning_rate": 8.598672051475119e-05, + "loss": 0.995, + "step": 10987 + }, + { + "epoch": 0.7444948844772681, + "grad_norm": 8.097314834594727, + "learning_rate": 8.598535149565337e-05, + "loss": 0.9257, + "step": 10988 + }, + { + "epoch": 0.7445626397452402, + "grad_norm": 7.161442279815674, + "learning_rate": 8.598398247655555e-05, + "loss": 0.9308, + "step": 10989 + }, + { + "epoch": 0.7446303950132123, + "grad_norm": 8.118345260620117, + "learning_rate": 8.598261345745774e-05, + "loss": 0.7901, + "step": 10990 + }, + { + "epoch": 0.7446981502811844, + "grad_norm": 5.9762797355651855, + "learning_rate": 8.598124443835992e-05, + "loss": 0.7084, + "step": 10991 + }, + { + "epoch": 0.7447659055491564, + "grad_norm": 5.742055416107178, + "learning_rate": 8.59798754192621e-05, + "loss": 0.6301, + "step": 10992 + }, + { + "epoch": 0.7448336608171285, + "grad_norm": 5.871792793273926, + "learning_rate": 8.597850640016428e-05, + "loss": 0.6938, + "step": 10993 + }, + { + "epoch": 0.7449014160851006, + "grad_norm": 6.759030342102051, + "learning_rate": 8.597713738106648e-05, + "loss": 0.8002, + "step": 10994 + }, + { + "epoch": 0.7449691713530727, + "grad_norm": 7.746789455413818, + "learning_rate": 8.597576836196866e-05, + "loss": 1.0539, + "step": 10995 + }, + { + "epoch": 0.7450369266210448, + "grad_norm": 8.104504585266113, + "learning_rate": 8.597439934287084e-05, + "loss": 0.8205, + "step": 10996 + }, + { + "epoch": 0.7451046818890169, + "grad_norm": 5.84213399887085, + "learning_rate": 8.597303032377302e-05, + "loss": 1.0039, + "step": 10997 + }, + { + "epoch": 0.745172437156989, + "grad_norm": 6.890494346618652, + "learning_rate": 8.59716613046752e-05, + "loss": 1.1614, + "step": 10998 + }, + { + "epoch": 0.7452401924249611, + "grad_norm": 5.201790809631348, + "learning_rate": 8.597029228557739e-05, + "loss": 0.6836, + "step": 10999 + }, + { + "epoch": 0.7453079476929331, + "grad_norm": 5.523726463317871, + "learning_rate": 8.596892326647957e-05, + "loss": 0.9281, + "step": 11000 + }, + { + "epoch": 0.7453757029609052, + "grad_norm": 6.218978404998779, + "learning_rate": 8.596755424738175e-05, + "loss": 0.8996, + "step": 11001 + }, + { + "epoch": 0.7454434582288773, + "grad_norm": 4.860278606414795, + "learning_rate": 8.596618522828393e-05, + "loss": 0.8022, + "step": 11002 + }, + { + "epoch": 0.7455112134968493, + "grad_norm": 6.433527946472168, + "learning_rate": 8.596481620918613e-05, + "loss": 0.8649, + "step": 11003 + }, + { + "epoch": 0.7455789687648214, + "grad_norm": 6.700179100036621, + "learning_rate": 8.596344719008831e-05, + "loss": 0.8124, + "step": 11004 + }, + { + "epoch": 0.7456467240327935, + "grad_norm": 6.234446048736572, + "learning_rate": 8.596207817099049e-05, + "loss": 0.8406, + "step": 11005 + }, + { + "epoch": 0.7457144793007656, + "grad_norm": 6.376819610595703, + "learning_rate": 8.596070915189267e-05, + "loss": 0.7509, + "step": 11006 + }, + { + "epoch": 0.7457822345687377, + "grad_norm": 5.808053970336914, + "learning_rate": 8.595934013279485e-05, + "loss": 0.864, + "step": 11007 + }, + { + "epoch": 0.7458499898367098, + "grad_norm": 6.195199012756348, + "learning_rate": 8.595797111369704e-05, + "loss": 0.9481, + "step": 11008 + }, + { + "epoch": 0.7459177451046819, + "grad_norm": 5.600381374359131, + "learning_rate": 8.595660209459922e-05, + "loss": 0.8916, + "step": 11009 + }, + { + "epoch": 0.745985500372654, + "grad_norm": 5.866032600402832, + "learning_rate": 8.59552330755014e-05, + "loss": 0.813, + "step": 11010 + }, + { + "epoch": 0.7460532556406261, + "grad_norm": 7.881616592407227, + "learning_rate": 8.595386405640358e-05, + "loss": 0.7804, + "step": 11011 + }, + { + "epoch": 0.7461210109085982, + "grad_norm": 6.006317138671875, + "learning_rate": 8.595249503730576e-05, + "loss": 0.9739, + "step": 11012 + }, + { + "epoch": 0.7461887661765703, + "grad_norm": 5.69516658782959, + "learning_rate": 8.595112601820796e-05, + "loss": 0.876, + "step": 11013 + }, + { + "epoch": 0.7462565214445424, + "grad_norm": 6.4250006675720215, + "learning_rate": 8.594975699911014e-05, + "loss": 1.126, + "step": 11014 + }, + { + "epoch": 0.7463242767125144, + "grad_norm": 4.753213405609131, + "learning_rate": 8.594838798001232e-05, + "loss": 0.9536, + "step": 11015 + }, + { + "epoch": 0.7463920319804865, + "grad_norm": 6.818500995635986, + "learning_rate": 8.59470189609145e-05, + "loss": 0.8902, + "step": 11016 + }, + { + "epoch": 0.7464597872484585, + "grad_norm": 5.300811290740967, + "learning_rate": 8.594564994181669e-05, + "loss": 0.7823, + "step": 11017 + }, + { + "epoch": 0.7465275425164306, + "grad_norm": 7.0292582511901855, + "learning_rate": 8.594428092271887e-05, + "loss": 0.733, + "step": 11018 + }, + { + "epoch": 0.7465952977844027, + "grad_norm": 6.624716281890869, + "learning_rate": 8.594291190362105e-05, + "loss": 0.8518, + "step": 11019 + }, + { + "epoch": 0.7466630530523748, + "grad_norm": 6.195197105407715, + "learning_rate": 8.594154288452323e-05, + "loss": 0.9108, + "step": 11020 + }, + { + "epoch": 0.7467308083203469, + "grad_norm": 5.729263782501221, + "learning_rate": 8.594017386542543e-05, + "loss": 0.8028, + "step": 11021 + }, + { + "epoch": 0.746798563588319, + "grad_norm": 5.285633563995361, + "learning_rate": 8.593880484632761e-05, + "loss": 0.765, + "step": 11022 + }, + { + "epoch": 0.7468663188562911, + "grad_norm": 7.621737480163574, + "learning_rate": 8.593743582722979e-05, + "loss": 0.811, + "step": 11023 + }, + { + "epoch": 0.7469340741242632, + "grad_norm": 6.321857929229736, + "learning_rate": 8.593606680813198e-05, + "loss": 0.8962, + "step": 11024 + }, + { + "epoch": 0.7470018293922353, + "grad_norm": 8.122148513793945, + "learning_rate": 8.593469778903416e-05, + "loss": 0.9326, + "step": 11025 + }, + { + "epoch": 0.7470695846602073, + "grad_norm": 8.037406921386719, + "learning_rate": 8.593332876993634e-05, + "loss": 0.7504, + "step": 11026 + }, + { + "epoch": 0.7471373399281794, + "grad_norm": 6.311530590057373, + "learning_rate": 8.593195975083854e-05, + "loss": 0.8079, + "step": 11027 + }, + { + "epoch": 0.7472050951961515, + "grad_norm": 6.504730224609375, + "learning_rate": 8.593059073174072e-05, + "loss": 0.8561, + "step": 11028 + }, + { + "epoch": 0.7472728504641236, + "grad_norm": 6.117341995239258, + "learning_rate": 8.59292217126429e-05, + "loss": 1.0076, + "step": 11029 + }, + { + "epoch": 0.7473406057320957, + "grad_norm": 5.374571800231934, + "learning_rate": 8.592785269354508e-05, + "loss": 0.763, + "step": 11030 + }, + { + "epoch": 0.7474083610000678, + "grad_norm": 5.502954006195068, + "learning_rate": 8.592648367444727e-05, + "loss": 0.7479, + "step": 11031 + }, + { + "epoch": 0.7474761162680399, + "grad_norm": 8.302000045776367, + "learning_rate": 8.592511465534945e-05, + "loss": 0.9581, + "step": 11032 + }, + { + "epoch": 0.7475438715360119, + "grad_norm": 5.1271748542785645, + "learning_rate": 8.592374563625163e-05, + "loss": 0.8742, + "step": 11033 + }, + { + "epoch": 0.747611626803984, + "grad_norm": 7.070951461791992, + "learning_rate": 8.592237661715381e-05, + "loss": 0.8405, + "step": 11034 + }, + { + "epoch": 0.7476793820719561, + "grad_norm": 6.252740383148193, + "learning_rate": 8.592100759805601e-05, + "loss": 0.9714, + "step": 11035 + }, + { + "epoch": 0.7477471373399281, + "grad_norm": 5.273550033569336, + "learning_rate": 8.591963857895819e-05, + "loss": 0.847, + "step": 11036 + }, + { + "epoch": 0.7478148926079002, + "grad_norm": 5.054128646850586, + "learning_rate": 8.591826955986037e-05, + "loss": 0.7635, + "step": 11037 + }, + { + "epoch": 0.7478826478758723, + "grad_norm": 6.404119968414307, + "learning_rate": 8.591690054076255e-05, + "loss": 0.8501, + "step": 11038 + }, + { + "epoch": 0.7479504031438444, + "grad_norm": 4.8361430168151855, + "learning_rate": 8.591553152166473e-05, + "loss": 0.8474, + "step": 11039 + }, + { + "epoch": 0.7480181584118165, + "grad_norm": 8.294208526611328, + "learning_rate": 8.591416250256692e-05, + "loss": 0.8207, + "step": 11040 + }, + { + "epoch": 0.7480859136797886, + "grad_norm": 4.709441661834717, + "learning_rate": 8.59127934834691e-05, + "loss": 0.7697, + "step": 11041 + }, + { + "epoch": 0.7481536689477607, + "grad_norm": 7.428562641143799, + "learning_rate": 8.591142446437128e-05, + "loss": 0.9717, + "step": 11042 + }, + { + "epoch": 0.7482214242157328, + "grad_norm": 5.442418098449707, + "learning_rate": 8.591005544527346e-05, + "loss": 0.7347, + "step": 11043 + }, + { + "epoch": 0.7482891794837049, + "grad_norm": 6.216116428375244, + "learning_rate": 8.590868642617564e-05, + "loss": 0.6919, + "step": 11044 + }, + { + "epoch": 0.748356934751677, + "grad_norm": 6.814431190490723, + "learning_rate": 8.590731740707784e-05, + "loss": 0.9049, + "step": 11045 + }, + { + "epoch": 0.7484246900196491, + "grad_norm": 6.422941207885742, + "learning_rate": 8.590594838798002e-05, + "loss": 0.8655, + "step": 11046 + }, + { + "epoch": 0.7484924452876212, + "grad_norm": 6.052054405212402, + "learning_rate": 8.59045793688822e-05, + "loss": 0.8558, + "step": 11047 + }, + { + "epoch": 0.7485602005555932, + "grad_norm": 7.135115146636963, + "learning_rate": 8.590321034978438e-05, + "loss": 0.7672, + "step": 11048 + }, + { + "epoch": 0.7486279558235652, + "grad_norm": 5.608342170715332, + "learning_rate": 8.590184133068657e-05, + "loss": 0.6696, + "step": 11049 + }, + { + "epoch": 0.7486957110915373, + "grad_norm": 6.936807155609131, + "learning_rate": 8.590047231158875e-05, + "loss": 0.9131, + "step": 11050 + }, + { + "epoch": 0.7487634663595094, + "grad_norm": 6.121455669403076, + "learning_rate": 8.589910329249093e-05, + "loss": 0.8561, + "step": 11051 + }, + { + "epoch": 0.7488312216274815, + "grad_norm": 5.633101463317871, + "learning_rate": 8.589773427339311e-05, + "loss": 0.7347, + "step": 11052 + }, + { + "epoch": 0.7488989768954536, + "grad_norm": 5.5565080642700195, + "learning_rate": 8.58963652542953e-05, + "loss": 0.9127, + "step": 11053 + }, + { + "epoch": 0.7489667321634257, + "grad_norm": 6.5937418937683105, + "learning_rate": 8.589499623519749e-05, + "loss": 0.9561, + "step": 11054 + }, + { + "epoch": 0.7490344874313978, + "grad_norm": 5.2491774559021, + "learning_rate": 8.589362721609967e-05, + "loss": 0.5976, + "step": 11055 + }, + { + "epoch": 0.7491022426993699, + "grad_norm": 7.343997478485107, + "learning_rate": 8.589225819700185e-05, + "loss": 0.7404, + "step": 11056 + }, + { + "epoch": 0.749169997967342, + "grad_norm": 5.535366058349609, + "learning_rate": 8.589088917790403e-05, + "loss": 0.9568, + "step": 11057 + }, + { + "epoch": 0.749237753235314, + "grad_norm": 5.751670837402344, + "learning_rate": 8.588952015880622e-05, + "loss": 0.6918, + "step": 11058 + }, + { + "epoch": 0.7493055085032861, + "grad_norm": 6.7734150886535645, + "learning_rate": 8.58881511397084e-05, + "loss": 0.7541, + "step": 11059 + }, + { + "epoch": 0.7493732637712582, + "grad_norm": 6.3308424949646, + "learning_rate": 8.588678212061058e-05, + "loss": 0.9318, + "step": 11060 + }, + { + "epoch": 0.7494410190392303, + "grad_norm": 6.561203479766846, + "learning_rate": 8.588541310151276e-05, + "loss": 0.8515, + "step": 11061 + }, + { + "epoch": 0.7495087743072024, + "grad_norm": 6.7505974769592285, + "learning_rate": 8.588404408241494e-05, + "loss": 0.5823, + "step": 11062 + }, + { + "epoch": 0.7495765295751745, + "grad_norm": 6.116795063018799, + "learning_rate": 8.588267506331714e-05, + "loss": 0.8536, + "step": 11063 + }, + { + "epoch": 0.7496442848431466, + "grad_norm": 5.443800926208496, + "learning_rate": 8.588130604421932e-05, + "loss": 0.8771, + "step": 11064 + }, + { + "epoch": 0.7497120401111187, + "grad_norm": 6.711660385131836, + "learning_rate": 8.58799370251215e-05, + "loss": 0.9655, + "step": 11065 + }, + { + "epoch": 0.7497797953790907, + "grad_norm": 7.173767566680908, + "learning_rate": 8.587856800602368e-05, + "loss": 0.8445, + "step": 11066 + }, + { + "epoch": 0.7498475506470628, + "grad_norm": 5.898787498474121, + "learning_rate": 8.587719898692587e-05, + "loss": 0.9952, + "step": 11067 + }, + { + "epoch": 0.7498475506470628, + "eval_loss": 0.784003496170044, + "eval_noise_accuracy": 0.0, + "eval_runtime": 1565.4788, + "eval_samples_per_second": 3.283, + "eval_steps_per_second": 0.206, + "eval_wer": 88.605156710052, + "step": 11067 + }, + { + "epoch": 0.7499153059150349, + "grad_norm": 5.584912300109863, + "learning_rate": 8.587582996782805e-05, + "loss": 0.6039, + "step": 11068 + }, + { + "epoch": 0.749983061183007, + "grad_norm": 7.05438232421875, + "learning_rate": 8.587446094873023e-05, + "loss": 0.8871, + "step": 11069 + }, + { + "epoch": 0.750050816450979, + "grad_norm": 5.896789073944092, + "learning_rate": 8.587309192963243e-05, + "loss": 0.6864, + "step": 11070 + }, + { + "epoch": 0.7501185717189511, + "grad_norm": 5.357899188995361, + "learning_rate": 8.587172291053461e-05, + "loss": 0.7744, + "step": 11071 + }, + { + "epoch": 0.7501863269869232, + "grad_norm": 5.943928241729736, + "learning_rate": 8.587035389143679e-05, + "loss": 0.8338, + "step": 11072 + }, + { + "epoch": 0.7502540822548953, + "grad_norm": 5.910472393035889, + "learning_rate": 8.586898487233898e-05, + "loss": 0.6772, + "step": 11073 + }, + { + "epoch": 0.7503218375228674, + "grad_norm": 7.976921558380127, + "learning_rate": 8.586761585324116e-05, + "loss": 0.6949, + "step": 11074 + }, + { + "epoch": 0.7503895927908395, + "grad_norm": 10.462014198303223, + "learning_rate": 8.586624683414334e-05, + "loss": 0.903, + "step": 11075 + }, + { + "epoch": 0.7504573480588116, + "grad_norm": 7.0388946533203125, + "learning_rate": 8.586487781504552e-05, + "loss": 0.9651, + "step": 11076 + }, + { + "epoch": 0.7505251033267837, + "grad_norm": 6.222432613372803, + "learning_rate": 8.586350879594772e-05, + "loss": 0.8637, + "step": 11077 + }, + { + "epoch": 0.7505928585947558, + "grad_norm": 5.401667594909668, + "learning_rate": 8.58621397768499e-05, + "loss": 1.0538, + "step": 11078 + }, + { + "epoch": 0.7506606138627279, + "grad_norm": 5.605716705322266, + "learning_rate": 8.586077075775208e-05, + "loss": 0.589, + "step": 11079 + }, + { + "epoch": 0.7507283691307, + "grad_norm": 5.001210689544678, + "learning_rate": 8.585940173865426e-05, + "loss": 0.8688, + "step": 11080 + }, + { + "epoch": 0.750796124398672, + "grad_norm": 6.432781219482422, + "learning_rate": 8.585803271955645e-05, + "loss": 0.8322, + "step": 11081 + }, + { + "epoch": 0.750863879666644, + "grad_norm": 5.7087483406066895, + "learning_rate": 8.585666370045863e-05, + "loss": 0.913, + "step": 11082 + }, + { + "epoch": 0.7509316349346161, + "grad_norm": 6.006197452545166, + "learning_rate": 8.585529468136081e-05, + "loss": 0.7931, + "step": 11083 + }, + { + "epoch": 0.7509993902025882, + "grad_norm": 6.532283782958984, + "learning_rate": 8.585392566226299e-05, + "loss": 0.7063, + "step": 11084 + }, + { + "epoch": 0.7510671454705603, + "grad_norm": 5.088547229766846, + "learning_rate": 8.585255664316517e-05, + "loss": 0.7701, + "step": 11085 + }, + { + "epoch": 0.7511349007385324, + "grad_norm": 4.305655479431152, + "learning_rate": 8.585118762406737e-05, + "loss": 0.5815, + "step": 11086 + }, + { + "epoch": 0.7512026560065045, + "grad_norm": 6.688506603240967, + "learning_rate": 8.584981860496955e-05, + "loss": 0.7622, + "step": 11087 + }, + { + "epoch": 0.7512704112744766, + "grad_norm": 4.817746639251709, + "learning_rate": 8.584844958587173e-05, + "loss": 0.6914, + "step": 11088 + }, + { + "epoch": 0.7513381665424487, + "grad_norm": 6.330674648284912, + "learning_rate": 8.584708056677391e-05, + "loss": 0.7376, + "step": 11089 + }, + { + "epoch": 0.7514059218104208, + "grad_norm": 6.616771697998047, + "learning_rate": 8.584571154767609e-05, + "loss": 0.741, + "step": 11090 + }, + { + "epoch": 0.7514736770783929, + "grad_norm": 5.213818550109863, + "learning_rate": 8.584434252857828e-05, + "loss": 0.7452, + "step": 11091 + }, + { + "epoch": 0.751541432346365, + "grad_norm": 6.440800189971924, + "learning_rate": 8.584297350948046e-05, + "loss": 0.7903, + "step": 11092 + }, + { + "epoch": 0.751609187614337, + "grad_norm": 7.082359790802002, + "learning_rate": 8.584160449038264e-05, + "loss": 0.9452, + "step": 11093 + }, + { + "epoch": 0.7516769428823091, + "grad_norm": 5.019514083862305, + "learning_rate": 8.584023547128482e-05, + "loss": 0.6764, + "step": 11094 + }, + { + "epoch": 0.7517446981502812, + "grad_norm": 6.08909797668457, + "learning_rate": 8.583886645218702e-05, + "loss": 0.6528, + "step": 11095 + }, + { + "epoch": 0.7518124534182533, + "grad_norm": 6.15897274017334, + "learning_rate": 8.58374974330892e-05, + "loss": 0.7528, + "step": 11096 + }, + { + "epoch": 0.7518802086862254, + "grad_norm": 6.012785911560059, + "learning_rate": 8.583612841399138e-05, + "loss": 0.8348, + "step": 11097 + }, + { + "epoch": 0.7519479639541974, + "grad_norm": 6.340338230133057, + "learning_rate": 8.583475939489356e-05, + "loss": 0.8322, + "step": 11098 + }, + { + "epoch": 0.7520157192221695, + "grad_norm": 6.704728603363037, + "learning_rate": 8.583339037579574e-05, + "loss": 0.9286, + "step": 11099 + }, + { + "epoch": 0.7520834744901416, + "grad_norm": 7.134490013122559, + "learning_rate": 8.583202135669793e-05, + "loss": 0.9965, + "step": 11100 + }, + { + "epoch": 0.7521512297581137, + "grad_norm": 6.226062774658203, + "learning_rate": 8.583065233760011e-05, + "loss": 0.6716, + "step": 11101 + }, + { + "epoch": 0.7522189850260858, + "grad_norm": 5.990030765533447, + "learning_rate": 8.582928331850229e-05, + "loss": 0.7636, + "step": 11102 + }, + { + "epoch": 0.7522867402940578, + "grad_norm": 5.9684834480285645, + "learning_rate": 8.582791429940447e-05, + "loss": 0.7847, + "step": 11103 + }, + { + "epoch": 0.7523544955620299, + "grad_norm": 6.136056900024414, + "learning_rate": 8.582654528030667e-05, + "loss": 0.8641, + "step": 11104 + }, + { + "epoch": 0.752422250830002, + "grad_norm": 6.887735366821289, + "learning_rate": 8.582517626120885e-05, + "loss": 0.8848, + "step": 11105 + }, + { + "epoch": 0.7524900060979741, + "grad_norm": 5.988890647888184, + "learning_rate": 8.582380724211103e-05, + "loss": 0.8409, + "step": 11106 + }, + { + "epoch": 0.7525577613659462, + "grad_norm": 5.09688138961792, + "learning_rate": 8.582243822301321e-05, + "loss": 0.813, + "step": 11107 + }, + { + "epoch": 0.7526255166339183, + "grad_norm": 5.739447593688965, + "learning_rate": 8.582106920391539e-05, + "loss": 0.8414, + "step": 11108 + }, + { + "epoch": 0.7526932719018904, + "grad_norm": 7.205900192260742, + "learning_rate": 8.581970018481758e-05, + "loss": 0.7134, + "step": 11109 + }, + { + "epoch": 0.7527610271698625, + "grad_norm": 5.002781867980957, + "learning_rate": 8.581833116571976e-05, + "loss": 0.7937, + "step": 11110 + }, + { + "epoch": 0.7528287824378346, + "grad_norm": 7.405692100524902, + "learning_rate": 8.581696214662194e-05, + "loss": 0.7896, + "step": 11111 + }, + { + "epoch": 0.7528965377058067, + "grad_norm": 6.863432884216309, + "learning_rate": 8.581559312752412e-05, + "loss": 1.0113, + "step": 11112 + }, + { + "epoch": 0.7529642929737788, + "grad_norm": 6.594325542449951, + "learning_rate": 8.581422410842632e-05, + "loss": 1.1729, + "step": 11113 + }, + { + "epoch": 0.7530320482417509, + "grad_norm": 7.210272312164307, + "learning_rate": 8.58128550893285e-05, + "loss": 1.0648, + "step": 11114 + }, + { + "epoch": 0.7530998035097228, + "grad_norm": 6.694648742675781, + "learning_rate": 8.581148607023068e-05, + "loss": 0.7308, + "step": 11115 + }, + { + "epoch": 0.7531675587776949, + "grad_norm": 4.825174808502197, + "learning_rate": 8.581011705113287e-05, + "loss": 0.7378, + "step": 11116 + }, + { + "epoch": 0.753235314045667, + "grad_norm": 7.157209873199463, + "learning_rate": 8.580874803203505e-05, + "loss": 0.8906, + "step": 11117 + }, + { + "epoch": 0.7533030693136391, + "grad_norm": 7.719106674194336, + "learning_rate": 8.580737901293723e-05, + "loss": 0.5838, + "step": 11118 + }, + { + "epoch": 0.7533708245816112, + "grad_norm": 5.7004828453063965, + "learning_rate": 8.580600999383943e-05, + "loss": 0.8375, + "step": 11119 + }, + { + "epoch": 0.7534385798495833, + "grad_norm": 6.208179950714111, + "learning_rate": 8.580464097474161e-05, + "loss": 0.7722, + "step": 11120 + }, + { + "epoch": 0.7535063351175554, + "grad_norm": 7.243391990661621, + "learning_rate": 8.580327195564379e-05, + "loss": 1.0486, + "step": 11121 + }, + { + "epoch": 0.7535740903855275, + "grad_norm": 5.903680324554443, + "learning_rate": 8.580190293654597e-05, + "loss": 0.7814, + "step": 11122 + }, + { + "epoch": 0.7536418456534996, + "grad_norm": 6.017585277557373, + "learning_rate": 8.580053391744816e-05, + "loss": 0.9528, + "step": 11123 + }, + { + "epoch": 0.7537096009214717, + "grad_norm": 6.117088317871094, + "learning_rate": 8.579916489835034e-05, + "loss": 0.8894, + "step": 11124 + }, + { + "epoch": 0.7537773561894437, + "grad_norm": 4.541194915771484, + "learning_rate": 8.579779587925252e-05, + "loss": 0.6975, + "step": 11125 + }, + { + "epoch": 0.7538451114574158, + "grad_norm": 5.8757452964782715, + "learning_rate": 8.57964268601547e-05, + "loss": 1.0208, + "step": 11126 + }, + { + "epoch": 0.7539128667253879, + "grad_norm": 6.408355712890625, + "learning_rate": 8.57950578410569e-05, + "loss": 0.7566, + "step": 11127 + }, + { + "epoch": 0.75398062199336, + "grad_norm": 6.257778167724609, + "learning_rate": 8.579368882195908e-05, + "loss": 0.8246, + "step": 11128 + }, + { + "epoch": 0.7540483772613321, + "grad_norm": 5.630858898162842, + "learning_rate": 8.579231980286126e-05, + "loss": 0.9085, + "step": 11129 + }, + { + "epoch": 0.7541161325293042, + "grad_norm": 5.071380138397217, + "learning_rate": 8.579095078376344e-05, + "loss": 0.6106, + "step": 11130 + }, + { + "epoch": 0.7541838877972762, + "grad_norm": 5.429622173309326, + "learning_rate": 8.578958176466562e-05, + "loss": 0.5699, + "step": 11131 + }, + { + "epoch": 0.7542516430652483, + "grad_norm": 7.031247615814209, + "learning_rate": 8.578821274556781e-05, + "loss": 0.9928, + "step": 11132 + }, + { + "epoch": 0.7543193983332204, + "grad_norm": 6.868895053863525, + "learning_rate": 8.578684372646999e-05, + "loss": 0.5757, + "step": 11133 + }, + { + "epoch": 0.7543871536011925, + "grad_norm": 6.9466118812561035, + "learning_rate": 8.578547470737217e-05, + "loss": 0.7591, + "step": 11134 + }, + { + "epoch": 0.7544549088691646, + "grad_norm": 5.605920314788818, + "learning_rate": 8.578410568827435e-05, + "loss": 0.7244, + "step": 11135 + }, + { + "epoch": 0.7545226641371366, + "grad_norm": 5.563654899597168, + "learning_rate": 8.578273666917655e-05, + "loss": 0.6451, + "step": 11136 + }, + { + "epoch": 0.7545904194051087, + "grad_norm": 5.8858771324157715, + "learning_rate": 8.578136765007873e-05, + "loss": 0.9521, + "step": 11137 + }, + { + "epoch": 0.7546581746730808, + "grad_norm": 6.400531768798828, + "learning_rate": 8.577999863098091e-05, + "loss": 0.7077, + "step": 11138 + }, + { + "epoch": 0.7547259299410529, + "grad_norm": 6.579067230224609, + "learning_rate": 8.577862961188309e-05, + "loss": 0.8734, + "step": 11139 + }, + { + "epoch": 0.754793685209025, + "grad_norm": 5.629753112792969, + "learning_rate": 8.577726059278527e-05, + "loss": 0.7637, + "step": 11140 + }, + { + "epoch": 0.7548614404769971, + "grad_norm": 5.546406269073486, + "learning_rate": 8.577589157368746e-05, + "loss": 0.8094, + "step": 11141 + }, + { + "epoch": 0.7549291957449692, + "grad_norm": 5.510883331298828, + "learning_rate": 8.577452255458964e-05, + "loss": 0.8629, + "step": 11142 + }, + { + "epoch": 0.7549969510129413, + "grad_norm": 8.156396865844727, + "learning_rate": 8.577315353549182e-05, + "loss": 0.7051, + "step": 11143 + }, + { + "epoch": 0.7550647062809134, + "grad_norm": 5.59662389755249, + "learning_rate": 8.5771784516394e-05, + "loss": 0.8627, + "step": 11144 + }, + { + "epoch": 0.7551324615488855, + "grad_norm": 7.06483793258667, + "learning_rate": 8.577041549729618e-05, + "loss": 0.8544, + "step": 11145 + }, + { + "epoch": 0.7552002168168576, + "grad_norm": 5.5211663246154785, + "learning_rate": 8.576904647819838e-05, + "loss": 0.9621, + "step": 11146 + }, + { + "epoch": 0.7552679720848295, + "grad_norm": 5.117467880249023, + "learning_rate": 8.576767745910056e-05, + "loss": 0.7759, + "step": 11147 + }, + { + "epoch": 0.7553357273528016, + "grad_norm": 6.539487361907959, + "learning_rate": 8.576630844000274e-05, + "loss": 0.8292, + "step": 11148 + }, + { + "epoch": 0.7554034826207737, + "grad_norm": 6.704295635223389, + "learning_rate": 8.576493942090492e-05, + "loss": 0.6924, + "step": 11149 + }, + { + "epoch": 0.7554712378887458, + "grad_norm": 6.1922926902771, + "learning_rate": 8.576357040180711e-05, + "loss": 0.9602, + "step": 11150 + }, + { + "epoch": 0.7555389931567179, + "grad_norm": 6.421977519989014, + "learning_rate": 8.576220138270929e-05, + "loss": 0.6867, + "step": 11151 + }, + { + "epoch": 0.75560674842469, + "grad_norm": 9.157228469848633, + "learning_rate": 8.576083236361147e-05, + "loss": 0.7311, + "step": 11152 + }, + { + "epoch": 0.7556745036926621, + "grad_norm": 6.960337162017822, + "learning_rate": 8.575946334451365e-05, + "loss": 0.986, + "step": 11153 + }, + { + "epoch": 0.7557422589606342, + "grad_norm": 7.604123115539551, + "learning_rate": 8.575809432541583e-05, + "loss": 0.8723, + "step": 11154 + }, + { + "epoch": 0.7558100142286063, + "grad_norm": 4.943920135498047, + "learning_rate": 8.575672530631803e-05, + "loss": 0.7067, + "step": 11155 + }, + { + "epoch": 0.7558777694965784, + "grad_norm": 6.594372272491455, + "learning_rate": 8.575535628722021e-05, + "loss": 0.8089, + "step": 11156 + }, + { + "epoch": 0.7559455247645505, + "grad_norm": 6.759579181671143, + "learning_rate": 8.575398726812239e-05, + "loss": 0.8882, + "step": 11157 + }, + { + "epoch": 0.7560132800325225, + "grad_norm": 7.21705961227417, + "learning_rate": 8.575261824902457e-05, + "loss": 0.7922, + "step": 11158 + }, + { + "epoch": 0.7560810353004946, + "grad_norm": 5.465212821960449, + "learning_rate": 8.575124922992676e-05, + "loss": 0.5913, + "step": 11159 + }, + { + "epoch": 0.7561487905684667, + "grad_norm": 5.213903427124023, + "learning_rate": 8.574988021082894e-05, + "loss": 0.8809, + "step": 11160 + }, + { + "epoch": 0.7562165458364388, + "grad_norm": 5.521292686462402, + "learning_rate": 8.574851119173112e-05, + "loss": 0.8858, + "step": 11161 + }, + { + "epoch": 0.7562843011044109, + "grad_norm": 4.67765474319458, + "learning_rate": 8.574714217263332e-05, + "loss": 0.624, + "step": 11162 + }, + { + "epoch": 0.756352056372383, + "grad_norm": 6.993425369262695, + "learning_rate": 8.57457731535355e-05, + "loss": 0.8126, + "step": 11163 + }, + { + "epoch": 0.756419811640355, + "grad_norm": 6.692846775054932, + "learning_rate": 8.574440413443768e-05, + "loss": 1.0727, + "step": 11164 + }, + { + "epoch": 0.7564875669083271, + "grad_norm": 6.0148773193359375, + "learning_rate": 8.574303511533987e-05, + "loss": 0.9334, + "step": 11165 + }, + { + "epoch": 0.7565553221762992, + "grad_norm": 8.869120597839355, + "learning_rate": 8.574166609624205e-05, + "loss": 0.9927, + "step": 11166 + }, + { + "epoch": 0.7566230774442713, + "grad_norm": 6.259548187255859, + "learning_rate": 8.574029707714423e-05, + "loss": 0.9905, + "step": 11167 + }, + { + "epoch": 0.7566908327122434, + "grad_norm": 6.2330322265625, + "learning_rate": 8.573892805804643e-05, + "loss": 0.5814, + "step": 11168 + }, + { + "epoch": 0.7567585879802154, + "grad_norm": 5.996000289916992, + "learning_rate": 8.57375590389486e-05, + "loss": 0.7676, + "step": 11169 + }, + { + "epoch": 0.7568263432481875, + "grad_norm": 5.289711952209473, + "learning_rate": 8.573619001985079e-05, + "loss": 0.5761, + "step": 11170 + }, + { + "epoch": 0.7568940985161596, + "grad_norm": 7.447518825531006, + "learning_rate": 8.573482100075297e-05, + "loss": 0.8822, + "step": 11171 + }, + { + "epoch": 0.7569618537841317, + "grad_norm": 5.87143087387085, + "learning_rate": 8.573345198165515e-05, + "loss": 0.7935, + "step": 11172 + }, + { + "epoch": 0.7570296090521038, + "grad_norm": 5.967605113983154, + "learning_rate": 8.573208296255734e-05, + "loss": 0.7755, + "step": 11173 + }, + { + "epoch": 0.7570973643200759, + "grad_norm": 5.8669562339782715, + "learning_rate": 8.573071394345952e-05, + "loss": 0.7079, + "step": 11174 + }, + { + "epoch": 0.757165119588048, + "grad_norm": 5.084571361541748, + "learning_rate": 8.57293449243617e-05, + "loss": 0.5683, + "step": 11175 + }, + { + "epoch": 0.7572328748560201, + "grad_norm": 5.499231338500977, + "learning_rate": 8.572797590526388e-05, + "loss": 0.7443, + "step": 11176 + }, + { + "epoch": 0.7573006301239922, + "grad_norm": 6.275290489196777, + "learning_rate": 8.572660688616606e-05, + "loss": 0.8232, + "step": 11177 + }, + { + "epoch": 0.7573683853919643, + "grad_norm": 9.729387283325195, + "learning_rate": 8.572523786706826e-05, + "loss": 1.0499, + "step": 11178 + }, + { + "epoch": 0.7574361406599364, + "grad_norm": 5.243049144744873, + "learning_rate": 8.572386884797044e-05, + "loss": 0.8695, + "step": 11179 + }, + { + "epoch": 0.7575038959279083, + "grad_norm": 6.83506965637207, + "learning_rate": 8.572249982887262e-05, + "loss": 1.0048, + "step": 11180 + }, + { + "epoch": 0.7575716511958804, + "grad_norm": 5.801395893096924, + "learning_rate": 8.57211308097748e-05, + "loss": 0.7159, + "step": 11181 + }, + { + "epoch": 0.7576394064638525, + "grad_norm": 5.878986835479736, + "learning_rate": 8.571976179067699e-05, + "loss": 0.8116, + "step": 11182 + }, + { + "epoch": 0.7577071617318246, + "grad_norm": 5.621567726135254, + "learning_rate": 8.571839277157917e-05, + "loss": 0.771, + "step": 11183 + }, + { + "epoch": 0.7577749169997967, + "grad_norm": 6.197023391723633, + "learning_rate": 8.571702375248135e-05, + "loss": 0.7588, + "step": 11184 + }, + { + "epoch": 0.7578426722677688, + "grad_norm": 6.170006275177002, + "learning_rate": 8.571565473338353e-05, + "loss": 0.7644, + "step": 11185 + }, + { + "epoch": 0.7579104275357409, + "grad_norm": 7.654489517211914, + "learning_rate": 8.571428571428571e-05, + "loss": 0.8256, + "step": 11186 + }, + { + "epoch": 0.757978182803713, + "grad_norm": 6.45438289642334, + "learning_rate": 8.57129166951879e-05, + "loss": 0.8437, + "step": 11187 + }, + { + "epoch": 0.7580459380716851, + "grad_norm": 6.441734790802002, + "learning_rate": 8.571154767609009e-05, + "loss": 0.86, + "step": 11188 + }, + { + "epoch": 0.7581136933396572, + "grad_norm": 6.674750804901123, + "learning_rate": 8.571017865699227e-05, + "loss": 0.761, + "step": 11189 + }, + { + "epoch": 0.7581814486076293, + "grad_norm": 5.539404392242432, + "learning_rate": 8.570880963789445e-05, + "loss": 0.7542, + "step": 11190 + }, + { + "epoch": 0.7582492038756014, + "grad_norm": 7.210087776184082, + "learning_rate": 8.570744061879664e-05, + "loss": 0.9209, + "step": 11191 + }, + { + "epoch": 0.7583169591435734, + "grad_norm": 5.365791320800781, + "learning_rate": 8.570607159969882e-05, + "loss": 0.7885, + "step": 11192 + }, + { + "epoch": 0.7583847144115455, + "grad_norm": 5.170724391937256, + "learning_rate": 8.5704702580601e-05, + "loss": 0.7241, + "step": 11193 + }, + { + "epoch": 0.7584524696795176, + "grad_norm": 6.509468078613281, + "learning_rate": 8.570333356150318e-05, + "loss": 0.6644, + "step": 11194 + }, + { + "epoch": 0.7585202249474897, + "grad_norm": 7.682931900024414, + "learning_rate": 8.570196454240536e-05, + "loss": 0.8989, + "step": 11195 + }, + { + "epoch": 0.7585879802154617, + "grad_norm": 5.240300178527832, + "learning_rate": 8.570059552330756e-05, + "loss": 0.7577, + "step": 11196 + }, + { + "epoch": 0.7586557354834338, + "grad_norm": 6.523096084594727, + "learning_rate": 8.569922650420974e-05, + "loss": 0.9607, + "step": 11197 + }, + { + "epoch": 0.7587234907514059, + "grad_norm": 7.254205703735352, + "learning_rate": 8.569785748511192e-05, + "loss": 0.744, + "step": 11198 + }, + { + "epoch": 0.758791246019378, + "grad_norm": 6.33684778213501, + "learning_rate": 8.56964884660141e-05, + "loss": 0.7514, + "step": 11199 + }, + { + "epoch": 0.7588590012873501, + "grad_norm": 7.9795403480529785, + "learning_rate": 8.569511944691628e-05, + "loss": 0.8608, + "step": 11200 + }, + { + "epoch": 0.7589267565553222, + "grad_norm": 9.582289695739746, + "learning_rate": 8.569375042781847e-05, + "loss": 0.63, + "step": 11201 + }, + { + "epoch": 0.7589945118232942, + "grad_norm": 7.716179847717285, + "learning_rate": 8.569238140872065e-05, + "loss": 0.8992, + "step": 11202 + }, + { + "epoch": 0.7590622670912663, + "grad_norm": 7.2479095458984375, + "learning_rate": 8.569101238962283e-05, + "loss": 0.7297, + "step": 11203 + }, + { + "epoch": 0.7591300223592384, + "grad_norm": 5.631564140319824, + "learning_rate": 8.568964337052501e-05, + "loss": 0.5615, + "step": 11204 + }, + { + "epoch": 0.7591977776272105, + "grad_norm": 6.547656536102295, + "learning_rate": 8.568827435142721e-05, + "loss": 0.7367, + "step": 11205 + }, + { + "epoch": 0.7592655328951826, + "grad_norm": 5.980971813201904, + "learning_rate": 8.568690533232939e-05, + "loss": 0.857, + "step": 11206 + }, + { + "epoch": 0.7593332881631547, + "grad_norm": 5.813320159912109, + "learning_rate": 8.568553631323157e-05, + "loss": 0.9032, + "step": 11207 + }, + { + "epoch": 0.7594010434311268, + "grad_norm": 8.85381031036377, + "learning_rate": 8.568416729413376e-05, + "loss": 1.2161, + "step": 11208 + }, + { + "epoch": 0.7594687986990989, + "grad_norm": 5.185744285583496, + "learning_rate": 8.568279827503594e-05, + "loss": 0.7288, + "step": 11209 + }, + { + "epoch": 0.759536553967071, + "grad_norm": 6.757566452026367, + "learning_rate": 8.568142925593812e-05, + "loss": 0.8418, + "step": 11210 + }, + { + "epoch": 0.7596043092350431, + "grad_norm": 6.195647239685059, + "learning_rate": 8.568006023684032e-05, + "loss": 0.8931, + "step": 11211 + }, + { + "epoch": 0.759672064503015, + "grad_norm": 5.0666117668151855, + "learning_rate": 8.56786912177425e-05, + "loss": 0.677, + "step": 11212 + }, + { + "epoch": 0.7597398197709871, + "grad_norm": 6.256303787231445, + "learning_rate": 8.567732219864468e-05, + "loss": 0.8905, + "step": 11213 + }, + { + "epoch": 0.7598075750389592, + "grad_norm": 5.787086009979248, + "learning_rate": 8.567595317954687e-05, + "loss": 0.7823, + "step": 11214 + }, + { + "epoch": 0.7598753303069313, + "grad_norm": 6.895840644836426, + "learning_rate": 8.567458416044905e-05, + "loss": 0.7453, + "step": 11215 + }, + { + "epoch": 0.7599430855749034, + "grad_norm": 5.626133441925049, + "learning_rate": 8.567321514135123e-05, + "loss": 0.681, + "step": 11216 + }, + { + "epoch": 0.7600108408428755, + "grad_norm": 5.967296123504639, + "learning_rate": 8.567184612225341e-05, + "loss": 0.7011, + "step": 11217 + }, + { + "epoch": 0.7600785961108476, + "grad_norm": 6.5361175537109375, + "learning_rate": 8.567047710315559e-05, + "loss": 0.9365, + "step": 11218 + }, + { + "epoch": 0.7601463513788197, + "grad_norm": 5.349835395812988, + "learning_rate": 8.566910808405779e-05, + "loss": 0.8221, + "step": 11219 + }, + { + "epoch": 0.7602141066467918, + "grad_norm": 5.8798089027404785, + "learning_rate": 8.566773906495997e-05, + "loss": 0.8863, + "step": 11220 + }, + { + "epoch": 0.7602818619147639, + "grad_norm": 5.611867427825928, + "learning_rate": 8.566637004586215e-05, + "loss": 0.7967, + "step": 11221 + }, + { + "epoch": 0.760349617182736, + "grad_norm": 7.200530529022217, + "learning_rate": 8.566500102676433e-05, + "loss": 0.7193, + "step": 11222 + }, + { + "epoch": 0.7604173724507081, + "grad_norm": 8.199413299560547, + "learning_rate": 8.566363200766651e-05, + "loss": 1.0548, + "step": 11223 + }, + { + "epoch": 0.7604851277186802, + "grad_norm": 6.439299583435059, + "learning_rate": 8.56622629885687e-05, + "loss": 0.9681, + "step": 11224 + }, + { + "epoch": 0.7605528829866522, + "grad_norm": 5.07982873916626, + "learning_rate": 8.566089396947088e-05, + "loss": 0.5863, + "step": 11225 + }, + { + "epoch": 0.7606206382546243, + "grad_norm": 5.301721572875977, + "learning_rate": 8.565952495037306e-05, + "loss": 0.6788, + "step": 11226 + }, + { + "epoch": 0.7606883935225964, + "grad_norm": 6.359459400177002, + "learning_rate": 8.565815593127524e-05, + "loss": 0.8885, + "step": 11227 + }, + { + "epoch": 0.7607561487905685, + "grad_norm": 7.1541619300842285, + "learning_rate": 8.565678691217744e-05, + "loss": 0.7039, + "step": 11228 + }, + { + "epoch": 0.7608239040585405, + "grad_norm": 5.493987560272217, + "learning_rate": 8.565541789307962e-05, + "loss": 0.698, + "step": 11229 + }, + { + "epoch": 0.7608916593265126, + "grad_norm": 5.63759708404541, + "learning_rate": 8.56540488739818e-05, + "loss": 0.73, + "step": 11230 + }, + { + "epoch": 0.7609594145944847, + "grad_norm": 4.844949245452881, + "learning_rate": 8.565267985488398e-05, + "loss": 0.9189, + "step": 11231 + }, + { + "epoch": 0.7610271698624568, + "grad_norm": 5.61946964263916, + "learning_rate": 8.565131083578616e-05, + "loss": 0.9857, + "step": 11232 + }, + { + "epoch": 0.7610949251304289, + "grad_norm": 6.210287570953369, + "learning_rate": 8.564994181668835e-05, + "loss": 0.9976, + "step": 11233 + }, + { + "epoch": 0.761162680398401, + "grad_norm": 8.433357238769531, + "learning_rate": 8.564857279759053e-05, + "loss": 0.7793, + "step": 11234 + }, + { + "epoch": 0.761230435666373, + "grad_norm": 6.107748031616211, + "learning_rate": 8.564720377849271e-05, + "loss": 0.8645, + "step": 11235 + }, + { + "epoch": 0.7612981909343451, + "grad_norm": 5.744904041290283, + "learning_rate": 8.564583475939489e-05, + "loss": 0.9381, + "step": 11236 + }, + { + "epoch": 0.7613659462023172, + "grad_norm": 7.262130260467529, + "learning_rate": 8.564446574029709e-05, + "loss": 0.6813, + "step": 11237 + }, + { + "epoch": 0.7614337014702893, + "grad_norm": 5.340671062469482, + "learning_rate": 8.564309672119927e-05, + "loss": 0.8911, + "step": 11238 + }, + { + "epoch": 0.7615014567382614, + "grad_norm": 5.608626842498779, + "learning_rate": 8.564172770210145e-05, + "loss": 0.8672, + "step": 11239 + }, + { + "epoch": 0.7615692120062335, + "grad_norm": 5.490970611572266, + "learning_rate": 8.564035868300363e-05, + "loss": 0.7373, + "step": 11240 + }, + { + "epoch": 0.7616369672742056, + "grad_norm": 5.939301013946533, + "learning_rate": 8.563898966390581e-05, + "loss": 0.8574, + "step": 11241 + }, + { + "epoch": 0.7617047225421777, + "grad_norm": 5.254984378814697, + "learning_rate": 8.5637620644808e-05, + "loss": 0.888, + "step": 11242 + }, + { + "epoch": 0.7617724778101498, + "grad_norm": 5.318180561065674, + "learning_rate": 8.563625162571018e-05, + "loss": 0.7723, + "step": 11243 + }, + { + "epoch": 0.7618402330781219, + "grad_norm": 6.106221675872803, + "learning_rate": 8.563488260661236e-05, + "loss": 1.0503, + "step": 11244 + }, + { + "epoch": 0.7619079883460939, + "grad_norm": 6.882452011108398, + "learning_rate": 8.563351358751454e-05, + "loss": 0.875, + "step": 11245 + }, + { + "epoch": 0.761975743614066, + "grad_norm": 5.541532516479492, + "learning_rate": 8.563214456841674e-05, + "loss": 0.7503, + "step": 11246 + }, + { + "epoch": 0.762043498882038, + "grad_norm": 6.156836986541748, + "learning_rate": 8.563077554931892e-05, + "loss": 0.6966, + "step": 11247 + }, + { + "epoch": 0.7621112541500101, + "grad_norm": 5.691517353057861, + "learning_rate": 8.56294065302211e-05, + "loss": 0.8133, + "step": 11248 + }, + { + "epoch": 0.7621790094179822, + "grad_norm": 5.82022762298584, + "learning_rate": 8.562803751112328e-05, + "loss": 0.8547, + "step": 11249 + }, + { + "epoch": 0.7622467646859543, + "grad_norm": 7.37861442565918, + "learning_rate": 8.562666849202546e-05, + "loss": 0.9013, + "step": 11250 + }, + { + "epoch": 0.7623145199539264, + "grad_norm": 6.810284614562988, + "learning_rate": 8.562529947292765e-05, + "loss": 0.8586, + "step": 11251 + }, + { + "epoch": 0.7623822752218985, + "grad_norm": 5.333098411560059, + "learning_rate": 8.562393045382983e-05, + "loss": 0.719, + "step": 11252 + }, + { + "epoch": 0.7624500304898706, + "grad_norm": 5.309335231781006, + "learning_rate": 8.562256143473201e-05, + "loss": 0.6253, + "step": 11253 + }, + { + "epoch": 0.7625177857578427, + "grad_norm": 6.738717079162598, + "learning_rate": 8.562119241563419e-05, + "loss": 0.8135, + "step": 11254 + }, + { + "epoch": 0.7625855410258148, + "grad_norm": 7.389588832855225, + "learning_rate": 8.561982339653639e-05, + "loss": 0.9612, + "step": 11255 + }, + { + "epoch": 0.7626532962937869, + "grad_norm": 5.986783504486084, + "learning_rate": 8.561845437743857e-05, + "loss": 0.8069, + "step": 11256 + }, + { + "epoch": 0.762721051561759, + "grad_norm": 4.995396614074707, + "learning_rate": 8.561708535834076e-05, + "loss": 0.8176, + "step": 11257 + }, + { + "epoch": 0.762788806829731, + "grad_norm": 5.639811992645264, + "learning_rate": 8.561571633924294e-05, + "loss": 0.8188, + "step": 11258 + }, + { + "epoch": 0.7628565620977031, + "grad_norm": 5.036534309387207, + "learning_rate": 8.561434732014512e-05, + "loss": 0.6055, + "step": 11259 + }, + { + "epoch": 0.7629243173656752, + "grad_norm": 7.262217998504639, + "learning_rate": 8.561297830104732e-05, + "loss": 0.9039, + "step": 11260 + }, + { + "epoch": 0.7629920726336472, + "grad_norm": 6.3017497062683105, + "learning_rate": 8.56116092819495e-05, + "loss": 0.9808, + "step": 11261 + }, + { + "epoch": 0.7630598279016193, + "grad_norm": 5.999700546264648, + "learning_rate": 8.561024026285168e-05, + "loss": 0.771, + "step": 11262 + }, + { + "epoch": 0.7631275831695914, + "grad_norm": 5.911481857299805, + "learning_rate": 8.560887124375386e-05, + "loss": 0.8681, + "step": 11263 + }, + { + "epoch": 0.7631953384375635, + "grad_norm": 6.147214412689209, + "learning_rate": 8.560750222465604e-05, + "loss": 0.7072, + "step": 11264 + }, + { + "epoch": 0.7632630937055356, + "grad_norm": 7.115184783935547, + "learning_rate": 8.560613320555823e-05, + "loss": 0.6084, + "step": 11265 + }, + { + "epoch": 0.7633308489735077, + "grad_norm": 5.635624885559082, + "learning_rate": 8.560476418646041e-05, + "loss": 0.7086, + "step": 11266 + }, + { + "epoch": 0.7633986042414798, + "grad_norm": 5.630052089691162, + "learning_rate": 8.560339516736259e-05, + "loss": 0.7175, + "step": 11267 + }, + { + "epoch": 0.7634663595094519, + "grad_norm": 6.82834529876709, + "learning_rate": 8.560202614826477e-05, + "loss": 0.8163, + "step": 11268 + }, + { + "epoch": 0.763534114777424, + "grad_norm": 5.716002464294434, + "learning_rate": 8.560065712916697e-05, + "loss": 0.7745, + "step": 11269 + }, + { + "epoch": 0.763601870045396, + "grad_norm": 8.498682022094727, + "learning_rate": 8.559928811006915e-05, + "loss": 1.2038, + "step": 11270 + }, + { + "epoch": 0.7636696253133681, + "grad_norm": 6.311810493469238, + "learning_rate": 8.559791909097133e-05, + "loss": 0.723, + "step": 11271 + }, + { + "epoch": 0.7637373805813402, + "grad_norm": 8.272360801696777, + "learning_rate": 8.55965500718735e-05, + "loss": 0.8831, + "step": 11272 + }, + { + "epoch": 0.7638051358493123, + "grad_norm": 8.232126235961914, + "learning_rate": 8.559518105277569e-05, + "loss": 0.9191, + "step": 11273 + }, + { + "epoch": 0.7638728911172844, + "grad_norm": 5.009904384613037, + "learning_rate": 8.559381203367788e-05, + "loss": 0.7523, + "step": 11274 + }, + { + "epoch": 0.7639406463852565, + "grad_norm": 5.405948638916016, + "learning_rate": 8.559244301458006e-05, + "loss": 0.8288, + "step": 11275 + }, + { + "epoch": 0.7640084016532286, + "grad_norm": 5.3381428718566895, + "learning_rate": 8.559107399548224e-05, + "loss": 0.8073, + "step": 11276 + }, + { + "epoch": 0.7640761569212007, + "grad_norm": 5.603418827056885, + "learning_rate": 8.558970497638442e-05, + "loss": 0.5939, + "step": 11277 + }, + { + "epoch": 0.7641439121891727, + "grad_norm": 6.921864986419678, + "learning_rate": 8.55883359572866e-05, + "loss": 0.7877, + "step": 11278 + }, + { + "epoch": 0.7642116674571447, + "grad_norm": 7.049600124359131, + "learning_rate": 8.55869669381888e-05, + "loss": 1.0853, + "step": 11279 + }, + { + "epoch": 0.7642794227251168, + "grad_norm": 6.981689929962158, + "learning_rate": 8.558559791909098e-05, + "loss": 0.9528, + "step": 11280 + }, + { + "epoch": 0.7643471779930889, + "grad_norm": 5.135133266448975, + "learning_rate": 8.558422889999316e-05, + "loss": 0.6623, + "step": 11281 + }, + { + "epoch": 0.764414933261061, + "grad_norm": 4.53966760635376, + "learning_rate": 8.558285988089534e-05, + "loss": 0.5281, + "step": 11282 + }, + { + "epoch": 0.7644826885290331, + "grad_norm": 7.814493656158447, + "learning_rate": 8.558149086179753e-05, + "loss": 1.132, + "step": 11283 + }, + { + "epoch": 0.7645504437970052, + "grad_norm": 7.641327381134033, + "learning_rate": 8.558012184269971e-05, + "loss": 1.0694, + "step": 11284 + }, + { + "epoch": 0.7646181990649773, + "grad_norm": 5.932433128356934, + "learning_rate": 8.557875282360189e-05, + "loss": 0.9439, + "step": 11285 + }, + { + "epoch": 0.7646859543329494, + "grad_norm": 5.546622276306152, + "learning_rate": 8.557738380450407e-05, + "loss": 0.8643, + "step": 11286 + }, + { + "epoch": 0.7647537096009215, + "grad_norm": 5.611812114715576, + "learning_rate": 8.557601478540625e-05, + "loss": 0.8404, + "step": 11287 + }, + { + "epoch": 0.7648214648688936, + "grad_norm": 6.550889015197754, + "learning_rate": 8.557464576630845e-05, + "loss": 0.6958, + "step": 11288 + }, + { + "epoch": 0.7648892201368657, + "grad_norm": 5.113431930541992, + "learning_rate": 8.557327674721063e-05, + "loss": 0.5798, + "step": 11289 + }, + { + "epoch": 0.7649569754048378, + "grad_norm": 5.645472526550293, + "learning_rate": 8.557190772811281e-05, + "loss": 0.9511, + "step": 11290 + }, + { + "epoch": 0.7650247306728098, + "grad_norm": 6.488102912902832, + "learning_rate": 8.557053870901499e-05, + "loss": 1.1031, + "step": 11291 + }, + { + "epoch": 0.7650924859407819, + "grad_norm": 5.89171028137207, + "learning_rate": 8.556916968991718e-05, + "loss": 0.6539, + "step": 11292 + }, + { + "epoch": 0.765160241208754, + "grad_norm": 6.806115627288818, + "learning_rate": 8.556780067081936e-05, + "loss": 0.7376, + "step": 11293 + }, + { + "epoch": 0.765227996476726, + "grad_norm": 4.954026222229004, + "learning_rate": 8.556643165172154e-05, + "loss": 0.8474, + "step": 11294 + }, + { + "epoch": 0.7652957517446981, + "grad_norm": 5.056695461273193, + "learning_rate": 8.556506263262372e-05, + "loss": 0.6977, + "step": 11295 + }, + { + "epoch": 0.7653635070126702, + "grad_norm": 6.3873162269592285, + "learning_rate": 8.55636936135259e-05, + "loss": 0.8378, + "step": 11296 + }, + { + "epoch": 0.7654312622806423, + "grad_norm": 6.397019386291504, + "learning_rate": 8.55623245944281e-05, + "loss": 0.7574, + "step": 11297 + }, + { + "epoch": 0.7654990175486144, + "grad_norm": 7.052022457122803, + "learning_rate": 8.556095557533028e-05, + "loss": 0.6341, + "step": 11298 + }, + { + "epoch": 0.7655667728165865, + "grad_norm": 6.845788478851318, + "learning_rate": 8.555958655623246e-05, + "loss": 0.9614, + "step": 11299 + }, + { + "epoch": 0.7656345280845586, + "grad_norm": 6.307339668273926, + "learning_rate": 8.555821753713464e-05, + "loss": 0.8428, + "step": 11300 + }, + { + "epoch": 0.7657022833525307, + "grad_norm": 9.020713806152344, + "learning_rate": 8.555684851803683e-05, + "loss": 0.9878, + "step": 11301 + }, + { + "epoch": 0.7657700386205027, + "grad_norm": 5.308076858520508, + "learning_rate": 8.555547949893901e-05, + "loss": 0.614, + "step": 11302 + }, + { + "epoch": 0.7658377938884748, + "grad_norm": 5.961923122406006, + "learning_rate": 8.555411047984119e-05, + "loss": 0.7601, + "step": 11303 + }, + { + "epoch": 0.7659055491564469, + "grad_norm": 5.368841648101807, + "learning_rate": 8.555274146074339e-05, + "loss": 0.802, + "step": 11304 + }, + { + "epoch": 0.765973304424419, + "grad_norm": 6.24576473236084, + "learning_rate": 8.555137244164557e-05, + "loss": 0.8612, + "step": 11305 + }, + { + "epoch": 0.7660410596923911, + "grad_norm": 6.770591735839844, + "learning_rate": 8.555000342254775e-05, + "loss": 0.9074, + "step": 11306 + }, + { + "epoch": 0.7661088149603632, + "grad_norm": 7.066356658935547, + "learning_rate": 8.554863440344994e-05, + "loss": 0.9979, + "step": 11307 + }, + { + "epoch": 0.7661765702283353, + "grad_norm": 6.5950727462768555, + "learning_rate": 8.554726538435212e-05, + "loss": 0.6511, + "step": 11308 + }, + { + "epoch": 0.7662443254963074, + "grad_norm": 7.035087585449219, + "learning_rate": 8.55458963652543e-05, + "loss": 0.6259, + "step": 11309 + }, + { + "epoch": 0.7663120807642794, + "grad_norm": 5.893954753875732, + "learning_rate": 8.554452734615648e-05, + "loss": 0.8179, + "step": 11310 + }, + { + "epoch": 0.7663798360322515, + "grad_norm": 9.012917518615723, + "learning_rate": 8.554315832705868e-05, + "loss": 0.8251, + "step": 11311 + }, + { + "epoch": 0.7664475913002236, + "grad_norm": 6.316323280334473, + "learning_rate": 8.554178930796086e-05, + "loss": 0.9139, + "step": 11312 + }, + { + "epoch": 0.7665153465681956, + "grad_norm": 7.094173908233643, + "learning_rate": 8.554042028886304e-05, + "loss": 0.8208, + "step": 11313 + }, + { + "epoch": 0.7665831018361677, + "grad_norm": 6.499126434326172, + "learning_rate": 8.553905126976522e-05, + "loss": 0.7567, + "step": 11314 + }, + { + "epoch": 0.7666508571041398, + "grad_norm": 5.729644775390625, + "learning_rate": 8.553768225066741e-05, + "loss": 0.7671, + "step": 11315 + }, + { + "epoch": 0.7667186123721119, + "grad_norm": 8.627473831176758, + "learning_rate": 8.553631323156959e-05, + "loss": 1.0222, + "step": 11316 + }, + { + "epoch": 0.766786367640084, + "grad_norm": 6.386945724487305, + "learning_rate": 8.553494421247177e-05, + "loss": 0.7227, + "step": 11317 + }, + { + "epoch": 0.7668541229080561, + "grad_norm": 5.726612567901611, + "learning_rate": 8.553357519337395e-05, + "loss": 0.931, + "step": 11318 + }, + { + "epoch": 0.7669218781760282, + "grad_norm": 6.600430488586426, + "learning_rate": 8.553220617427613e-05, + "loss": 0.9788, + "step": 11319 + }, + { + "epoch": 0.7669896334440003, + "grad_norm": 4.86123514175415, + "learning_rate": 8.553083715517833e-05, + "loss": 0.7648, + "step": 11320 + }, + { + "epoch": 0.7670573887119724, + "grad_norm": 6.815804481506348, + "learning_rate": 8.55294681360805e-05, + "loss": 0.5505, + "step": 11321 + }, + { + "epoch": 0.7671251439799445, + "grad_norm": 6.213870048522949, + "learning_rate": 8.552809911698269e-05, + "loss": 0.8116, + "step": 11322 + }, + { + "epoch": 0.7671928992479166, + "grad_norm": 6.11044454574585, + "learning_rate": 8.552673009788487e-05, + "loss": 0.8966, + "step": 11323 + }, + { + "epoch": 0.7672606545158887, + "grad_norm": 6.12937068939209, + "learning_rate": 8.552536107878706e-05, + "loss": 0.7981, + "step": 11324 + }, + { + "epoch": 0.7673284097838607, + "grad_norm": 6.787543773651123, + "learning_rate": 8.552399205968924e-05, + "loss": 0.8515, + "step": 11325 + }, + { + "epoch": 0.7673961650518328, + "grad_norm": 6.377131462097168, + "learning_rate": 8.552262304059142e-05, + "loss": 0.7469, + "step": 11326 + }, + { + "epoch": 0.7674639203198048, + "grad_norm": 6.438225746154785, + "learning_rate": 8.55212540214936e-05, + "loss": 0.8255, + "step": 11327 + }, + { + "epoch": 0.7675316755877769, + "grad_norm": 8.09716510772705, + "learning_rate": 8.551988500239578e-05, + "loss": 1.0083, + "step": 11328 + }, + { + "epoch": 0.767599430855749, + "grad_norm": 6.187016487121582, + "learning_rate": 8.551851598329798e-05, + "loss": 0.8532, + "step": 11329 + }, + { + "epoch": 0.7676671861237211, + "grad_norm": 6.157771587371826, + "learning_rate": 8.551714696420016e-05, + "loss": 0.7899, + "step": 11330 + }, + { + "epoch": 0.7677349413916932, + "grad_norm": 6.330532073974609, + "learning_rate": 8.551577794510234e-05, + "loss": 0.8263, + "step": 11331 + }, + { + "epoch": 0.7678026966596653, + "grad_norm": 6.730197906494141, + "learning_rate": 8.551440892600452e-05, + "loss": 0.7482, + "step": 11332 + }, + { + "epoch": 0.7678704519276374, + "grad_norm": 9.031699180603027, + "learning_rate": 8.55130399069067e-05, + "loss": 0.7286, + "step": 11333 + }, + { + "epoch": 0.7679382071956095, + "grad_norm": 7.25761604309082, + "learning_rate": 8.551167088780889e-05, + "loss": 1.0674, + "step": 11334 + }, + { + "epoch": 0.7680059624635815, + "grad_norm": 6.347570896148682, + "learning_rate": 8.551030186871107e-05, + "loss": 0.7851, + "step": 11335 + }, + { + "epoch": 0.7680737177315536, + "grad_norm": 5.421909809112549, + "learning_rate": 8.550893284961325e-05, + "loss": 0.869, + "step": 11336 + }, + { + "epoch": 0.7681414729995257, + "grad_norm": 6.994184970855713, + "learning_rate": 8.550756383051543e-05, + "loss": 0.9512, + "step": 11337 + }, + { + "epoch": 0.7682092282674978, + "grad_norm": 5.763876914978027, + "learning_rate": 8.550619481141763e-05, + "loss": 0.7909, + "step": 11338 + }, + { + "epoch": 0.7682769835354699, + "grad_norm": 5.998788833618164, + "learning_rate": 8.55048257923198e-05, + "loss": 0.8029, + "step": 11339 + }, + { + "epoch": 0.768344738803442, + "grad_norm": 5.988320827484131, + "learning_rate": 8.550345677322199e-05, + "loss": 0.7934, + "step": 11340 + }, + { + "epoch": 0.7684124940714141, + "grad_norm": 4.84094762802124, + "learning_rate": 8.550208775412417e-05, + "loss": 0.8953, + "step": 11341 + }, + { + "epoch": 0.7684802493393862, + "grad_norm": 6.44489049911499, + "learning_rate": 8.550071873502635e-05, + "loss": 1.0824, + "step": 11342 + }, + { + "epoch": 0.7685480046073582, + "grad_norm": 5.414109230041504, + "learning_rate": 8.549934971592854e-05, + "loss": 0.8749, + "step": 11343 + }, + { + "epoch": 0.7686157598753303, + "grad_norm": 5.344674110412598, + "learning_rate": 8.549798069683072e-05, + "loss": 0.5946, + "step": 11344 + }, + { + "epoch": 0.7686835151433024, + "grad_norm": 5.795503616333008, + "learning_rate": 8.54966116777329e-05, + "loss": 0.8889, + "step": 11345 + }, + { + "epoch": 0.7687512704112744, + "grad_norm": 6.298269271850586, + "learning_rate": 8.549524265863508e-05, + "loss": 0.5105, + "step": 11346 + }, + { + "epoch": 0.7688190256792465, + "grad_norm": 5.435570240020752, + "learning_rate": 8.549387363953728e-05, + "loss": 0.738, + "step": 11347 + }, + { + "epoch": 0.7688867809472186, + "grad_norm": 5.928055763244629, + "learning_rate": 8.549250462043946e-05, + "loss": 0.8317, + "step": 11348 + }, + { + "epoch": 0.7689545362151907, + "grad_norm": 6.4229350090026855, + "learning_rate": 8.549113560134164e-05, + "loss": 0.6707, + "step": 11349 + }, + { + "epoch": 0.7690222914831628, + "grad_norm": 6.353605270385742, + "learning_rate": 8.548976658224383e-05, + "loss": 0.9008, + "step": 11350 + }, + { + "epoch": 0.7690900467511349, + "grad_norm": 5.449488639831543, + "learning_rate": 8.548839756314601e-05, + "loss": 0.966, + "step": 11351 + }, + { + "epoch": 0.769157802019107, + "grad_norm": 4.989541053771973, + "learning_rate": 8.548702854404819e-05, + "loss": 0.604, + "step": 11352 + }, + { + "epoch": 0.7692255572870791, + "grad_norm": 5.058244228363037, + "learning_rate": 8.548565952495039e-05, + "loss": 0.6988, + "step": 11353 + }, + { + "epoch": 0.7692933125550512, + "grad_norm": 5.852552890777588, + "learning_rate": 8.548429050585257e-05, + "loss": 0.6343, + "step": 11354 + }, + { + "epoch": 0.7693610678230233, + "grad_norm": 7.096445560455322, + "learning_rate": 8.548292148675475e-05, + "loss": 0.7653, + "step": 11355 + }, + { + "epoch": 0.7694288230909954, + "grad_norm": 6.468287467956543, + "learning_rate": 8.548155246765693e-05, + "loss": 0.7799, + "step": 11356 + }, + { + "epoch": 0.7694965783589675, + "grad_norm": 7.869673252105713, + "learning_rate": 8.548018344855912e-05, + "loss": 0.9937, + "step": 11357 + }, + { + "epoch": 0.7695643336269395, + "grad_norm": 6.300357341766357, + "learning_rate": 8.54788144294613e-05, + "loss": 0.644, + "step": 11358 + }, + { + "epoch": 0.7696320888949115, + "grad_norm": 6.4069318771362305, + "learning_rate": 8.547744541036348e-05, + "loss": 0.8422, + "step": 11359 + }, + { + "epoch": 0.7696998441628836, + "grad_norm": 5.846930980682373, + "learning_rate": 8.547607639126566e-05, + "loss": 0.7335, + "step": 11360 + }, + { + "epoch": 0.7697675994308557, + "grad_norm": 6.80328893661499, + "learning_rate": 8.547470737216785e-05, + "loss": 0.6947, + "step": 11361 + }, + { + "epoch": 0.7698353546988278, + "grad_norm": 5.2702412605285645, + "learning_rate": 8.547333835307004e-05, + "loss": 0.6963, + "step": 11362 + }, + { + "epoch": 0.7699031099667999, + "grad_norm": 5.755353927612305, + "learning_rate": 8.547196933397222e-05, + "loss": 0.7093, + "step": 11363 + }, + { + "epoch": 0.769970865234772, + "grad_norm": 5.961780071258545, + "learning_rate": 8.54706003148744e-05, + "loss": 0.8871, + "step": 11364 + }, + { + "epoch": 0.7700386205027441, + "grad_norm": 5.585000514984131, + "learning_rate": 8.546923129577658e-05, + "loss": 0.677, + "step": 11365 + }, + { + "epoch": 0.7701063757707162, + "grad_norm": 5.904151916503906, + "learning_rate": 8.546786227667877e-05, + "loss": 0.8357, + "step": 11366 + }, + { + "epoch": 0.7701741310386883, + "grad_norm": 5.614160537719727, + "learning_rate": 8.546649325758095e-05, + "loss": 0.8266, + "step": 11367 + }, + { + "epoch": 0.7702418863066604, + "grad_norm": 5.502987861633301, + "learning_rate": 8.546512423848313e-05, + "loss": 0.715, + "step": 11368 + }, + { + "epoch": 0.7703096415746324, + "grad_norm": 5.5060625076293945, + "learning_rate": 8.546375521938531e-05, + "loss": 0.6332, + "step": 11369 + }, + { + "epoch": 0.7703773968426045, + "grad_norm": 6.361515522003174, + "learning_rate": 8.54623862002875e-05, + "loss": 0.8958, + "step": 11370 + }, + { + "epoch": 0.7704451521105766, + "grad_norm": 5.42423152923584, + "learning_rate": 8.546101718118969e-05, + "loss": 0.5965, + "step": 11371 + }, + { + "epoch": 0.7705129073785487, + "grad_norm": 7.705050945281982, + "learning_rate": 8.545964816209187e-05, + "loss": 0.8397, + "step": 11372 + }, + { + "epoch": 0.7705806626465208, + "grad_norm": 8.009198188781738, + "learning_rate": 8.545827914299405e-05, + "loss": 0.861, + "step": 11373 + }, + { + "epoch": 0.7706484179144929, + "grad_norm": 7.349557876586914, + "learning_rate": 8.545691012389623e-05, + "loss": 1.1441, + "step": 11374 + }, + { + "epoch": 0.770716173182465, + "grad_norm": 6.47999382019043, + "learning_rate": 8.545554110479842e-05, + "loss": 0.8032, + "step": 11375 + }, + { + "epoch": 0.770783928450437, + "grad_norm": 8.485747337341309, + "learning_rate": 8.54541720857006e-05, + "loss": 0.8073, + "step": 11376 + }, + { + "epoch": 0.7708516837184091, + "grad_norm": 7.178839683532715, + "learning_rate": 8.545280306660278e-05, + "loss": 0.8595, + "step": 11377 + }, + { + "epoch": 0.7709194389863812, + "grad_norm": 7.898037910461426, + "learning_rate": 8.545143404750496e-05, + "loss": 1.0232, + "step": 11378 + }, + { + "epoch": 0.7709871942543532, + "grad_norm": 6.554230690002441, + "learning_rate": 8.545006502840716e-05, + "loss": 0.8139, + "step": 11379 + }, + { + "epoch": 0.7710549495223253, + "grad_norm": 5.184169769287109, + "learning_rate": 8.544869600930934e-05, + "loss": 0.8208, + "step": 11380 + }, + { + "epoch": 0.7711227047902974, + "grad_norm": 6.4749579429626465, + "learning_rate": 8.544732699021152e-05, + "loss": 0.9345, + "step": 11381 + }, + { + "epoch": 0.7711904600582695, + "grad_norm": 6.739956378936768, + "learning_rate": 8.54459579711137e-05, + "loss": 0.8543, + "step": 11382 + }, + { + "epoch": 0.7712582153262416, + "grad_norm": 6.217097759246826, + "learning_rate": 8.544458895201588e-05, + "loss": 1.0393, + "step": 11383 + }, + { + "epoch": 0.7713259705942137, + "grad_norm": 5.0891828536987305, + "learning_rate": 8.544321993291807e-05, + "loss": 0.7836, + "step": 11384 + }, + { + "epoch": 0.7713937258621858, + "grad_norm": 4.892578125, + "learning_rate": 8.544185091382025e-05, + "loss": 0.8143, + "step": 11385 + }, + { + "epoch": 0.7714614811301579, + "grad_norm": 5.4404168128967285, + "learning_rate": 8.544048189472243e-05, + "loss": 0.8841, + "step": 11386 + }, + { + "epoch": 0.77152923639813, + "grad_norm": 6.725840091705322, + "learning_rate": 8.543911287562461e-05, + "loss": 0.9308, + "step": 11387 + }, + { + "epoch": 0.7715969916661021, + "grad_norm": 4.622491359710693, + "learning_rate": 8.543774385652679e-05, + "loss": 0.5187, + "step": 11388 + }, + { + "epoch": 0.7716647469340742, + "grad_norm": 5.4316182136535645, + "learning_rate": 8.543637483742899e-05, + "loss": 0.7526, + "step": 11389 + }, + { + "epoch": 0.7717325022020463, + "grad_norm": 6.089479923248291, + "learning_rate": 8.543500581833117e-05, + "loss": 0.8785, + "step": 11390 + }, + { + "epoch": 0.7718002574700183, + "grad_norm": 7.127860069274902, + "learning_rate": 8.543363679923335e-05, + "loss": 1.0935, + "step": 11391 + }, + { + "epoch": 0.7718680127379903, + "grad_norm": 6.1316399574279785, + "learning_rate": 8.543226778013553e-05, + "loss": 0.7755, + "step": 11392 + }, + { + "epoch": 0.7719357680059624, + "grad_norm": 5.672332286834717, + "learning_rate": 8.543089876103772e-05, + "loss": 0.8099, + "step": 11393 + }, + { + "epoch": 0.7720035232739345, + "grad_norm": 7.249478816986084, + "learning_rate": 8.54295297419399e-05, + "loss": 0.7333, + "step": 11394 + }, + { + "epoch": 0.7720712785419066, + "grad_norm": 6.270430088043213, + "learning_rate": 8.542816072284208e-05, + "loss": 0.7662, + "step": 11395 + }, + { + "epoch": 0.7721390338098787, + "grad_norm": 5.953348636627197, + "learning_rate": 8.542679170374428e-05, + "loss": 0.7991, + "step": 11396 + }, + { + "epoch": 0.7722067890778508, + "grad_norm": 9.44437313079834, + "learning_rate": 8.542542268464646e-05, + "loss": 0.8061, + "step": 11397 + }, + { + "epoch": 0.7722745443458229, + "grad_norm": 6.52545690536499, + "learning_rate": 8.542405366554864e-05, + "loss": 1.0734, + "step": 11398 + }, + { + "epoch": 0.772342299613795, + "grad_norm": 6.201472282409668, + "learning_rate": 8.542268464645083e-05, + "loss": 1.0589, + "step": 11399 + }, + { + "epoch": 0.7724100548817671, + "grad_norm": 6.648289680480957, + "learning_rate": 8.542131562735301e-05, + "loss": 0.6268, + "step": 11400 + }, + { + "epoch": 0.7724778101497392, + "grad_norm": 6.1713433265686035, + "learning_rate": 8.541994660825519e-05, + "loss": 0.8162, + "step": 11401 + }, + { + "epoch": 0.7725455654177112, + "grad_norm": 6.964521884918213, + "learning_rate": 8.541857758915738e-05, + "loss": 0.8242, + "step": 11402 + }, + { + "epoch": 0.7726133206856833, + "grad_norm": 6.025942802429199, + "learning_rate": 8.541720857005956e-05, + "loss": 0.748, + "step": 11403 + }, + { + "epoch": 0.7726810759536554, + "grad_norm": 5.319873809814453, + "learning_rate": 8.541583955096175e-05, + "loss": 0.7437, + "step": 11404 + }, + { + "epoch": 0.7727488312216275, + "grad_norm": 7.246333599090576, + "learning_rate": 8.541447053186393e-05, + "loss": 0.7307, + "step": 11405 + }, + { + "epoch": 0.7728165864895996, + "grad_norm": 8.031798362731934, + "learning_rate": 8.54131015127661e-05, + "loss": 1.2193, + "step": 11406 + }, + { + "epoch": 0.7728843417575717, + "grad_norm": 5.135364532470703, + "learning_rate": 8.54117324936683e-05, + "loss": 0.7107, + "step": 11407 + }, + { + "epoch": 0.7729520970255437, + "grad_norm": 8.0211820602417, + "learning_rate": 8.541036347457048e-05, + "loss": 0.7786, + "step": 11408 + }, + { + "epoch": 0.7730198522935158, + "grad_norm": 7.091327667236328, + "learning_rate": 8.540899445547266e-05, + "loss": 0.7232, + "step": 11409 + }, + { + "epoch": 0.7730876075614879, + "grad_norm": 7.18704891204834, + "learning_rate": 8.540762543637484e-05, + "loss": 1.0204, + "step": 11410 + }, + { + "epoch": 0.77315536282946, + "grad_norm": 5.484143257141113, + "learning_rate": 8.540625641727702e-05, + "loss": 0.9076, + "step": 11411 + }, + { + "epoch": 0.773223118097432, + "grad_norm": 8.248093605041504, + "learning_rate": 8.540488739817921e-05, + "loss": 0.8882, + "step": 11412 + }, + { + "epoch": 0.7732908733654041, + "grad_norm": 5.386202335357666, + "learning_rate": 8.54035183790814e-05, + "loss": 0.8552, + "step": 11413 + }, + { + "epoch": 0.7733586286333762, + "grad_norm": 5.230656623840332, + "learning_rate": 8.540214935998358e-05, + "loss": 0.6101, + "step": 11414 + }, + { + "epoch": 0.7734263839013483, + "grad_norm": 6.001654624938965, + "learning_rate": 8.540078034088576e-05, + "loss": 0.905, + "step": 11415 + }, + { + "epoch": 0.7734941391693204, + "grad_norm": 5.581679344177246, + "learning_rate": 8.539941132178795e-05, + "loss": 0.8538, + "step": 11416 + }, + { + "epoch": 0.7735618944372925, + "grad_norm": 5.559355735778809, + "learning_rate": 8.539804230269013e-05, + "loss": 0.8776, + "step": 11417 + }, + { + "epoch": 0.7736296497052646, + "grad_norm": 4.970002174377441, + "learning_rate": 8.539667328359231e-05, + "loss": 0.6365, + "step": 11418 + }, + { + "epoch": 0.7736974049732367, + "grad_norm": 6.487758159637451, + "learning_rate": 8.539530426449449e-05, + "loss": 1.0137, + "step": 11419 + }, + { + "epoch": 0.7737651602412088, + "grad_norm": 6.612691402435303, + "learning_rate": 8.539393524539667e-05, + "loss": 0.9539, + "step": 11420 + }, + { + "epoch": 0.7738329155091809, + "grad_norm": 9.316730499267578, + "learning_rate": 8.539256622629887e-05, + "loss": 0.761, + "step": 11421 + }, + { + "epoch": 0.773900670777153, + "grad_norm": 6.791004180908203, + "learning_rate": 8.539119720720105e-05, + "loss": 0.9281, + "step": 11422 + }, + { + "epoch": 0.7739684260451251, + "grad_norm": 6.122042179107666, + "learning_rate": 8.538982818810323e-05, + "loss": 0.8082, + "step": 11423 + }, + { + "epoch": 0.774036181313097, + "grad_norm": 5.878390312194824, + "learning_rate": 8.53884591690054e-05, + "loss": 0.7934, + "step": 11424 + }, + { + "epoch": 0.7741039365810691, + "grad_norm": 5.959982395172119, + "learning_rate": 8.53870901499076e-05, + "loss": 0.7919, + "step": 11425 + }, + { + "epoch": 0.7741716918490412, + "grad_norm": 7.212625980377197, + "learning_rate": 8.538572113080978e-05, + "loss": 0.6346, + "step": 11426 + }, + { + "epoch": 0.7742394471170133, + "grad_norm": 5.554856300354004, + "learning_rate": 8.538435211171196e-05, + "loss": 0.757, + "step": 11427 + }, + { + "epoch": 0.7743072023849854, + "grad_norm": 7.405214309692383, + "learning_rate": 8.538298309261414e-05, + "loss": 0.8239, + "step": 11428 + }, + { + "epoch": 0.7743749576529575, + "grad_norm": 5.671273231506348, + "learning_rate": 8.538161407351632e-05, + "loss": 0.6547, + "step": 11429 + }, + { + "epoch": 0.7744427129209296, + "grad_norm": 5.7590436935424805, + "learning_rate": 8.538024505441852e-05, + "loss": 0.6526, + "step": 11430 + }, + { + "epoch": 0.7745104681889017, + "grad_norm": 5.3632283210754395, + "learning_rate": 8.53788760353207e-05, + "loss": 0.6368, + "step": 11431 + }, + { + "epoch": 0.7745782234568738, + "grad_norm": 5.514582633972168, + "learning_rate": 8.537750701622288e-05, + "loss": 0.8242, + "step": 11432 + }, + { + "epoch": 0.7746459787248459, + "grad_norm": 6.291776657104492, + "learning_rate": 8.537613799712506e-05, + "loss": 0.7272, + "step": 11433 + }, + { + "epoch": 0.774713733992818, + "grad_norm": 6.716917991638184, + "learning_rate": 8.537476897802725e-05, + "loss": 0.658, + "step": 11434 + }, + { + "epoch": 0.77478148926079, + "grad_norm": 6.479708194732666, + "learning_rate": 8.537339995892943e-05, + "loss": 1.0441, + "step": 11435 + }, + { + "epoch": 0.7748492445287621, + "grad_norm": 7.607060432434082, + "learning_rate": 8.537203093983161e-05, + "loss": 0.849, + "step": 11436 + }, + { + "epoch": 0.7749169997967342, + "grad_norm": 5.455706596374512, + "learning_rate": 8.537066192073379e-05, + "loss": 0.8267, + "step": 11437 + }, + { + "epoch": 0.7749847550647063, + "grad_norm": 6.571371078491211, + "learning_rate": 8.536929290163597e-05, + "loss": 0.9552, + "step": 11438 + }, + { + "epoch": 0.7750525103326784, + "grad_norm": 7.029003143310547, + "learning_rate": 8.536792388253817e-05, + "loss": 0.7186, + "step": 11439 + }, + { + "epoch": 0.7751202656006505, + "grad_norm": 6.117072582244873, + "learning_rate": 8.536655486344035e-05, + "loss": 0.6946, + "step": 11440 + }, + { + "epoch": 0.7751880208686225, + "grad_norm": 6.154125690460205, + "learning_rate": 8.536518584434253e-05, + "loss": 0.6694, + "step": 11441 + }, + { + "epoch": 0.7752557761365946, + "grad_norm": 5.271462917327881, + "learning_rate": 8.536381682524472e-05, + "loss": 0.6984, + "step": 11442 + }, + { + "epoch": 0.7753235314045667, + "grad_norm": 7.075232982635498, + "learning_rate": 8.53624478061469e-05, + "loss": 0.8232, + "step": 11443 + }, + { + "epoch": 0.7753912866725388, + "grad_norm": 8.889443397521973, + "learning_rate": 8.536107878704908e-05, + "loss": 0.9988, + "step": 11444 + }, + { + "epoch": 0.7754590419405109, + "grad_norm": 6.270964622497559, + "learning_rate": 8.535970976795127e-05, + "loss": 0.8725, + "step": 11445 + }, + { + "epoch": 0.7755267972084829, + "grad_norm": 6.906674861907959, + "learning_rate": 8.535834074885345e-05, + "loss": 0.8918, + "step": 11446 + }, + { + "epoch": 0.775594552476455, + "grad_norm": 7.697040557861328, + "learning_rate": 8.535697172975564e-05, + "loss": 0.8286, + "step": 11447 + }, + { + "epoch": 0.7756623077444271, + "grad_norm": 6.875545024871826, + "learning_rate": 8.535560271065783e-05, + "loss": 0.9181, + "step": 11448 + }, + { + "epoch": 0.7757300630123992, + "grad_norm": 6.059504985809326, + "learning_rate": 8.535423369156001e-05, + "loss": 0.7066, + "step": 11449 + }, + { + "epoch": 0.7757978182803713, + "grad_norm": 6.182839870452881, + "learning_rate": 8.535286467246219e-05, + "loss": 0.7472, + "step": 11450 + }, + { + "epoch": 0.7758655735483434, + "grad_norm": 6.222153663635254, + "learning_rate": 8.535149565336437e-05, + "loss": 0.7874, + "step": 11451 + }, + { + "epoch": 0.7759333288163155, + "grad_norm": 5.410420894622803, + "learning_rate": 8.535012663426655e-05, + "loss": 0.6223, + "step": 11452 + }, + { + "epoch": 0.7760010840842876, + "grad_norm": 6.875960350036621, + "learning_rate": 8.534875761516874e-05, + "loss": 0.668, + "step": 11453 + }, + { + "epoch": 0.7760688393522597, + "grad_norm": 6.519979953765869, + "learning_rate": 8.534738859607092e-05, + "loss": 0.6604, + "step": 11454 + }, + { + "epoch": 0.7761365946202318, + "grad_norm": 5.24510383605957, + "learning_rate": 8.53460195769731e-05, + "loss": 0.685, + "step": 11455 + }, + { + "epoch": 0.7762043498882039, + "grad_norm": 5.399496555328369, + "learning_rate": 8.534465055787529e-05, + "loss": 0.6585, + "step": 11456 + }, + { + "epoch": 0.7762721051561758, + "grad_norm": 6.070941925048828, + "learning_rate": 8.534328153877748e-05, + "loss": 0.6592, + "step": 11457 + }, + { + "epoch": 0.7763398604241479, + "grad_norm": 6.634647846221924, + "learning_rate": 8.534191251967966e-05, + "loss": 0.6772, + "step": 11458 + }, + { + "epoch": 0.77640761569212, + "grad_norm": 4.490801811218262, + "learning_rate": 8.534054350058184e-05, + "loss": 0.6787, + "step": 11459 + }, + { + "epoch": 0.7764753709600921, + "grad_norm": 9.601375579833984, + "learning_rate": 8.533917448148402e-05, + "loss": 0.7703, + "step": 11460 + }, + { + "epoch": 0.7765431262280642, + "grad_norm": 8.57113265991211, + "learning_rate": 8.53378054623862e-05, + "loss": 0.6969, + "step": 11461 + }, + { + "epoch": 0.7766108814960363, + "grad_norm": 5.441416263580322, + "learning_rate": 8.53364364432884e-05, + "loss": 0.7218, + "step": 11462 + }, + { + "epoch": 0.7766786367640084, + "grad_norm": 5.4659857749938965, + "learning_rate": 8.533506742419057e-05, + "loss": 0.7042, + "step": 11463 + }, + { + "epoch": 0.7767463920319805, + "grad_norm": 5.655581951141357, + "learning_rate": 8.533369840509276e-05, + "loss": 0.8021, + "step": 11464 + }, + { + "epoch": 0.7768141472999526, + "grad_norm": 5.530257225036621, + "learning_rate": 8.533232938599494e-05, + "loss": 0.8872, + "step": 11465 + }, + { + "epoch": 0.7768819025679247, + "grad_norm": 6.074810981750488, + "learning_rate": 8.533096036689712e-05, + "loss": 0.7353, + "step": 11466 + }, + { + "epoch": 0.7769496578358968, + "grad_norm": 5.74225378036499, + "learning_rate": 8.532959134779931e-05, + "loss": 0.9214, + "step": 11467 + }, + { + "epoch": 0.7770174131038688, + "grad_norm": 8.271156311035156, + "learning_rate": 8.532822232870149e-05, + "loss": 0.9913, + "step": 11468 + }, + { + "epoch": 0.7770851683718409, + "grad_norm": 5.124596118927002, + "learning_rate": 8.532685330960367e-05, + "loss": 0.6335, + "step": 11469 + }, + { + "epoch": 0.777152923639813, + "grad_norm": 6.918234348297119, + "learning_rate": 8.532548429050585e-05, + "loss": 0.9158, + "step": 11470 + }, + { + "epoch": 0.7772206789077851, + "grad_norm": 5.966104984283447, + "learning_rate": 8.532411527140804e-05, + "loss": 0.9534, + "step": 11471 + }, + { + "epoch": 0.7772884341757572, + "grad_norm": 6.7147536277771, + "learning_rate": 8.532274625231023e-05, + "loss": 0.7466, + "step": 11472 + }, + { + "epoch": 0.7773561894437292, + "grad_norm": 6.448988914489746, + "learning_rate": 8.53213772332124e-05, + "loss": 1.0013, + "step": 11473 + }, + { + "epoch": 0.7774239447117013, + "grad_norm": 4.238838195800781, + "learning_rate": 8.532000821411459e-05, + "loss": 0.6666, + "step": 11474 + }, + { + "epoch": 0.7774916999796734, + "grad_norm": 6.865900993347168, + "learning_rate": 8.531863919501677e-05, + "loss": 0.7966, + "step": 11475 + }, + { + "epoch": 0.7775594552476455, + "grad_norm": 5.297105312347412, + "learning_rate": 8.531727017591896e-05, + "loss": 0.8075, + "step": 11476 + }, + { + "epoch": 0.7776272105156176, + "grad_norm": 6.865470886230469, + "learning_rate": 8.531590115682114e-05, + "loss": 0.7531, + "step": 11477 + }, + { + "epoch": 0.7776949657835897, + "grad_norm": 12.770363807678223, + "learning_rate": 8.531453213772332e-05, + "loss": 0.765, + "step": 11478 + }, + { + "epoch": 0.7777627210515617, + "grad_norm": 5.952977180480957, + "learning_rate": 8.53131631186255e-05, + "loss": 0.6781, + "step": 11479 + }, + { + "epoch": 0.7778304763195338, + "grad_norm": 6.006490230560303, + "learning_rate": 8.53117940995277e-05, + "loss": 0.7149, + "step": 11480 + }, + { + "epoch": 0.7778982315875059, + "grad_norm": 5.353832244873047, + "learning_rate": 8.531042508042988e-05, + "loss": 0.5234, + "step": 11481 + }, + { + "epoch": 0.777965986855478, + "grad_norm": 7.221078395843506, + "learning_rate": 8.530905606133206e-05, + "loss": 0.8062, + "step": 11482 + }, + { + "epoch": 0.7780337421234501, + "grad_norm": 5.661943435668945, + "learning_rate": 8.530768704223424e-05, + "loss": 0.7662, + "step": 11483 + }, + { + "epoch": 0.7781014973914222, + "grad_norm": 7.68919038772583, + "learning_rate": 8.530631802313642e-05, + "loss": 0.9475, + "step": 11484 + }, + { + "epoch": 0.7781692526593943, + "grad_norm": 6.683036804199219, + "learning_rate": 8.530494900403861e-05, + "loss": 0.8855, + "step": 11485 + }, + { + "epoch": 0.7782370079273664, + "grad_norm": 6.948663234710693, + "learning_rate": 8.530357998494079e-05, + "loss": 0.8388, + "step": 11486 + }, + { + "epoch": 0.7783047631953385, + "grad_norm": 6.295228004455566, + "learning_rate": 8.530221096584297e-05, + "loss": 0.8796, + "step": 11487 + }, + { + "epoch": 0.7783725184633106, + "grad_norm": 5.598083019256592, + "learning_rate": 8.530084194674516e-05, + "loss": 1.1443, + "step": 11488 + }, + { + "epoch": 0.7784402737312827, + "grad_norm": 6.99757194519043, + "learning_rate": 8.529947292764735e-05, + "loss": 0.9209, + "step": 11489 + }, + { + "epoch": 0.7785080289992546, + "grad_norm": 4.9374189376831055, + "learning_rate": 8.529810390854953e-05, + "loss": 0.7056, + "step": 11490 + }, + { + "epoch": 0.7785757842672267, + "grad_norm": 5.831791877746582, + "learning_rate": 8.529673488945172e-05, + "loss": 0.9647, + "step": 11491 + }, + { + "epoch": 0.7786435395351988, + "grad_norm": 5.7701945304870605, + "learning_rate": 8.52953658703539e-05, + "loss": 0.5656, + "step": 11492 + }, + { + "epoch": 0.7787112948031709, + "grad_norm": 6.0373945236206055, + "learning_rate": 8.529399685125608e-05, + "loss": 0.7656, + "step": 11493 + }, + { + "epoch": 0.778779050071143, + "grad_norm": 5.0245184898376465, + "learning_rate": 8.529262783215827e-05, + "loss": 0.6954, + "step": 11494 + }, + { + "epoch": 0.7788468053391151, + "grad_norm": 5.802394390106201, + "learning_rate": 8.529125881306045e-05, + "loss": 0.7926, + "step": 11495 + }, + { + "epoch": 0.7789145606070872, + "grad_norm": 5.6064252853393555, + "learning_rate": 8.528988979396263e-05, + "loss": 0.6536, + "step": 11496 + }, + { + "epoch": 0.7789823158750593, + "grad_norm": 6.606382846832275, + "learning_rate": 8.528852077486481e-05, + "loss": 0.8506, + "step": 11497 + }, + { + "epoch": 0.7790500711430314, + "grad_norm": 5.858041286468506, + "learning_rate": 8.5287151755767e-05, + "loss": 0.9821, + "step": 11498 + }, + { + "epoch": 0.7791178264110035, + "grad_norm": 7.08564567565918, + "learning_rate": 8.528578273666919e-05, + "loss": 0.8554, + "step": 11499 + }, + { + "epoch": 0.7791855816789756, + "grad_norm": 8.084799766540527, + "learning_rate": 8.528441371757137e-05, + "loss": 0.7015, + "step": 11500 + }, + { + "epoch": 0.7792533369469477, + "grad_norm": 5.143333911895752, + "learning_rate": 8.528304469847355e-05, + "loss": 0.815, + "step": 11501 + }, + { + "epoch": 0.7793210922149197, + "grad_norm": 6.680109977722168, + "learning_rate": 8.528167567937573e-05, + "loss": 0.8661, + "step": 11502 + }, + { + "epoch": 0.7793888474828918, + "grad_norm": 6.3008341789245605, + "learning_rate": 8.528030666027792e-05, + "loss": 0.9275, + "step": 11503 + }, + { + "epoch": 0.7794566027508639, + "grad_norm": 6.935196876525879, + "learning_rate": 8.52789376411801e-05, + "loss": 0.6915, + "step": 11504 + }, + { + "epoch": 0.779524358018836, + "grad_norm": 5.888092517852783, + "learning_rate": 8.527756862208228e-05, + "loss": 0.6035, + "step": 11505 + }, + { + "epoch": 0.779592113286808, + "grad_norm": 6.207918167114258, + "learning_rate": 8.527619960298447e-05, + "loss": 0.6619, + "step": 11506 + }, + { + "epoch": 0.7796598685547801, + "grad_norm": 6.820822238922119, + "learning_rate": 8.527483058388665e-05, + "loss": 0.9905, + "step": 11507 + }, + { + "epoch": 0.7797276238227522, + "grad_norm": 6.440364837646484, + "learning_rate": 8.527346156478884e-05, + "loss": 1.0333, + "step": 11508 + }, + { + "epoch": 0.7797953790907243, + "grad_norm": 6.21598482131958, + "learning_rate": 8.527209254569102e-05, + "loss": 0.6562, + "step": 11509 + }, + { + "epoch": 0.7798631343586964, + "grad_norm": 8.483455657958984, + "learning_rate": 8.52707235265932e-05, + "loss": 0.6767, + "step": 11510 + }, + { + "epoch": 0.7799308896266685, + "grad_norm": 6.934547424316406, + "learning_rate": 8.526935450749538e-05, + "loss": 0.8354, + "step": 11511 + }, + { + "epoch": 0.7799986448946405, + "grad_norm": 8.640101432800293, + "learning_rate": 8.526798548839757e-05, + "loss": 0.9985, + "step": 11512 + }, + { + "epoch": 0.7800664001626126, + "grad_norm": 5.356478214263916, + "learning_rate": 8.526661646929975e-05, + "loss": 0.8524, + "step": 11513 + }, + { + "epoch": 0.7801341554305847, + "grad_norm": 6.551882743835449, + "learning_rate": 8.526524745020193e-05, + "loss": 0.9502, + "step": 11514 + }, + { + "epoch": 0.7802019106985568, + "grad_norm": 6.971212387084961, + "learning_rate": 8.526387843110412e-05, + "loss": 0.909, + "step": 11515 + }, + { + "epoch": 0.7802696659665289, + "grad_norm": 7.040616035461426, + "learning_rate": 8.52625094120063e-05, + "loss": 1.1058, + "step": 11516 + }, + { + "epoch": 0.780337421234501, + "grad_norm": 5.899086952209473, + "learning_rate": 8.526114039290849e-05, + "loss": 0.7611, + "step": 11517 + }, + { + "epoch": 0.7804051765024731, + "grad_norm": 6.242020606994629, + "learning_rate": 8.525977137381067e-05, + "loss": 0.7059, + "step": 11518 + }, + { + "epoch": 0.7804729317704452, + "grad_norm": 6.418991565704346, + "learning_rate": 8.525840235471285e-05, + "loss": 0.8625, + "step": 11519 + }, + { + "epoch": 0.7805406870384173, + "grad_norm": 4.487674236297607, + "learning_rate": 8.525703333561503e-05, + "loss": 0.7343, + "step": 11520 + }, + { + "epoch": 0.7806084423063894, + "grad_norm": 5.526739597320557, + "learning_rate": 8.525566431651721e-05, + "loss": 0.8331, + "step": 11521 + }, + { + "epoch": 0.7806761975743614, + "grad_norm": 7.298591136932373, + "learning_rate": 8.52542952974194e-05, + "loss": 1.0112, + "step": 11522 + }, + { + "epoch": 0.7807439528423334, + "grad_norm": 7.665398120880127, + "learning_rate": 8.525292627832159e-05, + "loss": 0.8248, + "step": 11523 + }, + { + "epoch": 0.7808117081103055, + "grad_norm": 5.981564044952393, + "learning_rate": 8.525155725922377e-05, + "loss": 0.8557, + "step": 11524 + }, + { + "epoch": 0.7808794633782776, + "grad_norm": 6.268359184265137, + "learning_rate": 8.525018824012595e-05, + "loss": 0.5943, + "step": 11525 + }, + { + "epoch": 0.7809472186462497, + "grad_norm": 5.51030969619751, + "learning_rate": 8.524881922102814e-05, + "loss": 0.7657, + "step": 11526 + }, + { + "epoch": 0.7810149739142218, + "grad_norm": 6.035849094390869, + "learning_rate": 8.524745020193032e-05, + "loss": 0.7501, + "step": 11527 + }, + { + "epoch": 0.7810827291821939, + "grad_norm": 5.455019474029541, + "learning_rate": 8.52460811828325e-05, + "loss": 0.6627, + "step": 11528 + }, + { + "epoch": 0.781150484450166, + "grad_norm": 5.731950283050537, + "learning_rate": 8.524471216373468e-05, + "loss": 0.7456, + "step": 11529 + }, + { + "epoch": 0.7812182397181381, + "grad_norm": 6.446641445159912, + "learning_rate": 8.524334314463686e-05, + "loss": 0.668, + "step": 11530 + }, + { + "epoch": 0.7812859949861102, + "grad_norm": 6.093938827514648, + "learning_rate": 8.524197412553905e-05, + "loss": 0.7248, + "step": 11531 + }, + { + "epoch": 0.7813537502540823, + "grad_norm": 5.524089813232422, + "learning_rate": 8.524060510644124e-05, + "loss": 0.8411, + "step": 11532 + }, + { + "epoch": 0.7814215055220544, + "grad_norm": 6.203104496002197, + "learning_rate": 8.523923608734342e-05, + "loss": 0.8882, + "step": 11533 + }, + { + "epoch": 0.7814892607900265, + "grad_norm": 5.39666748046875, + "learning_rate": 8.52378670682456e-05, + "loss": 0.8111, + "step": 11534 + }, + { + "epoch": 0.7815570160579985, + "grad_norm": 5.305294990539551, + "learning_rate": 8.523649804914779e-05, + "loss": 0.6809, + "step": 11535 + }, + { + "epoch": 0.7816247713259706, + "grad_norm": 5.251042366027832, + "learning_rate": 8.523512903004997e-05, + "loss": 0.8533, + "step": 11536 + }, + { + "epoch": 0.7816925265939427, + "grad_norm": 7.346649169921875, + "learning_rate": 8.523376001095215e-05, + "loss": 0.857, + "step": 11537 + }, + { + "epoch": 0.7817602818619148, + "grad_norm": 6.78896427154541, + "learning_rate": 8.523239099185434e-05, + "loss": 0.77, + "step": 11538 + }, + { + "epoch": 0.7818280371298868, + "grad_norm": 6.815920352935791, + "learning_rate": 8.523102197275652e-05, + "loss": 0.6838, + "step": 11539 + }, + { + "epoch": 0.7818957923978589, + "grad_norm": 5.649730682373047, + "learning_rate": 8.52296529536587e-05, + "loss": 0.833, + "step": 11540 + }, + { + "epoch": 0.781963547665831, + "grad_norm": 7.253706932067871, + "learning_rate": 8.52282839345609e-05, + "loss": 0.8077, + "step": 11541 + }, + { + "epoch": 0.7820313029338031, + "grad_norm": 4.436539649963379, + "learning_rate": 8.522691491546308e-05, + "loss": 0.6202, + "step": 11542 + }, + { + "epoch": 0.7820990582017752, + "grad_norm": 5.917550563812256, + "learning_rate": 8.522554589636526e-05, + "loss": 0.82, + "step": 11543 + }, + { + "epoch": 0.7821668134697473, + "grad_norm": 5.576394081115723, + "learning_rate": 8.522417687726744e-05, + "loss": 0.6957, + "step": 11544 + }, + { + "epoch": 0.7822345687377193, + "grad_norm": 7.591821670532227, + "learning_rate": 8.522280785816963e-05, + "loss": 0.9026, + "step": 11545 + }, + { + "epoch": 0.7823023240056914, + "grad_norm": 8.16065502166748, + "learning_rate": 8.522143883907181e-05, + "loss": 1.0274, + "step": 11546 + }, + { + "epoch": 0.7823700792736635, + "grad_norm": 9.164224624633789, + "learning_rate": 8.5220069819974e-05, + "loss": 0.7935, + "step": 11547 + }, + { + "epoch": 0.7824378345416356, + "grad_norm": 5.825256824493408, + "learning_rate": 8.521870080087617e-05, + "loss": 0.8263, + "step": 11548 + }, + { + "epoch": 0.7825055898096077, + "grad_norm": 7.315790176391602, + "learning_rate": 8.521733178177837e-05, + "loss": 0.9282, + "step": 11549 + }, + { + "epoch": 0.7825733450775798, + "grad_norm": 6.677700519561768, + "learning_rate": 8.521596276268055e-05, + "loss": 1.011, + "step": 11550 + }, + { + "epoch": 0.7826411003455519, + "grad_norm": 7.136357307434082, + "learning_rate": 8.521459374358273e-05, + "loss": 0.7062, + "step": 11551 + }, + { + "epoch": 0.782708855613524, + "grad_norm": 6.2828168869018555, + "learning_rate": 8.521322472448491e-05, + "loss": 0.6732, + "step": 11552 + }, + { + "epoch": 0.7827766108814961, + "grad_norm": 6.77725887298584, + "learning_rate": 8.521185570538709e-05, + "loss": 0.8692, + "step": 11553 + }, + { + "epoch": 0.7828443661494682, + "grad_norm": 6.065176963806152, + "learning_rate": 8.521048668628928e-05, + "loss": 0.7296, + "step": 11554 + }, + { + "epoch": 0.7829121214174402, + "grad_norm": 6.393941402435303, + "learning_rate": 8.520911766719146e-05, + "loss": 0.7382, + "step": 11555 + }, + { + "epoch": 0.7829798766854122, + "grad_norm": 5.480118751525879, + "learning_rate": 8.520774864809364e-05, + "loss": 1.0189, + "step": 11556 + }, + { + "epoch": 0.7830476319533843, + "grad_norm": 5.192300796508789, + "learning_rate": 8.520637962899583e-05, + "loss": 0.7421, + "step": 11557 + }, + { + "epoch": 0.7831153872213564, + "grad_norm": 4.797957897186279, + "learning_rate": 8.520501060989802e-05, + "loss": 0.5936, + "step": 11558 + }, + { + "epoch": 0.7831831424893285, + "grad_norm": 5.605537414550781, + "learning_rate": 8.52036415908002e-05, + "loss": 0.6871, + "step": 11559 + }, + { + "epoch": 0.7832508977573006, + "grad_norm": 4.938405513763428, + "learning_rate": 8.520227257170238e-05, + "loss": 0.7197, + "step": 11560 + }, + { + "epoch": 0.7833186530252727, + "grad_norm": 5.017856597900391, + "learning_rate": 8.520090355260456e-05, + "loss": 0.9076, + "step": 11561 + }, + { + "epoch": 0.7833864082932448, + "grad_norm": 4.1797285079956055, + "learning_rate": 8.519953453350674e-05, + "loss": 0.7057, + "step": 11562 + }, + { + "epoch": 0.7834541635612169, + "grad_norm": 6.84492301940918, + "learning_rate": 8.519816551440893e-05, + "loss": 0.7758, + "step": 11563 + }, + { + "epoch": 0.783521918829189, + "grad_norm": 5.999660968780518, + "learning_rate": 8.519679649531111e-05, + "loss": 0.7158, + "step": 11564 + }, + { + "epoch": 0.7835896740971611, + "grad_norm": 6.352871894836426, + "learning_rate": 8.51954274762133e-05, + "loss": 0.8369, + "step": 11565 + }, + { + "epoch": 0.7836574293651332, + "grad_norm": 7.0241923332214355, + "learning_rate": 8.519405845711548e-05, + "loss": 0.8714, + "step": 11566 + }, + { + "epoch": 0.7837251846331053, + "grad_norm": 6.709263324737549, + "learning_rate": 8.519268943801767e-05, + "loss": 0.7421, + "step": 11567 + }, + { + "epoch": 0.7837929399010773, + "grad_norm": 6.393216133117676, + "learning_rate": 8.519132041891985e-05, + "loss": 0.7692, + "step": 11568 + }, + { + "epoch": 0.7838606951690494, + "grad_norm": 5.52962589263916, + "learning_rate": 8.518995139982203e-05, + "loss": 0.8172, + "step": 11569 + }, + { + "epoch": 0.7839284504370215, + "grad_norm": 5.78670072555542, + "learning_rate": 8.518858238072421e-05, + "loss": 0.8083, + "step": 11570 + }, + { + "epoch": 0.7839962057049935, + "grad_norm": 6.439652919769287, + "learning_rate": 8.518721336162639e-05, + "loss": 0.8051, + "step": 11571 + }, + { + "epoch": 0.7840639609729656, + "grad_norm": 7.955451011657715, + "learning_rate": 8.518584434252858e-05, + "loss": 0.8791, + "step": 11572 + }, + { + "epoch": 0.7841317162409377, + "grad_norm": 9.079748153686523, + "learning_rate": 8.518447532343076e-05, + "loss": 0.8787, + "step": 11573 + }, + { + "epoch": 0.7841994715089098, + "grad_norm": 7.1748366355896, + "learning_rate": 8.518310630433295e-05, + "loss": 0.9591, + "step": 11574 + }, + { + "epoch": 0.7842672267768819, + "grad_norm": 4.792274475097656, + "learning_rate": 8.518173728523513e-05, + "loss": 0.6126, + "step": 11575 + }, + { + "epoch": 0.784334982044854, + "grad_norm": 8.38217544555664, + "learning_rate": 8.51803682661373e-05, + "loss": 1.1548, + "step": 11576 + }, + { + "epoch": 0.7844027373128261, + "grad_norm": 8.210965156555176, + "learning_rate": 8.51789992470395e-05, + "loss": 0.879, + "step": 11577 + }, + { + "epoch": 0.7844704925807982, + "grad_norm": 6.52365255355835, + "learning_rate": 8.517763022794168e-05, + "loss": 1.1208, + "step": 11578 + }, + { + "epoch": 0.7845382478487702, + "grad_norm": 8.54340934753418, + "learning_rate": 8.517626120884386e-05, + "loss": 0.8938, + "step": 11579 + }, + { + "epoch": 0.7846060031167423, + "grad_norm": 4.85086727142334, + "learning_rate": 8.517489218974604e-05, + "loss": 0.5674, + "step": 11580 + }, + { + "epoch": 0.7846737583847144, + "grad_norm": 5.40913200378418, + "learning_rate": 8.517352317064823e-05, + "loss": 0.6512, + "step": 11581 + }, + { + "epoch": 0.7847415136526865, + "grad_norm": 5.471793174743652, + "learning_rate": 8.517215415155041e-05, + "loss": 0.6023, + "step": 11582 + }, + { + "epoch": 0.7848092689206586, + "grad_norm": 6.162317752838135, + "learning_rate": 8.51707851324526e-05, + "loss": 0.6595, + "step": 11583 + }, + { + "epoch": 0.7848770241886307, + "grad_norm": 4.846681594848633, + "learning_rate": 8.516941611335479e-05, + "loss": 0.7467, + "step": 11584 + }, + { + "epoch": 0.7849447794566028, + "grad_norm": 6.785501956939697, + "learning_rate": 8.516804709425697e-05, + "loss": 0.9231, + "step": 11585 + }, + { + "epoch": 0.7850125347245749, + "grad_norm": 5.503603935241699, + "learning_rate": 8.516667807515915e-05, + "loss": 0.6025, + "step": 11586 + }, + { + "epoch": 0.785080289992547, + "grad_norm": 6.446911334991455, + "learning_rate": 8.516530905606134e-05, + "loss": 0.8209, + "step": 11587 + }, + { + "epoch": 0.785148045260519, + "grad_norm": 6.04754114151001, + "learning_rate": 8.516394003696352e-05, + "loss": 0.7663, + "step": 11588 + }, + { + "epoch": 0.785215800528491, + "grad_norm": 8.239023208618164, + "learning_rate": 8.51625710178657e-05, + "loss": 1.06, + "step": 11589 + }, + { + "epoch": 0.7852835557964631, + "grad_norm": 5.434386253356934, + "learning_rate": 8.51612019987679e-05, + "loss": 0.6585, + "step": 11590 + }, + { + "epoch": 0.7853513110644352, + "grad_norm": 7.99647855758667, + "learning_rate": 8.515983297967008e-05, + "loss": 1.3215, + "step": 11591 + }, + { + "epoch": 0.7854190663324073, + "grad_norm": 6.199028491973877, + "learning_rate": 8.515846396057226e-05, + "loss": 0.7499, + "step": 11592 + }, + { + "epoch": 0.7854868216003794, + "grad_norm": 5.438849449157715, + "learning_rate": 8.515709494147444e-05, + "loss": 1.0288, + "step": 11593 + }, + { + "epoch": 0.7855545768683515, + "grad_norm": 6.933850288391113, + "learning_rate": 8.515572592237662e-05, + "loss": 0.8108, + "step": 11594 + }, + { + "epoch": 0.7856223321363236, + "grad_norm": 7.720444679260254, + "learning_rate": 8.515435690327881e-05, + "loss": 0.7747, + "step": 11595 + }, + { + "epoch": 0.7856900874042957, + "grad_norm": 5.852978706359863, + "learning_rate": 8.5152987884181e-05, + "loss": 0.9843, + "step": 11596 + }, + { + "epoch": 0.7857578426722678, + "grad_norm": 5.039735794067383, + "learning_rate": 8.515161886508317e-05, + "loss": 0.8137, + "step": 11597 + }, + { + "epoch": 0.7858255979402399, + "grad_norm": 5.9263529777526855, + "learning_rate": 8.515024984598535e-05, + "loss": 0.7445, + "step": 11598 + }, + { + "epoch": 0.785893353208212, + "grad_norm": 7.095968246459961, + "learning_rate": 8.514888082688753e-05, + "loss": 0.9883, + "step": 11599 + }, + { + "epoch": 0.7859611084761841, + "grad_norm": 7.839550018310547, + "learning_rate": 8.514751180778973e-05, + "loss": 0.8652, + "step": 11600 + }, + { + "epoch": 0.7860288637441561, + "grad_norm": 6.799243927001953, + "learning_rate": 8.514614278869191e-05, + "loss": 0.9413, + "step": 11601 + }, + { + "epoch": 0.7860966190121282, + "grad_norm": 5.903491020202637, + "learning_rate": 8.514477376959409e-05, + "loss": 0.7944, + "step": 11602 + }, + { + "epoch": 0.7861643742801003, + "grad_norm": 5.674754619598389, + "learning_rate": 8.514340475049627e-05, + "loss": 0.639, + "step": 11603 + }, + { + "epoch": 0.7862321295480723, + "grad_norm": 6.983302593231201, + "learning_rate": 8.514203573139846e-05, + "loss": 1.0028, + "step": 11604 + }, + { + "epoch": 0.7862998848160444, + "grad_norm": 6.731539726257324, + "learning_rate": 8.514066671230064e-05, + "loss": 0.8127, + "step": 11605 + }, + { + "epoch": 0.7863676400840165, + "grad_norm": 5.513458251953125, + "learning_rate": 8.513929769320282e-05, + "loss": 0.8266, + "step": 11606 + }, + { + "epoch": 0.7864353953519886, + "grad_norm": 5.592813968658447, + "learning_rate": 8.5137928674105e-05, + "loss": 0.8913, + "step": 11607 + }, + { + "epoch": 0.7865031506199607, + "grad_norm": 6.7772603034973145, + "learning_rate": 8.513655965500719e-05, + "loss": 0.9662, + "step": 11608 + }, + { + "epoch": 0.7865709058879328, + "grad_norm": 6.447290420532227, + "learning_rate": 8.513519063590938e-05, + "loss": 0.8599, + "step": 11609 + }, + { + "epoch": 0.7866386611559049, + "grad_norm": 5.317022800445557, + "learning_rate": 8.513382161681156e-05, + "loss": 0.7079, + "step": 11610 + }, + { + "epoch": 0.786706416423877, + "grad_norm": 6.355508327484131, + "learning_rate": 8.513245259771374e-05, + "loss": 0.783, + "step": 11611 + }, + { + "epoch": 0.786774171691849, + "grad_norm": 6.773859977722168, + "learning_rate": 8.513108357861592e-05, + "loss": 0.9146, + "step": 11612 + }, + { + "epoch": 0.7868419269598211, + "grad_norm": 6.100228786468506, + "learning_rate": 8.512971455951811e-05, + "loss": 0.8626, + "step": 11613 + }, + { + "epoch": 0.7869096822277932, + "grad_norm": 4.6573004722595215, + "learning_rate": 8.51283455404203e-05, + "loss": 0.4562, + "step": 11614 + }, + { + "epoch": 0.7869774374957653, + "grad_norm": 5.893970966339111, + "learning_rate": 8.512697652132247e-05, + "loss": 0.9437, + "step": 11615 + }, + { + "epoch": 0.7870451927637374, + "grad_norm": 6.907064914703369, + "learning_rate": 8.512560750222465e-05, + "loss": 0.7592, + "step": 11616 + }, + { + "epoch": 0.7871129480317095, + "grad_norm": 5.383542060852051, + "learning_rate": 8.512423848312684e-05, + "loss": 0.8429, + "step": 11617 + }, + { + "epoch": 0.7871807032996816, + "grad_norm": 4.651388168334961, + "learning_rate": 8.512286946402903e-05, + "loss": 0.7447, + "step": 11618 + }, + { + "epoch": 0.7872484585676537, + "grad_norm": 7.3148417472839355, + "learning_rate": 8.512150044493121e-05, + "loss": 0.905, + "step": 11619 + }, + { + "epoch": 0.7873162138356257, + "grad_norm": 6.7118353843688965, + "learning_rate": 8.512013142583339e-05, + "loss": 0.8032, + "step": 11620 + }, + { + "epoch": 0.7873839691035978, + "grad_norm": 7.792928695678711, + "learning_rate": 8.511876240673557e-05, + "loss": 0.7426, + "step": 11621 + }, + { + "epoch": 0.7874517243715699, + "grad_norm": 7.6762614250183105, + "learning_rate": 8.511739338763776e-05, + "loss": 0.7188, + "step": 11622 + }, + { + "epoch": 0.7875194796395419, + "grad_norm": 4.8527607917785645, + "learning_rate": 8.511602436853994e-05, + "loss": 0.4442, + "step": 11623 + }, + { + "epoch": 0.787587234907514, + "grad_norm": 4.9824957847595215, + "learning_rate": 8.511465534944212e-05, + "loss": 0.7234, + "step": 11624 + }, + { + "epoch": 0.7876549901754861, + "grad_norm": 6.530035495758057, + "learning_rate": 8.51132863303443e-05, + "loss": 0.8197, + "step": 11625 + }, + { + "epoch": 0.7877227454434582, + "grad_norm": 6.13261079788208, + "learning_rate": 8.511191731124649e-05, + "loss": 0.922, + "step": 11626 + }, + { + "epoch": 0.7877905007114303, + "grad_norm": 7.777993202209473, + "learning_rate": 8.511054829214868e-05, + "loss": 0.7275, + "step": 11627 + }, + { + "epoch": 0.7878582559794024, + "grad_norm": 5.4319281578063965, + "learning_rate": 8.510917927305086e-05, + "loss": 0.6158, + "step": 11628 + }, + { + "epoch": 0.7879260112473745, + "grad_norm": 7.162403583526611, + "learning_rate": 8.510781025395304e-05, + "loss": 0.6238, + "step": 11629 + }, + { + "epoch": 0.7879937665153466, + "grad_norm": 7.020395755767822, + "learning_rate": 8.510644123485523e-05, + "loss": 1.0743, + "step": 11630 + }, + { + "epoch": 0.7880615217833187, + "grad_norm": 7.972400665283203, + "learning_rate": 8.510507221575741e-05, + "loss": 0.8562, + "step": 11631 + }, + { + "epoch": 0.7881292770512908, + "grad_norm": 7.600156307220459, + "learning_rate": 8.51037031966596e-05, + "loss": 0.9075, + "step": 11632 + }, + { + "epoch": 0.7881970323192629, + "grad_norm": 6.119834899902344, + "learning_rate": 8.510233417756179e-05, + "loss": 1.0396, + "step": 11633 + }, + { + "epoch": 0.788264787587235, + "grad_norm": 6.254417896270752, + "learning_rate": 8.510096515846397e-05, + "loss": 0.8023, + "step": 11634 + }, + { + "epoch": 0.788332542855207, + "grad_norm": 7.466210842132568, + "learning_rate": 8.509959613936615e-05, + "loss": 0.9744, + "step": 11635 + }, + { + "epoch": 0.788400298123179, + "grad_norm": 6.635150909423828, + "learning_rate": 8.509822712026834e-05, + "loss": 0.8722, + "step": 11636 + }, + { + "epoch": 0.7884680533911511, + "grad_norm": 6.4519362449646, + "learning_rate": 8.509685810117052e-05, + "loss": 0.7355, + "step": 11637 + }, + { + "epoch": 0.7885358086591232, + "grad_norm": 6.042237758636475, + "learning_rate": 8.50954890820727e-05, + "loss": 0.7921, + "step": 11638 + }, + { + "epoch": 0.7886035639270953, + "grad_norm": 6.3814616203308105, + "learning_rate": 8.509412006297488e-05, + "loss": 0.8843, + "step": 11639 + }, + { + "epoch": 0.7886713191950674, + "grad_norm": 7.140514373779297, + "learning_rate": 8.509275104387706e-05, + "loss": 0.8913, + "step": 11640 + }, + { + "epoch": 0.7887390744630395, + "grad_norm": 5.622684478759766, + "learning_rate": 8.509138202477926e-05, + "loss": 0.8172, + "step": 11641 + }, + { + "epoch": 0.7888068297310116, + "grad_norm": 5.332169532775879, + "learning_rate": 8.509001300568144e-05, + "loss": 0.7585, + "step": 11642 + }, + { + "epoch": 0.7888745849989837, + "grad_norm": 5.908132553100586, + "learning_rate": 8.508864398658362e-05, + "loss": 0.9236, + "step": 11643 + }, + { + "epoch": 0.7889423402669558, + "grad_norm": 5.472514629364014, + "learning_rate": 8.50872749674858e-05, + "loss": 0.8923, + "step": 11644 + }, + { + "epoch": 0.7890100955349278, + "grad_norm": 6.155751705169678, + "learning_rate": 8.5085905948388e-05, + "loss": 0.9054, + "step": 11645 + }, + { + "epoch": 0.7890778508028999, + "grad_norm": 5.493722438812256, + "learning_rate": 8.508453692929017e-05, + "loss": 0.7483, + "step": 11646 + }, + { + "epoch": 0.789145606070872, + "grad_norm": 7.661139965057373, + "learning_rate": 8.508316791019235e-05, + "loss": 0.7433, + "step": 11647 + }, + { + "epoch": 0.7892133613388441, + "grad_norm": 6.037461757659912, + "learning_rate": 8.508179889109453e-05, + "loss": 0.933, + "step": 11648 + }, + { + "epoch": 0.7892811166068162, + "grad_norm": 5.072673797607422, + "learning_rate": 8.508042987199671e-05, + "loss": 0.8115, + "step": 11649 + }, + { + "epoch": 0.7893488718747883, + "grad_norm": 5.48813009262085, + "learning_rate": 8.507906085289891e-05, + "loss": 0.7581, + "step": 11650 + }, + { + "epoch": 0.7894166271427604, + "grad_norm": 7.160548686981201, + "learning_rate": 8.507769183380109e-05, + "loss": 0.8798, + "step": 11651 + }, + { + "epoch": 0.7894843824107325, + "grad_norm": 5.741982936859131, + "learning_rate": 8.507632281470327e-05, + "loss": 0.7952, + "step": 11652 + }, + { + "epoch": 0.7895521376787045, + "grad_norm": 6.27877950668335, + "learning_rate": 8.507495379560545e-05, + "loss": 0.9341, + "step": 11653 + }, + { + "epoch": 0.7896198929466766, + "grad_norm": 6.65634822845459, + "learning_rate": 8.507358477650763e-05, + "loss": 1.0704, + "step": 11654 + }, + { + "epoch": 0.7896876482146487, + "grad_norm": 4.268200397491455, + "learning_rate": 8.507221575740982e-05, + "loss": 0.6692, + "step": 11655 + }, + { + "epoch": 0.7897554034826207, + "grad_norm": 6.084795951843262, + "learning_rate": 8.5070846738312e-05, + "loss": 0.9024, + "step": 11656 + }, + { + "epoch": 0.7898231587505928, + "grad_norm": 6.263867378234863, + "learning_rate": 8.506947771921418e-05, + "loss": 0.7795, + "step": 11657 + }, + { + "epoch": 0.7898909140185649, + "grad_norm": 5.393991947174072, + "learning_rate": 8.506810870011636e-05, + "loss": 0.7585, + "step": 11658 + }, + { + "epoch": 0.789958669286537, + "grad_norm": 5.708785057067871, + "learning_rate": 8.506673968101856e-05, + "loss": 0.9771, + "step": 11659 + }, + { + "epoch": 0.7900264245545091, + "grad_norm": 5.779507637023926, + "learning_rate": 8.506537066192074e-05, + "loss": 0.7698, + "step": 11660 + }, + { + "epoch": 0.7900941798224812, + "grad_norm": 5.451954364776611, + "learning_rate": 8.506400164282292e-05, + "loss": 0.8186, + "step": 11661 + }, + { + "epoch": 0.7901619350904533, + "grad_norm": 6.849606513977051, + "learning_rate": 8.50626326237251e-05, + "loss": 0.5914, + "step": 11662 + }, + { + "epoch": 0.7902296903584254, + "grad_norm": 6.583099365234375, + "learning_rate": 8.506126360462728e-05, + "loss": 0.7948, + "step": 11663 + }, + { + "epoch": 0.7902974456263975, + "grad_norm": 5.769179821014404, + "learning_rate": 8.505989458552947e-05, + "loss": 0.7482, + "step": 11664 + }, + { + "epoch": 0.7903652008943696, + "grad_norm": 6.720520496368408, + "learning_rate": 8.505852556643165e-05, + "loss": 0.8047, + "step": 11665 + }, + { + "epoch": 0.7904329561623417, + "grad_norm": 5.712310791015625, + "learning_rate": 8.505715654733383e-05, + "loss": 0.744, + "step": 11666 + }, + { + "epoch": 0.7905007114303138, + "grad_norm": 5.840827941894531, + "learning_rate": 8.505578752823601e-05, + "loss": 0.6534, + "step": 11667 + }, + { + "epoch": 0.7905684666982858, + "grad_norm": 5.9518351554870605, + "learning_rate": 8.505441850913821e-05, + "loss": 0.9304, + "step": 11668 + }, + { + "epoch": 0.7906362219662578, + "grad_norm": 6.9761528968811035, + "learning_rate": 8.505304949004039e-05, + "loss": 0.8118, + "step": 11669 + }, + { + "epoch": 0.7907039772342299, + "grad_norm": 5.4346842765808105, + "learning_rate": 8.505168047094257e-05, + "loss": 0.7391, + "step": 11670 + }, + { + "epoch": 0.790771732502202, + "grad_norm": 6.628547191619873, + "learning_rate": 8.505031145184475e-05, + "loss": 0.5708, + "step": 11671 + }, + { + "epoch": 0.7908394877701741, + "grad_norm": 7.494357109069824, + "learning_rate": 8.504894243274693e-05, + "loss": 0.8687, + "step": 11672 + }, + { + "epoch": 0.7909072430381462, + "grad_norm": 4.743175029754639, + "learning_rate": 8.504757341364912e-05, + "loss": 0.7287, + "step": 11673 + }, + { + "epoch": 0.7909749983061183, + "grad_norm": 6.512623310089111, + "learning_rate": 8.50462043945513e-05, + "loss": 0.9047, + "step": 11674 + }, + { + "epoch": 0.7910427535740904, + "grad_norm": 7.142396926879883, + "learning_rate": 8.504483537545348e-05, + "loss": 0.7012, + "step": 11675 + }, + { + "epoch": 0.7911105088420625, + "grad_norm": 5.33573055267334, + "learning_rate": 8.504346635635568e-05, + "loss": 0.6606, + "step": 11676 + }, + { + "epoch": 0.7911782641100346, + "grad_norm": 6.757303714752197, + "learning_rate": 8.504209733725786e-05, + "loss": 0.8881, + "step": 11677 + }, + { + "epoch": 0.7912460193780066, + "grad_norm": 6.163415908813477, + "learning_rate": 8.504072831816004e-05, + "loss": 0.8034, + "step": 11678 + }, + { + "epoch": 0.7913137746459787, + "grad_norm": 5.699300765991211, + "learning_rate": 8.503935929906223e-05, + "loss": 0.9178, + "step": 11679 + }, + { + "epoch": 0.7913815299139508, + "grad_norm": 5.064122676849365, + "learning_rate": 8.503799027996441e-05, + "loss": 0.8219, + "step": 11680 + }, + { + "epoch": 0.7914492851819229, + "grad_norm": 5.224904537200928, + "learning_rate": 8.50366212608666e-05, + "loss": 0.7344, + "step": 11681 + }, + { + "epoch": 0.791517040449895, + "grad_norm": 4.929017543792725, + "learning_rate": 8.503525224176879e-05, + "loss": 0.7713, + "step": 11682 + }, + { + "epoch": 0.7915847957178671, + "grad_norm": 6.1249918937683105, + "learning_rate": 8.503388322267097e-05, + "loss": 0.8553, + "step": 11683 + }, + { + "epoch": 0.7916525509858392, + "grad_norm": 5.393836975097656, + "learning_rate": 8.503251420357315e-05, + "loss": 0.6989, + "step": 11684 + }, + { + "epoch": 0.7917203062538112, + "grad_norm": 6.436197757720947, + "learning_rate": 8.503114518447533e-05, + "loss": 1.0481, + "step": 11685 + }, + { + "epoch": 0.7917880615217833, + "grad_norm": 5.787166118621826, + "learning_rate": 8.502977616537751e-05, + "loss": 0.9203, + "step": 11686 + }, + { + "epoch": 0.7918558167897554, + "grad_norm": 5.892452716827393, + "learning_rate": 8.50284071462797e-05, + "loss": 0.9302, + "step": 11687 + }, + { + "epoch": 0.7919235720577275, + "grad_norm": 7.195859909057617, + "learning_rate": 8.502703812718188e-05, + "loss": 0.9175, + "step": 11688 + }, + { + "epoch": 0.7919913273256995, + "grad_norm": 6.343230247497559, + "learning_rate": 8.502566910808406e-05, + "loss": 0.7292, + "step": 11689 + }, + { + "epoch": 0.7920590825936716, + "grad_norm": 6.082936763763428, + "learning_rate": 8.502430008898624e-05, + "loss": 0.8826, + "step": 11690 + }, + { + "epoch": 0.7921268378616437, + "grad_norm": 5.115715980529785, + "learning_rate": 8.502293106988844e-05, + "loss": 0.6697, + "step": 11691 + }, + { + "epoch": 0.7921945931296158, + "grad_norm": 6.96610164642334, + "learning_rate": 8.502156205079062e-05, + "loss": 0.9914, + "step": 11692 + }, + { + "epoch": 0.7922623483975879, + "grad_norm": 7.457095146179199, + "learning_rate": 8.50201930316928e-05, + "loss": 0.7839, + "step": 11693 + }, + { + "epoch": 0.79233010366556, + "grad_norm": 7.025375843048096, + "learning_rate": 8.501882401259498e-05, + "loss": 1.0105, + "step": 11694 + }, + { + "epoch": 0.7923978589335321, + "grad_norm": 5.951284408569336, + "learning_rate": 8.501745499349716e-05, + "loss": 0.6889, + "step": 11695 + }, + { + "epoch": 0.7924656142015042, + "grad_norm": 5.650534629821777, + "learning_rate": 8.501608597439935e-05, + "loss": 0.7848, + "step": 11696 + }, + { + "epoch": 0.7925333694694763, + "grad_norm": 5.552826881408691, + "learning_rate": 8.501471695530153e-05, + "loss": 0.6213, + "step": 11697 + }, + { + "epoch": 0.7926011247374484, + "grad_norm": 5.661040306091309, + "learning_rate": 8.501334793620371e-05, + "loss": 0.5737, + "step": 11698 + }, + { + "epoch": 0.7926688800054205, + "grad_norm": 5.965978622436523, + "learning_rate": 8.50119789171059e-05, + "loss": 0.8278, + "step": 11699 + }, + { + "epoch": 0.7927366352733926, + "grad_norm": 5.854281902313232, + "learning_rate": 8.501060989800809e-05, + "loss": 0.831, + "step": 11700 + }, + { + "epoch": 0.7928043905413646, + "grad_norm": 6.782879829406738, + "learning_rate": 8.500924087891027e-05, + "loss": 0.8058, + "step": 11701 + }, + { + "epoch": 0.7928721458093366, + "grad_norm": 6.70954704284668, + "learning_rate": 8.500787185981245e-05, + "loss": 0.6725, + "step": 11702 + }, + { + "epoch": 0.7929399010773087, + "grad_norm": 5.684144020080566, + "learning_rate": 8.500650284071463e-05, + "loss": 0.8527, + "step": 11703 + }, + { + "epoch": 0.7930076563452808, + "grad_norm": 5.054625511169434, + "learning_rate": 8.500513382161681e-05, + "loss": 0.8579, + "step": 11704 + }, + { + "epoch": 0.7930754116132529, + "grad_norm": 6.6226887702941895, + "learning_rate": 8.5003764802519e-05, + "loss": 0.9449, + "step": 11705 + }, + { + "epoch": 0.793143166881225, + "grad_norm": 6.413197040557861, + "learning_rate": 8.500239578342118e-05, + "loss": 0.646, + "step": 11706 + }, + { + "epoch": 0.7932109221491971, + "grad_norm": 4.308269500732422, + "learning_rate": 8.500102676432336e-05, + "loss": 0.6515, + "step": 11707 + }, + { + "epoch": 0.7932786774171692, + "grad_norm": 5.270321846008301, + "learning_rate": 8.499965774522554e-05, + "loss": 0.6476, + "step": 11708 + }, + { + "epoch": 0.7933464326851413, + "grad_norm": 6.440086364746094, + "learning_rate": 8.499828872612772e-05, + "loss": 0.8295, + "step": 11709 + }, + { + "epoch": 0.7934141879531134, + "grad_norm": 6.078103065490723, + "learning_rate": 8.499691970702992e-05, + "loss": 0.8173, + "step": 11710 + }, + { + "epoch": 0.7934819432210855, + "grad_norm": 4.972411155700684, + "learning_rate": 8.49955506879321e-05, + "loss": 0.6696, + "step": 11711 + }, + { + "epoch": 0.7935496984890575, + "grad_norm": 5.865548610687256, + "learning_rate": 8.499418166883428e-05, + "loss": 0.7337, + "step": 11712 + }, + { + "epoch": 0.7936174537570296, + "grad_norm": 6.041398048400879, + "learning_rate": 8.499281264973646e-05, + "loss": 0.8586, + "step": 11713 + }, + { + "epoch": 0.7936852090250017, + "grad_norm": 8.569158554077148, + "learning_rate": 8.499144363063865e-05, + "loss": 0.9027, + "step": 11714 + }, + { + "epoch": 0.7937529642929738, + "grad_norm": 6.1328020095825195, + "learning_rate": 8.499007461154083e-05, + "loss": 0.7019, + "step": 11715 + }, + { + "epoch": 0.7938207195609459, + "grad_norm": 5.501848220825195, + "learning_rate": 8.498870559244301e-05, + "loss": 0.8227, + "step": 11716 + }, + { + "epoch": 0.793888474828918, + "grad_norm": 8.566222190856934, + "learning_rate": 8.49873365733452e-05, + "loss": 0.8148, + "step": 11717 + }, + { + "epoch": 0.79395623009689, + "grad_norm": 10.178439140319824, + "learning_rate": 8.498596755424737e-05, + "loss": 1.0213, + "step": 11718 + }, + { + "epoch": 0.7940239853648621, + "grad_norm": 7.6949310302734375, + "learning_rate": 8.498459853514957e-05, + "loss": 0.7581, + "step": 11719 + }, + { + "epoch": 0.7940917406328342, + "grad_norm": 5.988775730133057, + "learning_rate": 8.498322951605175e-05, + "loss": 0.7738, + "step": 11720 + }, + { + "epoch": 0.7941594959008063, + "grad_norm": 5.81986665725708, + "learning_rate": 8.498186049695393e-05, + "loss": 0.7798, + "step": 11721 + }, + { + "epoch": 0.7942272511687783, + "grad_norm": 5.84140157699585, + "learning_rate": 8.498049147785612e-05, + "loss": 0.8285, + "step": 11722 + }, + { + "epoch": 0.7942950064367504, + "grad_norm": 4.675839900970459, + "learning_rate": 8.49791224587583e-05, + "loss": 0.6703, + "step": 11723 + }, + { + "epoch": 0.7943627617047225, + "grad_norm": 5.857149124145508, + "learning_rate": 8.497775343966048e-05, + "loss": 0.8915, + "step": 11724 + }, + { + "epoch": 0.7944305169726946, + "grad_norm": 10.169214248657227, + "learning_rate": 8.497638442056268e-05, + "loss": 0.8606, + "step": 11725 + }, + { + "epoch": 0.7944982722406667, + "grad_norm": 5.2627058029174805, + "learning_rate": 8.497501540146486e-05, + "loss": 0.7705, + "step": 11726 + }, + { + "epoch": 0.7945660275086388, + "grad_norm": 7.3032355308532715, + "learning_rate": 8.497364638236704e-05, + "loss": 1.0857, + "step": 11727 + }, + { + "epoch": 0.7946337827766109, + "grad_norm": 7.499369144439697, + "learning_rate": 8.497227736326923e-05, + "loss": 0.8655, + "step": 11728 + }, + { + "epoch": 0.794701538044583, + "grad_norm": 6.25002908706665, + "learning_rate": 8.497090834417141e-05, + "loss": 0.7271, + "step": 11729 + }, + { + "epoch": 0.7947692933125551, + "grad_norm": 6.5583882331848145, + "learning_rate": 8.496953932507359e-05, + "loss": 0.771, + "step": 11730 + }, + { + "epoch": 0.7948370485805272, + "grad_norm": 6.737629413604736, + "learning_rate": 8.496817030597577e-05, + "loss": 0.8865, + "step": 11731 + }, + { + "epoch": 0.7949048038484993, + "grad_norm": 7.376718044281006, + "learning_rate": 8.496680128687795e-05, + "loss": 0.8407, + "step": 11732 + }, + { + "epoch": 0.7949725591164714, + "grad_norm": 7.077400207519531, + "learning_rate": 8.496543226778015e-05, + "loss": 0.8339, + "step": 11733 + }, + { + "epoch": 0.7950403143844433, + "grad_norm": 6.768246173858643, + "learning_rate": 8.496406324868233e-05, + "loss": 0.8654, + "step": 11734 + }, + { + "epoch": 0.7951080696524154, + "grad_norm": 5.732030391693115, + "learning_rate": 8.496269422958451e-05, + "loss": 0.9901, + "step": 11735 + }, + { + "epoch": 0.7951758249203875, + "grad_norm": 7.728169918060303, + "learning_rate": 8.496132521048669e-05, + "loss": 1.0275, + "step": 11736 + }, + { + "epoch": 0.7952435801883596, + "grad_norm": 4.867015838623047, + "learning_rate": 8.495995619138888e-05, + "loss": 0.7226, + "step": 11737 + }, + { + "epoch": 0.7953113354563317, + "grad_norm": 6.376992225646973, + "learning_rate": 8.495858717229106e-05, + "loss": 0.8501, + "step": 11738 + }, + { + "epoch": 0.7953790907243038, + "grad_norm": 6.90419340133667, + "learning_rate": 8.495721815319324e-05, + "loss": 0.5617, + "step": 11739 + }, + { + "epoch": 0.7954468459922759, + "grad_norm": 7.2407450675964355, + "learning_rate": 8.495584913409542e-05, + "loss": 0.8224, + "step": 11740 + }, + { + "epoch": 0.795514601260248, + "grad_norm": 5.104798316955566, + "learning_rate": 8.49544801149976e-05, + "loss": 0.761, + "step": 11741 + }, + { + "epoch": 0.7955823565282201, + "grad_norm": 5.49780797958374, + "learning_rate": 8.49531110958998e-05, + "loss": 0.6668, + "step": 11742 + }, + { + "epoch": 0.7956501117961922, + "grad_norm": 4.905865669250488, + "learning_rate": 8.495174207680198e-05, + "loss": 0.7844, + "step": 11743 + }, + { + "epoch": 0.7957178670641643, + "grad_norm": 6.6370391845703125, + "learning_rate": 8.495037305770416e-05, + "loss": 1.0102, + "step": 11744 + }, + { + "epoch": 0.7957856223321363, + "grad_norm": 6.089507579803467, + "learning_rate": 8.494900403860634e-05, + "loss": 0.8679, + "step": 11745 + }, + { + "epoch": 0.7958533776001084, + "grad_norm": 5.490042209625244, + "learning_rate": 8.494763501950853e-05, + "loss": 0.7481, + "step": 11746 + }, + { + "epoch": 0.7959211328680805, + "grad_norm": 5.85631799697876, + "learning_rate": 8.494626600041071e-05, + "loss": 0.7926, + "step": 11747 + }, + { + "epoch": 0.7959888881360526, + "grad_norm": 6.780750274658203, + "learning_rate": 8.49448969813129e-05, + "loss": 0.5902, + "step": 11748 + }, + { + "epoch": 0.7960566434040247, + "grad_norm": 6.697319984436035, + "learning_rate": 8.494352796221507e-05, + "loss": 0.835, + "step": 11749 + }, + { + "epoch": 0.7961243986719968, + "grad_norm": 6.056969165802002, + "learning_rate": 8.494215894311725e-05, + "loss": 0.8725, + "step": 11750 + }, + { + "epoch": 0.7961921539399688, + "grad_norm": 9.009320259094238, + "learning_rate": 8.494078992401945e-05, + "loss": 0.8103, + "step": 11751 + }, + { + "epoch": 0.7962599092079409, + "grad_norm": 7.897705554962158, + "learning_rate": 8.493942090492163e-05, + "loss": 0.7913, + "step": 11752 + }, + { + "epoch": 0.796327664475913, + "grad_norm": 5.677363872528076, + "learning_rate": 8.493805188582381e-05, + "loss": 0.6845, + "step": 11753 + }, + { + "epoch": 0.7963954197438851, + "grad_norm": 6.340780735015869, + "learning_rate": 8.493668286672599e-05, + "loss": 0.6519, + "step": 11754 + }, + { + "epoch": 0.7964631750118571, + "grad_norm": 5.341319561004639, + "learning_rate": 8.493531384762818e-05, + "loss": 0.7437, + "step": 11755 + }, + { + "epoch": 0.7965309302798292, + "grad_norm": 5.388099193572998, + "learning_rate": 8.493394482853036e-05, + "loss": 0.6879, + "step": 11756 + }, + { + "epoch": 0.7965986855478013, + "grad_norm": 6.492825031280518, + "learning_rate": 8.493257580943254e-05, + "loss": 0.8562, + "step": 11757 + }, + { + "epoch": 0.7966664408157734, + "grad_norm": 6.953293323516846, + "learning_rate": 8.493120679033472e-05, + "loss": 0.826, + "step": 11758 + }, + { + "epoch": 0.7967341960837455, + "grad_norm": 5.61476993560791, + "learning_rate": 8.49298377712369e-05, + "loss": 0.7604, + "step": 11759 + }, + { + "epoch": 0.7968019513517176, + "grad_norm": 7.391281604766846, + "learning_rate": 8.49284687521391e-05, + "loss": 0.9048, + "step": 11760 + }, + { + "epoch": 0.7968697066196897, + "grad_norm": 5.455954074859619, + "learning_rate": 8.492709973304128e-05, + "loss": 0.8972, + "step": 11761 + }, + { + "epoch": 0.7969374618876618, + "grad_norm": 10.402437210083008, + "learning_rate": 8.492573071394346e-05, + "loss": 0.8567, + "step": 11762 + }, + { + "epoch": 0.7970052171556339, + "grad_norm": 7.704123497009277, + "learning_rate": 8.492436169484564e-05, + "loss": 0.8926, + "step": 11763 + }, + { + "epoch": 0.797072972423606, + "grad_norm": 10.505579948425293, + "learning_rate": 8.492299267574782e-05, + "loss": 0.7695, + "step": 11764 + }, + { + "epoch": 0.7971407276915781, + "grad_norm": 8.137372016906738, + "learning_rate": 8.492162365665001e-05, + "loss": 0.814, + "step": 11765 + }, + { + "epoch": 0.7972084829595502, + "grad_norm": 7.7297587394714355, + "learning_rate": 8.49202546375522e-05, + "loss": 0.9038, + "step": 11766 + }, + { + "epoch": 0.7972762382275221, + "grad_norm": 5.5629048347473145, + "learning_rate": 8.491888561845437e-05, + "loss": 0.5899, + "step": 11767 + }, + { + "epoch": 0.7973439934954942, + "grad_norm": 9.380592346191406, + "learning_rate": 8.491751659935655e-05, + "loss": 0.8522, + "step": 11768 + }, + { + "epoch": 0.7974117487634663, + "grad_norm": 5.808100700378418, + "learning_rate": 8.491614758025875e-05, + "loss": 0.923, + "step": 11769 + }, + { + "epoch": 0.7974795040314384, + "grad_norm": 6.091804504394531, + "learning_rate": 8.491477856116093e-05, + "loss": 0.8042, + "step": 11770 + }, + { + "epoch": 0.7975472592994105, + "grad_norm": 6.675506114959717, + "learning_rate": 8.491340954206312e-05, + "loss": 1.0141, + "step": 11771 + }, + { + "epoch": 0.7976150145673826, + "grad_norm": 7.39612340927124, + "learning_rate": 8.49120405229653e-05, + "loss": 0.9027, + "step": 11772 + }, + { + "epoch": 0.7976827698353547, + "grad_norm": 7.455977439880371, + "learning_rate": 8.491067150386748e-05, + "loss": 0.7705, + "step": 11773 + }, + { + "epoch": 0.7977505251033268, + "grad_norm": 6.535350322723389, + "learning_rate": 8.490930248476968e-05, + "loss": 0.7648, + "step": 11774 + }, + { + "epoch": 0.7978182803712989, + "grad_norm": 8.165600776672363, + "learning_rate": 8.490793346567186e-05, + "loss": 0.9073, + "step": 11775 + }, + { + "epoch": 0.797886035639271, + "grad_norm": 5.424903869628906, + "learning_rate": 8.490656444657404e-05, + "loss": 0.7656, + "step": 11776 + }, + { + "epoch": 0.797953790907243, + "grad_norm": 6.095433235168457, + "learning_rate": 8.490519542747622e-05, + "loss": 0.5978, + "step": 11777 + }, + { + "epoch": 0.7980215461752151, + "grad_norm": 4.698237419128418, + "learning_rate": 8.490382640837841e-05, + "loss": 0.689, + "step": 11778 + }, + { + "epoch": 0.7980893014431872, + "grad_norm": 5.7089691162109375, + "learning_rate": 8.490245738928059e-05, + "loss": 0.719, + "step": 11779 + }, + { + "epoch": 0.7981570567111593, + "grad_norm": 4.3311848640441895, + "learning_rate": 8.490108837018277e-05, + "loss": 0.8484, + "step": 11780 + }, + { + "epoch": 0.7982248119791314, + "grad_norm": 5.905073165893555, + "learning_rate": 8.489971935108495e-05, + "loss": 0.7679, + "step": 11781 + }, + { + "epoch": 0.7982925672471035, + "grad_norm": 7.171839714050293, + "learning_rate": 8.489835033198713e-05, + "loss": 0.6716, + "step": 11782 + }, + { + "epoch": 0.7983603225150755, + "grad_norm": 8.157262802124023, + "learning_rate": 8.489698131288933e-05, + "loss": 0.7821, + "step": 11783 + }, + { + "epoch": 0.7984280777830476, + "grad_norm": 5.551645278930664, + "learning_rate": 8.489561229379151e-05, + "loss": 0.5619, + "step": 11784 + }, + { + "epoch": 0.7984958330510197, + "grad_norm": 6.759763240814209, + "learning_rate": 8.489424327469369e-05, + "loss": 0.9527, + "step": 11785 + }, + { + "epoch": 0.7985635883189918, + "grad_norm": 5.427465915679932, + "learning_rate": 8.489287425559587e-05, + "loss": 0.5603, + "step": 11786 + }, + { + "epoch": 0.7986313435869639, + "grad_norm": 5.436514377593994, + "learning_rate": 8.489150523649805e-05, + "loss": 0.6464, + "step": 11787 + }, + { + "epoch": 0.798699098854936, + "grad_norm": 5.558623313903809, + "learning_rate": 8.489013621740024e-05, + "loss": 0.7742, + "step": 11788 + }, + { + "epoch": 0.798766854122908, + "grad_norm": 6.840981960296631, + "learning_rate": 8.488876719830242e-05, + "loss": 0.9908, + "step": 11789 + }, + { + "epoch": 0.7988346093908801, + "grad_norm": 6.066009044647217, + "learning_rate": 8.48873981792046e-05, + "loss": 0.7004, + "step": 11790 + }, + { + "epoch": 0.7989023646588522, + "grad_norm": 5.6321024894714355, + "learning_rate": 8.488602916010678e-05, + "loss": 0.7741, + "step": 11791 + }, + { + "epoch": 0.7989701199268243, + "grad_norm": 6.667470932006836, + "learning_rate": 8.488466014100898e-05, + "loss": 0.9582, + "step": 11792 + }, + { + "epoch": 0.7990378751947964, + "grad_norm": 5.693469524383545, + "learning_rate": 8.488329112191116e-05, + "loss": 0.8868, + "step": 11793 + }, + { + "epoch": 0.7991056304627685, + "grad_norm": 4.270181179046631, + "learning_rate": 8.488192210281334e-05, + "loss": 0.5002, + "step": 11794 + }, + { + "epoch": 0.7991733857307406, + "grad_norm": 6.748290061950684, + "learning_rate": 8.488055308371552e-05, + "loss": 0.9551, + "step": 11795 + }, + { + "epoch": 0.7992411409987127, + "grad_norm": 6.088380336761475, + "learning_rate": 8.48791840646177e-05, + "loss": 0.735, + "step": 11796 + }, + { + "epoch": 0.7993088962666848, + "grad_norm": 7.105894565582275, + "learning_rate": 8.487781504551989e-05, + "loss": 0.8939, + "step": 11797 + }, + { + "epoch": 0.7993766515346569, + "grad_norm": 5.425162315368652, + "learning_rate": 8.487644602642207e-05, + "loss": 0.612, + "step": 11798 + }, + { + "epoch": 0.799444406802629, + "grad_norm": 6.511662006378174, + "learning_rate": 8.487507700732425e-05, + "loss": 0.6636, + "step": 11799 + }, + { + "epoch": 0.7995121620706009, + "grad_norm": 6.1298828125, + "learning_rate": 8.487370798822643e-05, + "loss": 0.7554, + "step": 11800 + }, + { + "epoch": 0.799579917338573, + "grad_norm": 8.377950668334961, + "learning_rate": 8.487233896912863e-05, + "loss": 0.9965, + "step": 11801 + }, + { + "epoch": 0.7996476726065451, + "grad_norm": 5.563699245452881, + "learning_rate": 8.487096995003081e-05, + "loss": 0.9248, + "step": 11802 + }, + { + "epoch": 0.7997154278745172, + "grad_norm": 5.939857006072998, + "learning_rate": 8.486960093093299e-05, + "loss": 0.822, + "step": 11803 + }, + { + "epoch": 0.7997831831424893, + "grad_norm": 7.076834678649902, + "learning_rate": 8.486823191183517e-05, + "loss": 0.9286, + "step": 11804 + }, + { + "epoch": 0.7998509384104614, + "grad_norm": 5.123041152954102, + "learning_rate": 8.486686289273735e-05, + "loss": 0.5529, + "step": 11805 + }, + { + "epoch": 0.7999186936784335, + "grad_norm": 7.112299919128418, + "learning_rate": 8.486549387363954e-05, + "loss": 1.0114, + "step": 11806 + }, + { + "epoch": 0.7999864489464056, + "grad_norm": 5.081669807434082, + "learning_rate": 8.486412485454172e-05, + "loss": 0.6341, + "step": 11807 + }, + { + "epoch": 0.8000542042143777, + "grad_norm": 4.5973310470581055, + "learning_rate": 8.48627558354439e-05, + "loss": 0.6517, + "step": 11808 + }, + { + "epoch": 0.8001219594823498, + "grad_norm": 11.162668228149414, + "learning_rate": 8.486138681634608e-05, + "loss": 0.8336, + "step": 11809 + }, + { + "epoch": 0.8001897147503219, + "grad_norm": 6.119869709014893, + "learning_rate": 8.486001779724826e-05, + "loss": 0.8596, + "step": 11810 + }, + { + "epoch": 0.800257470018294, + "grad_norm": 6.878213405609131, + "learning_rate": 8.485864877815046e-05, + "loss": 1.0812, + "step": 11811 + }, + { + "epoch": 0.800325225286266, + "grad_norm": 5.8862786293029785, + "learning_rate": 8.485727975905264e-05, + "loss": 0.5654, + "step": 11812 + }, + { + "epoch": 0.8003929805542381, + "grad_norm": 5.611292362213135, + "learning_rate": 8.485591073995482e-05, + "loss": 0.7241, + "step": 11813 + }, + { + "epoch": 0.8004607358222102, + "grad_norm": 5.294071197509766, + "learning_rate": 8.4854541720857e-05, + "loss": 0.8368, + "step": 11814 + }, + { + "epoch": 0.8005284910901823, + "grad_norm": 7.11854362487793, + "learning_rate": 8.485317270175919e-05, + "loss": 0.7055, + "step": 11815 + }, + { + "epoch": 0.8005962463581543, + "grad_norm": 6.1037373542785645, + "learning_rate": 8.485180368266137e-05, + "loss": 0.9473, + "step": 11816 + }, + { + "epoch": 0.8006640016261264, + "grad_norm": 7.161137580871582, + "learning_rate": 8.485043466356355e-05, + "loss": 0.909, + "step": 11817 + }, + { + "epoch": 0.8007317568940985, + "grad_norm": 5.295969009399414, + "learning_rate": 8.484906564446575e-05, + "loss": 0.7958, + "step": 11818 + }, + { + "epoch": 0.8007995121620706, + "grad_norm": 7.93539571762085, + "learning_rate": 8.484769662536793e-05, + "loss": 0.823, + "step": 11819 + }, + { + "epoch": 0.8008672674300427, + "grad_norm": 10.116933822631836, + "learning_rate": 8.484632760627011e-05, + "loss": 0.7204, + "step": 11820 + }, + { + "epoch": 0.8009350226980148, + "grad_norm": 5.872991561889648, + "learning_rate": 8.48449585871723e-05, + "loss": 0.7758, + "step": 11821 + }, + { + "epoch": 0.8010027779659868, + "grad_norm": 5.063296318054199, + "learning_rate": 8.484358956807448e-05, + "loss": 0.7124, + "step": 11822 + }, + { + "epoch": 0.8010705332339589, + "grad_norm": 7.007580757141113, + "learning_rate": 8.484222054897666e-05, + "loss": 1.0091, + "step": 11823 + }, + { + "epoch": 0.801138288501931, + "grad_norm": 7.909097671508789, + "learning_rate": 8.484085152987886e-05, + "loss": 0.7811, + "step": 11824 + }, + { + "epoch": 0.8012060437699031, + "grad_norm": 6.1278486251831055, + "learning_rate": 8.483948251078104e-05, + "loss": 1.0353, + "step": 11825 + }, + { + "epoch": 0.8012737990378752, + "grad_norm": 5.439823627471924, + "learning_rate": 8.483811349168322e-05, + "loss": 0.6914, + "step": 11826 + }, + { + "epoch": 0.8013415543058473, + "grad_norm": 5.741817474365234, + "learning_rate": 8.48367444725854e-05, + "loss": 0.6987, + "step": 11827 + }, + { + "epoch": 0.8014093095738194, + "grad_norm": 5.603649139404297, + "learning_rate": 8.483537545348758e-05, + "loss": 0.5928, + "step": 11828 + }, + { + "epoch": 0.8014770648417915, + "grad_norm": 5.263033390045166, + "learning_rate": 8.483400643438977e-05, + "loss": 0.6639, + "step": 11829 + }, + { + "epoch": 0.8015448201097636, + "grad_norm": 5.7066874504089355, + "learning_rate": 8.483263741529195e-05, + "loss": 0.8714, + "step": 11830 + }, + { + "epoch": 0.8016125753777357, + "grad_norm": 7.526313781738281, + "learning_rate": 8.483126839619413e-05, + "loss": 0.8741, + "step": 11831 + }, + { + "epoch": 0.8016803306457077, + "grad_norm": 7.491227626800537, + "learning_rate": 8.482989937709631e-05, + "loss": 0.7122, + "step": 11832 + }, + { + "epoch": 0.8017480859136797, + "grad_norm": 5.182889938354492, + "learning_rate": 8.482853035799851e-05, + "loss": 0.7191, + "step": 11833 + }, + { + "epoch": 0.8018158411816518, + "grad_norm": 7.201566219329834, + "learning_rate": 8.482716133890069e-05, + "loss": 1.0528, + "step": 11834 + }, + { + "epoch": 0.8018835964496239, + "grad_norm": 6.984012126922607, + "learning_rate": 8.482579231980287e-05, + "loss": 0.943, + "step": 11835 + }, + { + "epoch": 0.801951351717596, + "grad_norm": 6.960568904876709, + "learning_rate": 8.482442330070505e-05, + "loss": 0.8563, + "step": 11836 + }, + { + "epoch": 0.8020191069855681, + "grad_norm": 5.898118495941162, + "learning_rate": 8.482305428160723e-05, + "loss": 0.5819, + "step": 11837 + }, + { + "epoch": 0.8020868622535402, + "grad_norm": 7.0784592628479, + "learning_rate": 8.482168526250942e-05, + "loss": 1.0053, + "step": 11838 + }, + { + "epoch": 0.8021546175215123, + "grad_norm": 5.741284370422363, + "learning_rate": 8.48203162434116e-05, + "loss": 0.8304, + "step": 11839 + }, + { + "epoch": 0.8022223727894844, + "grad_norm": 6.108719348907471, + "learning_rate": 8.481894722431378e-05, + "loss": 0.7938, + "step": 11840 + }, + { + "epoch": 0.8022901280574565, + "grad_norm": 8.281269073486328, + "learning_rate": 8.481757820521596e-05, + "loss": 0.8584, + "step": 11841 + }, + { + "epoch": 0.8023578833254286, + "grad_norm": 9.610613822937012, + "learning_rate": 8.481620918611814e-05, + "loss": 0.9888, + "step": 11842 + }, + { + "epoch": 0.8024256385934007, + "grad_norm": 7.831075191497803, + "learning_rate": 8.481484016702034e-05, + "loss": 0.7218, + "step": 11843 + }, + { + "epoch": 0.8024933938613728, + "grad_norm": 5.8057756423950195, + "learning_rate": 8.481347114792252e-05, + "loss": 0.7296, + "step": 11844 + }, + { + "epoch": 0.8025611491293448, + "grad_norm": 7.156800270080566, + "learning_rate": 8.48121021288247e-05, + "loss": 0.8085, + "step": 11845 + }, + { + "epoch": 0.8026289043973169, + "grad_norm": 8.218076705932617, + "learning_rate": 8.481073310972688e-05, + "loss": 0.8111, + "step": 11846 + }, + { + "epoch": 0.802696659665289, + "grad_norm": 5.695756435394287, + "learning_rate": 8.480936409062907e-05, + "loss": 0.6723, + "step": 11847 + }, + { + "epoch": 0.802764414933261, + "grad_norm": 6.302215576171875, + "learning_rate": 8.480799507153125e-05, + "loss": 0.9768, + "step": 11848 + }, + { + "epoch": 0.8028321702012331, + "grad_norm": 6.877220630645752, + "learning_rate": 8.480662605243343e-05, + "loss": 0.756, + "step": 11849 + }, + { + "epoch": 0.8028999254692052, + "grad_norm": 5.5505805015563965, + "learning_rate": 8.480525703333561e-05, + "loss": 0.8171, + "step": 11850 + }, + { + "epoch": 0.8029676807371773, + "grad_norm": 6.355271339416504, + "learning_rate": 8.48038880142378e-05, + "loss": 0.791, + "step": 11851 + }, + { + "epoch": 0.8030354360051494, + "grad_norm": 5.083590984344482, + "learning_rate": 8.480251899513999e-05, + "loss": 0.6983, + "step": 11852 + }, + { + "epoch": 0.8031031912731215, + "grad_norm": 5.824821472167969, + "learning_rate": 8.480114997604217e-05, + "loss": 0.8647, + "step": 11853 + }, + { + "epoch": 0.8031709465410936, + "grad_norm": 5.969753742218018, + "learning_rate": 8.479978095694435e-05, + "loss": 0.6172, + "step": 11854 + }, + { + "epoch": 0.8032387018090656, + "grad_norm": 6.784320831298828, + "learning_rate": 8.479841193784653e-05, + "loss": 0.9374, + "step": 11855 + }, + { + "epoch": 0.8033064570770377, + "grad_norm": 8.921832084655762, + "learning_rate": 8.479704291874872e-05, + "loss": 0.6769, + "step": 11856 + }, + { + "epoch": 0.8033742123450098, + "grad_norm": 5.738458633422852, + "learning_rate": 8.47956738996509e-05, + "loss": 0.784, + "step": 11857 + }, + { + "epoch": 0.8034419676129819, + "grad_norm": 5.5067949295043945, + "learning_rate": 8.479430488055308e-05, + "loss": 0.8279, + "step": 11858 + }, + { + "epoch": 0.803509722880954, + "grad_norm": 7.618658065795898, + "learning_rate": 8.479293586145526e-05, + "loss": 0.7528, + "step": 11859 + }, + { + "epoch": 0.8035774781489261, + "grad_norm": 7.020671844482422, + "learning_rate": 8.479156684235744e-05, + "loss": 0.8522, + "step": 11860 + }, + { + "epoch": 0.8036452334168982, + "grad_norm": 6.179223537445068, + "learning_rate": 8.479019782325964e-05, + "loss": 0.7367, + "step": 11861 + }, + { + "epoch": 0.8037129886848703, + "grad_norm": 7.495842933654785, + "learning_rate": 8.478882880416182e-05, + "loss": 0.9065, + "step": 11862 + }, + { + "epoch": 0.8037807439528424, + "grad_norm": 5.692570686340332, + "learning_rate": 8.4787459785064e-05, + "loss": 0.964, + "step": 11863 + }, + { + "epoch": 0.8038484992208145, + "grad_norm": 6.177666664123535, + "learning_rate": 8.478609076596619e-05, + "loss": 1.0165, + "step": 11864 + }, + { + "epoch": 0.8039162544887865, + "grad_norm": 5.492598533630371, + "learning_rate": 8.478472174686837e-05, + "loss": 0.7103, + "step": 11865 + }, + { + "epoch": 0.8039840097567585, + "grad_norm": 6.331650257110596, + "learning_rate": 8.478335272777055e-05, + "loss": 0.7544, + "step": 11866 + }, + { + "epoch": 0.8040517650247306, + "grad_norm": 6.0005412101745605, + "learning_rate": 8.478198370867275e-05, + "loss": 1.0188, + "step": 11867 + }, + { + "epoch": 0.8041195202927027, + "grad_norm": 5.383848190307617, + "learning_rate": 8.478061468957493e-05, + "loss": 0.6002, + "step": 11868 + }, + { + "epoch": 0.8041872755606748, + "grad_norm": 5.411609172821045, + "learning_rate": 8.477924567047711e-05, + "loss": 0.7154, + "step": 11869 + }, + { + "epoch": 0.8042550308286469, + "grad_norm": 6.1650614738464355, + "learning_rate": 8.47778766513793e-05, + "loss": 0.6797, + "step": 11870 + }, + { + "epoch": 0.804322786096619, + "grad_norm": 5.903098106384277, + "learning_rate": 8.477650763228148e-05, + "loss": 0.8813, + "step": 11871 + }, + { + "epoch": 0.8043905413645911, + "grad_norm": 5.579502105712891, + "learning_rate": 8.477513861318366e-05, + "loss": 0.8935, + "step": 11872 + }, + { + "epoch": 0.8044582966325632, + "grad_norm": 5.614695072174072, + "learning_rate": 8.477376959408584e-05, + "loss": 0.7914, + "step": 11873 + }, + { + "epoch": 0.8045260519005353, + "grad_norm": 8.45317268371582, + "learning_rate": 8.477240057498802e-05, + "loss": 0.8245, + "step": 11874 + }, + { + "epoch": 0.8045938071685074, + "grad_norm": 4.028397560119629, + "learning_rate": 8.477103155589022e-05, + "loss": 0.7105, + "step": 11875 + }, + { + "epoch": 0.8046615624364795, + "grad_norm": 6.691359519958496, + "learning_rate": 8.47696625367924e-05, + "loss": 0.8871, + "step": 11876 + }, + { + "epoch": 0.8047293177044516, + "grad_norm": 5.918386936187744, + "learning_rate": 8.476829351769458e-05, + "loss": 0.9558, + "step": 11877 + }, + { + "epoch": 0.8047970729724236, + "grad_norm": 5.369225978851318, + "learning_rate": 8.476692449859676e-05, + "loss": 0.9613, + "step": 11878 + }, + { + "epoch": 0.8048648282403957, + "grad_norm": 6.17078161239624, + "learning_rate": 8.476555547949895e-05, + "loss": 0.969, + "step": 11879 + }, + { + "epoch": 0.8049325835083678, + "grad_norm": 7.199222087860107, + "learning_rate": 8.476418646040113e-05, + "loss": 1.0042, + "step": 11880 + }, + { + "epoch": 0.8050003387763398, + "grad_norm": 4.739121913909912, + "learning_rate": 8.476281744130331e-05, + "loss": 0.695, + "step": 11881 + }, + { + "epoch": 0.8050680940443119, + "grad_norm": 5.670197486877441, + "learning_rate": 8.476144842220549e-05, + "loss": 0.7452, + "step": 11882 + }, + { + "epoch": 0.805135849312284, + "grad_norm": 5.977322578430176, + "learning_rate": 8.476007940310767e-05, + "loss": 0.9352, + "step": 11883 + }, + { + "epoch": 0.8052036045802561, + "grad_norm": 4.873468399047852, + "learning_rate": 8.475871038400987e-05, + "loss": 0.857, + "step": 11884 + }, + { + "epoch": 0.8052713598482282, + "grad_norm": 6.405252933502197, + "learning_rate": 8.475734136491205e-05, + "loss": 0.8847, + "step": 11885 + }, + { + "epoch": 0.8053391151162003, + "grad_norm": 7.104851722717285, + "learning_rate": 8.475597234581423e-05, + "loss": 0.9097, + "step": 11886 + }, + { + "epoch": 0.8054068703841724, + "grad_norm": 7.579138278961182, + "learning_rate": 8.475460332671641e-05, + "loss": 1.1768, + "step": 11887 + }, + { + "epoch": 0.8054746256521444, + "grad_norm": 7.450385093688965, + "learning_rate": 8.47532343076186e-05, + "loss": 1.1769, + "step": 11888 + }, + { + "epoch": 0.8055423809201165, + "grad_norm": 5.401349067687988, + "learning_rate": 8.475186528852078e-05, + "loss": 0.7389, + "step": 11889 + }, + { + "epoch": 0.8056101361880886, + "grad_norm": 7.890493869781494, + "learning_rate": 8.475049626942296e-05, + "loss": 0.8679, + "step": 11890 + }, + { + "epoch": 0.8056778914560607, + "grad_norm": 6.528770446777344, + "learning_rate": 8.474912725032514e-05, + "loss": 0.637, + "step": 11891 + }, + { + "epoch": 0.8057456467240328, + "grad_norm": 5.3305277824401855, + "learning_rate": 8.474775823122732e-05, + "loss": 0.7671, + "step": 11892 + }, + { + "epoch": 0.8058134019920049, + "grad_norm": 6.40350866317749, + "learning_rate": 8.474638921212952e-05, + "loss": 1.055, + "step": 11893 + }, + { + "epoch": 0.805881157259977, + "grad_norm": 7.686482906341553, + "learning_rate": 8.47450201930317e-05, + "loss": 0.7023, + "step": 11894 + }, + { + "epoch": 0.8059489125279491, + "grad_norm": 5.42139196395874, + "learning_rate": 8.474365117393388e-05, + "loss": 0.7528, + "step": 11895 + }, + { + "epoch": 0.8060166677959212, + "grad_norm": 5.809099197387695, + "learning_rate": 8.474228215483606e-05, + "loss": 0.8843, + "step": 11896 + }, + { + "epoch": 0.8060844230638932, + "grad_norm": 7.206499099731445, + "learning_rate": 8.474091313573824e-05, + "loss": 0.9504, + "step": 11897 + }, + { + "epoch": 0.8061521783318653, + "grad_norm": 9.313186645507812, + "learning_rate": 8.473954411664043e-05, + "loss": 0.665, + "step": 11898 + }, + { + "epoch": 0.8062199335998373, + "grad_norm": 6.453405857086182, + "learning_rate": 8.473817509754261e-05, + "loss": 0.6757, + "step": 11899 + }, + { + "epoch": 0.8062876888678094, + "grad_norm": 6.309181213378906, + "learning_rate": 8.473680607844479e-05, + "loss": 0.8481, + "step": 11900 + }, + { + "epoch": 0.8063554441357815, + "grad_norm": 7.917797088623047, + "learning_rate": 8.473543705934697e-05, + "loss": 0.7202, + "step": 11901 + }, + { + "epoch": 0.8064231994037536, + "grad_norm": 5.929610252380371, + "learning_rate": 8.473406804024917e-05, + "loss": 0.8476, + "step": 11902 + }, + { + "epoch": 0.8064909546717257, + "grad_norm": 7.981934070587158, + "learning_rate": 8.473269902115135e-05, + "loss": 0.6972, + "step": 11903 + }, + { + "epoch": 0.8065587099396978, + "grad_norm": 6.3853440284729, + "learning_rate": 8.473133000205353e-05, + "loss": 0.881, + "step": 11904 + }, + { + "epoch": 0.8066264652076699, + "grad_norm": 6.253373622894287, + "learning_rate": 8.472996098295571e-05, + "loss": 0.8463, + "step": 11905 + }, + { + "epoch": 0.806694220475642, + "grad_norm": 5.279558181762695, + "learning_rate": 8.472859196385789e-05, + "loss": 0.8723, + "step": 11906 + }, + { + "epoch": 0.8067619757436141, + "grad_norm": 7.0585126876831055, + "learning_rate": 8.472722294476008e-05, + "loss": 0.8384, + "step": 11907 + }, + { + "epoch": 0.8068297310115862, + "grad_norm": 9.070088386535645, + "learning_rate": 8.472585392566226e-05, + "loss": 0.6578, + "step": 11908 + }, + { + "epoch": 0.8068974862795583, + "grad_norm": 7.324275970458984, + "learning_rate": 8.472448490656444e-05, + "loss": 0.7454, + "step": 11909 + }, + { + "epoch": 0.8069652415475304, + "grad_norm": 5.854486465454102, + "learning_rate": 8.472311588746664e-05, + "loss": 0.9014, + "step": 11910 + }, + { + "epoch": 0.8070329968155024, + "grad_norm": 7.762482166290283, + "learning_rate": 8.472174686836882e-05, + "loss": 0.8265, + "step": 11911 + }, + { + "epoch": 0.8071007520834745, + "grad_norm": 7.545839786529541, + "learning_rate": 8.4720377849271e-05, + "loss": 0.7831, + "step": 11912 + }, + { + "epoch": 0.8071685073514466, + "grad_norm": 4.952934741973877, + "learning_rate": 8.471900883017319e-05, + "loss": 0.6514, + "step": 11913 + }, + { + "epoch": 0.8072362626194186, + "grad_norm": 5.232631206512451, + "learning_rate": 8.471763981107537e-05, + "loss": 0.6642, + "step": 11914 + }, + { + "epoch": 0.8073040178873907, + "grad_norm": 6.2648844718933105, + "learning_rate": 8.471627079197755e-05, + "loss": 0.9277, + "step": 11915 + }, + { + "epoch": 0.8073717731553628, + "grad_norm": 6.143033027648926, + "learning_rate": 8.471490177287975e-05, + "loss": 0.6482, + "step": 11916 + }, + { + "epoch": 0.8074395284233349, + "grad_norm": 7.901699066162109, + "learning_rate": 8.471353275378193e-05, + "loss": 0.739, + "step": 11917 + }, + { + "epoch": 0.807507283691307, + "grad_norm": 7.756877899169922, + "learning_rate": 8.471216373468411e-05, + "loss": 0.8307, + "step": 11918 + }, + { + "epoch": 0.8075750389592791, + "grad_norm": 8.90708065032959, + "learning_rate": 8.471079471558629e-05, + "loss": 1.0344, + "step": 11919 + }, + { + "epoch": 0.8076427942272512, + "grad_norm": 6.328546524047852, + "learning_rate": 8.470942569648847e-05, + "loss": 0.8514, + "step": 11920 + }, + { + "epoch": 0.8077105494952233, + "grad_norm": 7.169960975646973, + "learning_rate": 8.470805667739066e-05, + "loss": 0.8809, + "step": 11921 + }, + { + "epoch": 0.8077783047631953, + "grad_norm": 6.628053188323975, + "learning_rate": 8.470668765829284e-05, + "loss": 0.9416, + "step": 11922 + }, + { + "epoch": 0.8078460600311674, + "grad_norm": 7.435145378112793, + "learning_rate": 8.470531863919502e-05, + "loss": 0.6911, + "step": 11923 + }, + { + "epoch": 0.8079138152991395, + "grad_norm": 5.010408878326416, + "learning_rate": 8.47039496200972e-05, + "loss": 0.7926, + "step": 11924 + }, + { + "epoch": 0.8079815705671116, + "grad_norm": 5.625992298126221, + "learning_rate": 8.47025806009994e-05, + "loss": 0.8863, + "step": 11925 + }, + { + "epoch": 0.8080493258350837, + "grad_norm": 8.575138092041016, + "learning_rate": 8.470121158190158e-05, + "loss": 0.7859, + "step": 11926 + }, + { + "epoch": 0.8081170811030558, + "grad_norm": 5.657655239105225, + "learning_rate": 8.469984256280376e-05, + "loss": 0.7417, + "step": 11927 + }, + { + "epoch": 0.8081848363710279, + "grad_norm": 6.131776809692383, + "learning_rate": 8.469847354370594e-05, + "loss": 0.8232, + "step": 11928 + }, + { + "epoch": 0.808252591639, + "grad_norm": 7.183122158050537, + "learning_rate": 8.469710452460812e-05, + "loss": 0.7562, + "step": 11929 + }, + { + "epoch": 0.808320346906972, + "grad_norm": 6.112959384918213, + "learning_rate": 8.469573550551031e-05, + "loss": 0.8744, + "step": 11930 + }, + { + "epoch": 0.8083881021749441, + "grad_norm": 6.5298590660095215, + "learning_rate": 8.469436648641249e-05, + "loss": 0.8624, + "step": 11931 + }, + { + "epoch": 0.8084558574429161, + "grad_norm": 7.30040979385376, + "learning_rate": 8.469299746731467e-05, + "loss": 0.8141, + "step": 11932 + }, + { + "epoch": 0.8085236127108882, + "grad_norm": 5.7774977684021, + "learning_rate": 8.469162844821685e-05, + "loss": 0.7541, + "step": 11933 + }, + { + "epoch": 0.8085913679788603, + "grad_norm": 6.179437637329102, + "learning_rate": 8.469025942911905e-05, + "loss": 0.7737, + "step": 11934 + }, + { + "epoch": 0.8086591232468324, + "grad_norm": 5.715566635131836, + "learning_rate": 8.468889041002123e-05, + "loss": 0.7055, + "step": 11935 + }, + { + "epoch": 0.8087268785148045, + "grad_norm": 5.082167148590088, + "learning_rate": 8.468752139092341e-05, + "loss": 0.7135, + "step": 11936 + }, + { + "epoch": 0.8087946337827766, + "grad_norm": 5.4100117683410645, + "learning_rate": 8.468615237182559e-05, + "loss": 0.6587, + "step": 11937 + }, + { + "epoch": 0.8088623890507487, + "grad_norm": 5.654635429382324, + "learning_rate": 8.468478335272777e-05, + "loss": 0.8371, + "step": 11938 + }, + { + "epoch": 0.8089301443187208, + "grad_norm": 13.093804359436035, + "learning_rate": 8.468341433362996e-05, + "loss": 0.7622, + "step": 11939 + }, + { + "epoch": 0.8089978995866929, + "grad_norm": 6.519128322601318, + "learning_rate": 8.468204531453214e-05, + "loss": 0.5443, + "step": 11940 + }, + { + "epoch": 0.809065654854665, + "grad_norm": 6.814082622528076, + "learning_rate": 8.468067629543432e-05, + "loss": 0.7527, + "step": 11941 + }, + { + "epoch": 0.8091334101226371, + "grad_norm": 5.064624786376953, + "learning_rate": 8.46793072763365e-05, + "loss": 0.7392, + "step": 11942 + }, + { + "epoch": 0.8092011653906092, + "grad_norm": 6.819398403167725, + "learning_rate": 8.467793825723868e-05, + "loss": 0.8395, + "step": 11943 + }, + { + "epoch": 0.8092689206585812, + "grad_norm": 5.4765520095825195, + "learning_rate": 8.467656923814088e-05, + "loss": 0.6145, + "step": 11944 + }, + { + "epoch": 0.8093366759265533, + "grad_norm": 5.149988651275635, + "learning_rate": 8.467520021904306e-05, + "loss": 0.6676, + "step": 11945 + }, + { + "epoch": 0.8094044311945253, + "grad_norm": 5.497957229614258, + "learning_rate": 8.467383119994524e-05, + "loss": 0.8605, + "step": 11946 + }, + { + "epoch": 0.8094721864624974, + "grad_norm": 6.20892333984375, + "learning_rate": 8.467246218084742e-05, + "loss": 0.7578, + "step": 11947 + }, + { + "epoch": 0.8095399417304695, + "grad_norm": 5.368823528289795, + "learning_rate": 8.467109316174961e-05, + "loss": 0.6105, + "step": 11948 + }, + { + "epoch": 0.8096076969984416, + "grad_norm": 5.05849552154541, + "learning_rate": 8.466972414265179e-05, + "loss": 0.6165, + "step": 11949 + }, + { + "epoch": 0.8096754522664137, + "grad_norm": 6.1569366455078125, + "learning_rate": 8.466835512355397e-05, + "loss": 0.6865, + "step": 11950 + }, + { + "epoch": 0.8097432075343858, + "grad_norm": 4.664434432983398, + "learning_rate": 8.466698610445615e-05, + "loss": 0.7348, + "step": 11951 + }, + { + "epoch": 0.8098109628023579, + "grad_norm": 5.691340446472168, + "learning_rate": 8.466561708535833e-05, + "loss": 0.625, + "step": 11952 + }, + { + "epoch": 0.80987871807033, + "grad_norm": 5.343607425689697, + "learning_rate": 8.466424806626053e-05, + "loss": 0.5954, + "step": 11953 + }, + { + "epoch": 0.809946473338302, + "grad_norm": 7.289133548736572, + "learning_rate": 8.466287904716271e-05, + "loss": 0.8564, + "step": 11954 + }, + { + "epoch": 0.8100142286062741, + "grad_norm": 5.425955772399902, + "learning_rate": 8.466151002806489e-05, + "loss": 0.6588, + "step": 11955 + }, + { + "epoch": 0.8100819838742462, + "grad_norm": 6.205384731292725, + "learning_rate": 8.466014100896708e-05, + "loss": 0.8809, + "step": 11956 + }, + { + "epoch": 0.8101497391422183, + "grad_norm": 7.989570140838623, + "learning_rate": 8.465877198986926e-05, + "loss": 0.6923, + "step": 11957 + }, + { + "epoch": 0.8102174944101904, + "grad_norm": 5.3845343589782715, + "learning_rate": 8.465740297077144e-05, + "loss": 0.645, + "step": 11958 + }, + { + "epoch": 0.8102852496781625, + "grad_norm": 5.856838703155518, + "learning_rate": 8.465603395167364e-05, + "loss": 0.9173, + "step": 11959 + }, + { + "epoch": 0.8103530049461346, + "grad_norm": 7.78700590133667, + "learning_rate": 8.465466493257582e-05, + "loss": 0.7514, + "step": 11960 + }, + { + "epoch": 0.8104207602141067, + "grad_norm": 6.29768705368042, + "learning_rate": 8.4653295913478e-05, + "loss": 0.862, + "step": 11961 + }, + { + "epoch": 0.8104885154820788, + "grad_norm": 5.1076579093933105, + "learning_rate": 8.465192689438019e-05, + "loss": 0.6395, + "step": 11962 + }, + { + "epoch": 0.8105562707500508, + "grad_norm": 7.518921375274658, + "learning_rate": 8.465055787528237e-05, + "loss": 0.9251, + "step": 11963 + }, + { + "epoch": 0.8106240260180229, + "grad_norm": 5.740368843078613, + "learning_rate": 8.464918885618455e-05, + "loss": 0.8354, + "step": 11964 + }, + { + "epoch": 0.810691781285995, + "grad_norm": 7.306612491607666, + "learning_rate": 8.464781983708673e-05, + "loss": 0.7414, + "step": 11965 + }, + { + "epoch": 0.810759536553967, + "grad_norm": 5.480811595916748, + "learning_rate": 8.464645081798893e-05, + "loss": 0.7021, + "step": 11966 + }, + { + "epoch": 0.8108272918219391, + "grad_norm": 6.626734733581543, + "learning_rate": 8.46450817988911e-05, + "loss": 0.7866, + "step": 11967 + }, + { + "epoch": 0.8108950470899112, + "grad_norm": 5.154781818389893, + "learning_rate": 8.464371277979329e-05, + "loss": 0.653, + "step": 11968 + }, + { + "epoch": 0.8109628023578833, + "grad_norm": 6.271585464477539, + "learning_rate": 8.464234376069547e-05, + "loss": 0.9318, + "step": 11969 + }, + { + "epoch": 0.8110305576258554, + "grad_norm": 6.008039951324463, + "learning_rate": 8.464097474159765e-05, + "loss": 0.6224, + "step": 11970 + }, + { + "epoch": 0.8110983128938275, + "grad_norm": 6.575869560241699, + "learning_rate": 8.463960572249984e-05, + "loss": 0.685, + "step": 11971 + }, + { + "epoch": 0.8111660681617996, + "grad_norm": 6.166112899780273, + "learning_rate": 8.463823670340202e-05, + "loss": 0.6738, + "step": 11972 + }, + { + "epoch": 0.8112338234297717, + "grad_norm": 6.191090106964111, + "learning_rate": 8.46368676843042e-05, + "loss": 0.9672, + "step": 11973 + }, + { + "epoch": 0.8113015786977438, + "grad_norm": 5.888790130615234, + "learning_rate": 8.463549866520638e-05, + "loss": 0.5671, + "step": 11974 + }, + { + "epoch": 0.8113693339657159, + "grad_norm": 6.156980514526367, + "learning_rate": 8.463412964610856e-05, + "loss": 0.6792, + "step": 11975 + }, + { + "epoch": 0.811437089233688, + "grad_norm": 6.996181011199951, + "learning_rate": 8.463276062701076e-05, + "loss": 0.9247, + "step": 11976 + }, + { + "epoch": 0.81150484450166, + "grad_norm": 7.532526016235352, + "learning_rate": 8.463139160791294e-05, + "loss": 0.7807, + "step": 11977 + }, + { + "epoch": 0.8115725997696321, + "grad_norm": 5.303388595581055, + "learning_rate": 8.463002258881512e-05, + "loss": 0.6836, + "step": 11978 + }, + { + "epoch": 0.8116403550376041, + "grad_norm": 5.388332843780518, + "learning_rate": 8.46286535697173e-05, + "loss": 0.8513, + "step": 11979 + }, + { + "epoch": 0.8117081103055762, + "grad_norm": 5.5608601570129395, + "learning_rate": 8.462728455061949e-05, + "loss": 0.6047, + "step": 11980 + }, + { + "epoch": 0.8117758655735483, + "grad_norm": 6.766813278198242, + "learning_rate": 8.462591553152167e-05, + "loss": 0.8592, + "step": 11981 + }, + { + "epoch": 0.8118436208415204, + "grad_norm": 6.640246868133545, + "learning_rate": 8.462454651242385e-05, + "loss": 0.6319, + "step": 11982 + }, + { + "epoch": 0.8119113761094925, + "grad_norm": 6.7665815353393555, + "learning_rate": 8.462317749332603e-05, + "loss": 0.8778, + "step": 11983 + }, + { + "epoch": 0.8119791313774646, + "grad_norm": 5.935091018676758, + "learning_rate": 8.462180847422821e-05, + "loss": 0.6987, + "step": 11984 + }, + { + "epoch": 0.8120468866454367, + "grad_norm": 4.797418594360352, + "learning_rate": 8.46204394551304e-05, + "loss": 0.6862, + "step": 11985 + }, + { + "epoch": 0.8121146419134088, + "grad_norm": 5.728845119476318, + "learning_rate": 8.461907043603259e-05, + "loss": 0.8859, + "step": 11986 + }, + { + "epoch": 0.8121823971813809, + "grad_norm": 6.456442356109619, + "learning_rate": 8.461770141693477e-05, + "loss": 0.8329, + "step": 11987 + }, + { + "epoch": 0.812250152449353, + "grad_norm": 6.974035739898682, + "learning_rate": 8.461633239783695e-05, + "loss": 0.8881, + "step": 11988 + }, + { + "epoch": 0.812317907717325, + "grad_norm": 6.6539788246154785, + "learning_rate": 8.461496337873914e-05, + "loss": 1.1583, + "step": 11989 + }, + { + "epoch": 0.8123856629852971, + "grad_norm": 4.992013931274414, + "learning_rate": 8.461359435964132e-05, + "loss": 0.6866, + "step": 11990 + }, + { + "epoch": 0.8124534182532692, + "grad_norm": 6.416220664978027, + "learning_rate": 8.46122253405435e-05, + "loss": 1.1336, + "step": 11991 + }, + { + "epoch": 0.8125211735212413, + "grad_norm": 6.000530242919922, + "learning_rate": 8.461085632144568e-05, + "loss": 0.8182, + "step": 11992 + }, + { + "epoch": 0.8125889287892134, + "grad_norm": 6.560791492462158, + "learning_rate": 8.460948730234786e-05, + "loss": 0.5594, + "step": 11993 + }, + { + "epoch": 0.8126566840571855, + "grad_norm": 5.342809200286865, + "learning_rate": 8.460811828325006e-05, + "loss": 0.9214, + "step": 11994 + }, + { + "epoch": 0.8127244393251575, + "grad_norm": 6.472506046295166, + "learning_rate": 8.460674926415224e-05, + "loss": 0.9376, + "step": 11995 + }, + { + "epoch": 0.8127921945931296, + "grad_norm": 10.083342552185059, + "learning_rate": 8.460538024505442e-05, + "loss": 0.8582, + "step": 11996 + }, + { + "epoch": 0.8128599498611017, + "grad_norm": 6.755568504333496, + "learning_rate": 8.46040112259566e-05, + "loss": 1.2787, + "step": 11997 + }, + { + "epoch": 0.8129277051290738, + "grad_norm": 5.924015998840332, + "learning_rate": 8.460264220685878e-05, + "loss": 0.7204, + "step": 11998 + }, + { + "epoch": 0.8129954603970458, + "grad_norm": 7.797226428985596, + "learning_rate": 8.460127318776097e-05, + "loss": 1.0529, + "step": 11999 + }, + { + "epoch": 0.8130632156650179, + "grad_norm": 6.317507743835449, + "learning_rate": 8.459990416866315e-05, + "loss": 0.4736, + "step": 12000 + }, + { + "epoch": 0.81313097093299, + "grad_norm": 6.195952415466309, + "learning_rate": 8.459853514956533e-05, + "loss": 0.8751, + "step": 12001 + }, + { + "epoch": 0.8131987262009621, + "grad_norm": 8.634666442871094, + "learning_rate": 8.459716613046753e-05, + "loss": 0.8552, + "step": 12002 + }, + { + "epoch": 0.8132664814689342, + "grad_norm": 6.352993488311768, + "learning_rate": 8.459579711136971e-05, + "loss": 0.9277, + "step": 12003 + }, + { + "epoch": 0.8133342367369063, + "grad_norm": 6.608835697174072, + "learning_rate": 8.459442809227189e-05, + "loss": 0.9077, + "step": 12004 + }, + { + "epoch": 0.8134019920048784, + "grad_norm": 5.515098571777344, + "learning_rate": 8.459305907317408e-05, + "loss": 0.7215, + "step": 12005 + }, + { + "epoch": 0.8134697472728505, + "grad_norm": 5.592660427093506, + "learning_rate": 8.459169005407626e-05, + "loss": 0.6318, + "step": 12006 + }, + { + "epoch": 0.8135375025408226, + "grad_norm": 6.810677528381348, + "learning_rate": 8.459032103497844e-05, + "loss": 0.8533, + "step": 12007 + }, + { + "epoch": 0.8136052578087947, + "grad_norm": 6.685205936431885, + "learning_rate": 8.458895201588064e-05, + "loss": 0.8084, + "step": 12008 + }, + { + "epoch": 0.8136730130767668, + "grad_norm": 5.3733062744140625, + "learning_rate": 8.458758299678282e-05, + "loss": 0.8156, + "step": 12009 + }, + { + "epoch": 0.8137407683447389, + "grad_norm": 4.988927841186523, + "learning_rate": 8.4586213977685e-05, + "loss": 0.777, + "step": 12010 + }, + { + "epoch": 0.8138085236127109, + "grad_norm": 7.371654510498047, + "learning_rate": 8.458484495858718e-05, + "loss": 0.7513, + "step": 12011 + }, + { + "epoch": 0.8138762788806829, + "grad_norm": 6.63214111328125, + "learning_rate": 8.458347593948937e-05, + "loss": 0.7991, + "step": 12012 + }, + { + "epoch": 0.813944034148655, + "grad_norm": 6.847991466522217, + "learning_rate": 8.458210692039155e-05, + "loss": 0.8741, + "step": 12013 + }, + { + "epoch": 0.8140117894166271, + "grad_norm": 6.964975357055664, + "learning_rate": 8.458073790129373e-05, + "loss": 0.8665, + "step": 12014 + }, + { + "epoch": 0.8140795446845992, + "grad_norm": 6.188068866729736, + "learning_rate": 8.457936888219591e-05, + "loss": 0.8174, + "step": 12015 + }, + { + "epoch": 0.8141472999525713, + "grad_norm": 8.014631271362305, + "learning_rate": 8.457799986309809e-05, + "loss": 0.8603, + "step": 12016 + }, + { + "epoch": 0.8142150552205434, + "grad_norm": 4.80557107925415, + "learning_rate": 8.457663084400029e-05, + "loss": 0.7971, + "step": 12017 + }, + { + "epoch": 0.8142828104885155, + "grad_norm": 4.926700115203857, + "learning_rate": 8.457526182490247e-05, + "loss": 0.6986, + "step": 12018 + }, + { + "epoch": 0.8143505657564876, + "grad_norm": 7.593190670013428, + "learning_rate": 8.457389280580465e-05, + "loss": 0.8656, + "step": 12019 + }, + { + "epoch": 0.8144183210244597, + "grad_norm": 5.325191497802734, + "learning_rate": 8.457252378670683e-05, + "loss": 0.7471, + "step": 12020 + }, + { + "epoch": 0.8144860762924317, + "grad_norm": 6.783299446105957, + "learning_rate": 8.457115476760902e-05, + "loss": 0.9003, + "step": 12021 + }, + { + "epoch": 0.8145538315604038, + "grad_norm": 5.972321033477783, + "learning_rate": 8.45697857485112e-05, + "loss": 0.9259, + "step": 12022 + }, + { + "epoch": 0.8146215868283759, + "grad_norm": 4.9444708824157715, + "learning_rate": 8.456841672941338e-05, + "loss": 0.6313, + "step": 12023 + }, + { + "epoch": 0.814689342096348, + "grad_norm": 6.034842014312744, + "learning_rate": 8.456704771031556e-05, + "loss": 0.736, + "step": 12024 + }, + { + "epoch": 0.8147570973643201, + "grad_norm": 7.463682651519775, + "learning_rate": 8.456567869121774e-05, + "loss": 0.8651, + "step": 12025 + }, + { + "epoch": 0.8148248526322922, + "grad_norm": 6.878032207489014, + "learning_rate": 8.456430967211994e-05, + "loss": 0.9458, + "step": 12026 + }, + { + "epoch": 0.8148926079002643, + "grad_norm": 6.859936237335205, + "learning_rate": 8.456294065302212e-05, + "loss": 0.9125, + "step": 12027 + }, + { + "epoch": 0.8149603631682363, + "grad_norm": 6.0320329666137695, + "learning_rate": 8.45615716339243e-05, + "loss": 0.6817, + "step": 12028 + }, + { + "epoch": 0.8150281184362084, + "grad_norm": 6.321547031402588, + "learning_rate": 8.456020261482648e-05, + "loss": 0.7538, + "step": 12029 + }, + { + "epoch": 0.8150958737041805, + "grad_norm": 6.318676471710205, + "learning_rate": 8.455883359572866e-05, + "loss": 0.9262, + "step": 12030 + }, + { + "epoch": 0.8151636289721526, + "grad_norm": 5.807433605194092, + "learning_rate": 8.455746457663085e-05, + "loss": 0.7597, + "step": 12031 + }, + { + "epoch": 0.8152313842401246, + "grad_norm": 6.104518413543701, + "learning_rate": 8.455609555753303e-05, + "loss": 0.7186, + "step": 12032 + }, + { + "epoch": 0.8152991395080967, + "grad_norm": 6.957059860229492, + "learning_rate": 8.455472653843521e-05, + "loss": 0.7533, + "step": 12033 + }, + { + "epoch": 0.8153668947760688, + "grad_norm": 6.928465366363525, + "learning_rate": 8.455335751933739e-05, + "loss": 0.578, + "step": 12034 + }, + { + "epoch": 0.8154346500440409, + "grad_norm": 6.699448108673096, + "learning_rate": 8.455198850023959e-05, + "loss": 0.7888, + "step": 12035 + }, + { + "epoch": 0.815502405312013, + "grad_norm": 7.328460693359375, + "learning_rate": 8.455061948114177e-05, + "loss": 0.72, + "step": 12036 + }, + { + "epoch": 0.8155701605799851, + "grad_norm": 7.082894802093506, + "learning_rate": 8.454925046204395e-05, + "loss": 0.9115, + "step": 12037 + }, + { + "epoch": 0.8156379158479572, + "grad_norm": 5.156605243682861, + "learning_rate": 8.454788144294613e-05, + "loss": 0.748, + "step": 12038 + }, + { + "epoch": 0.8157056711159293, + "grad_norm": 6.401536464691162, + "learning_rate": 8.454651242384831e-05, + "loss": 0.8282, + "step": 12039 + }, + { + "epoch": 0.8157734263839014, + "grad_norm": 7.056277275085449, + "learning_rate": 8.45451434047505e-05, + "loss": 0.9653, + "step": 12040 + }, + { + "epoch": 0.8158411816518735, + "grad_norm": 5.628291130065918, + "learning_rate": 8.454377438565268e-05, + "loss": 0.8733, + "step": 12041 + }, + { + "epoch": 0.8159089369198456, + "grad_norm": 5.863224506378174, + "learning_rate": 8.454240536655486e-05, + "loss": 0.8301, + "step": 12042 + }, + { + "epoch": 0.8159766921878177, + "grad_norm": 7.33843994140625, + "learning_rate": 8.454103634745704e-05, + "loss": 0.6597, + "step": 12043 + }, + { + "epoch": 0.8160444474557896, + "grad_norm": 5.626278400421143, + "learning_rate": 8.453966732835924e-05, + "loss": 0.7648, + "step": 12044 + }, + { + "epoch": 0.8161122027237617, + "grad_norm": 5.470703125, + "learning_rate": 8.453829830926142e-05, + "loss": 0.6636, + "step": 12045 + }, + { + "epoch": 0.8161799579917338, + "grad_norm": 4.597399711608887, + "learning_rate": 8.45369292901636e-05, + "loss": 0.7107, + "step": 12046 + }, + { + "epoch": 0.8162477132597059, + "grad_norm": 6.172791957855225, + "learning_rate": 8.453556027106578e-05, + "loss": 0.7516, + "step": 12047 + }, + { + "epoch": 0.816315468527678, + "grad_norm": 6.869264125823975, + "learning_rate": 8.453419125196796e-05, + "loss": 0.7626, + "step": 12048 + }, + { + "epoch": 0.8163832237956501, + "grad_norm": 6.974149703979492, + "learning_rate": 8.453282223287015e-05, + "loss": 0.8517, + "step": 12049 + }, + { + "epoch": 0.8164509790636222, + "grad_norm": 6.083059787750244, + "learning_rate": 8.453145321377233e-05, + "loss": 0.7275, + "step": 12050 + }, + { + "epoch": 0.8165187343315943, + "grad_norm": 4.947962284088135, + "learning_rate": 8.453008419467451e-05, + "loss": 0.663, + "step": 12051 + }, + { + "epoch": 0.8165864895995664, + "grad_norm": 5.22273588180542, + "learning_rate": 8.45287151755767e-05, + "loss": 0.6265, + "step": 12052 + }, + { + "epoch": 0.8166542448675385, + "grad_norm": 7.42067289352417, + "learning_rate": 8.452734615647889e-05, + "loss": 1.1131, + "step": 12053 + }, + { + "epoch": 0.8167220001355106, + "grad_norm": 6.975278854370117, + "learning_rate": 8.452597713738107e-05, + "loss": 0.8143, + "step": 12054 + }, + { + "epoch": 0.8167897554034826, + "grad_norm": 5.899443626403809, + "learning_rate": 8.452460811828326e-05, + "loss": 0.7033, + "step": 12055 + }, + { + "epoch": 0.8168575106714547, + "grad_norm": 5.515699863433838, + "learning_rate": 8.452323909918544e-05, + "loss": 0.676, + "step": 12056 + }, + { + "epoch": 0.8169252659394268, + "grad_norm": 5.615140438079834, + "learning_rate": 8.452187008008762e-05, + "loss": 0.6282, + "step": 12057 + }, + { + "epoch": 0.8169930212073989, + "grad_norm": 5.279138088226318, + "learning_rate": 8.452050106098982e-05, + "loss": 0.7139, + "step": 12058 + }, + { + "epoch": 0.817060776475371, + "grad_norm": 4.64931583404541, + "learning_rate": 8.4519132041892e-05, + "loss": 0.7691, + "step": 12059 + }, + { + "epoch": 0.817128531743343, + "grad_norm": 5.363344192504883, + "learning_rate": 8.451776302279418e-05, + "loss": 0.8622, + "step": 12060 + }, + { + "epoch": 0.8171962870113151, + "grad_norm": 5.616733551025391, + "learning_rate": 8.451639400369636e-05, + "loss": 0.729, + "step": 12061 + }, + { + "epoch": 0.8172640422792872, + "grad_norm": 6.5094451904296875, + "learning_rate": 8.451502498459854e-05, + "loss": 0.7148, + "step": 12062 + }, + { + "epoch": 0.8173317975472593, + "grad_norm": 6.721555233001709, + "learning_rate": 8.451365596550073e-05, + "loss": 0.8657, + "step": 12063 + }, + { + "epoch": 0.8173995528152314, + "grad_norm": 6.426924705505371, + "learning_rate": 8.451228694640291e-05, + "loss": 0.7729, + "step": 12064 + }, + { + "epoch": 0.8174673080832034, + "grad_norm": 5.894415378570557, + "learning_rate": 8.451091792730509e-05, + "loss": 0.8842, + "step": 12065 + }, + { + "epoch": 0.8175350633511755, + "grad_norm": 9.44097900390625, + "learning_rate": 8.450954890820727e-05, + "loss": 0.7221, + "step": 12066 + }, + { + "epoch": 0.8176028186191476, + "grad_norm": 9.030364990234375, + "learning_rate": 8.450817988910947e-05, + "loss": 0.9898, + "step": 12067 + }, + { + "epoch": 0.8176705738871197, + "grad_norm": 5.958207607269287, + "learning_rate": 8.450681087001165e-05, + "loss": 0.9868, + "step": 12068 + }, + { + "epoch": 0.8177383291550918, + "grad_norm": 5.822267532348633, + "learning_rate": 8.450544185091383e-05, + "loss": 0.7575, + "step": 12069 + }, + { + "epoch": 0.8178060844230639, + "grad_norm": 6.314889430999756, + "learning_rate": 8.4504072831816e-05, + "loss": 0.8706, + "step": 12070 + }, + { + "epoch": 0.817873839691036, + "grad_norm": 9.256656646728516, + "learning_rate": 8.450270381271819e-05, + "loss": 0.7976, + "step": 12071 + }, + { + "epoch": 0.8179415949590081, + "grad_norm": 5.871959209442139, + "learning_rate": 8.450133479362038e-05, + "loss": 0.6814, + "step": 12072 + }, + { + "epoch": 0.8180093502269802, + "grad_norm": 9.304550170898438, + "learning_rate": 8.449996577452256e-05, + "loss": 0.6588, + "step": 12073 + }, + { + "epoch": 0.8180771054949523, + "grad_norm": 5.510218620300293, + "learning_rate": 8.449859675542474e-05, + "loss": 0.5854, + "step": 12074 + }, + { + "epoch": 0.8181448607629244, + "grad_norm": 4.799395561218262, + "learning_rate": 8.449722773632692e-05, + "loss": 0.6387, + "step": 12075 + }, + { + "epoch": 0.8182126160308965, + "grad_norm": 7.109929084777832, + "learning_rate": 8.44958587172291e-05, + "loss": 1.0768, + "step": 12076 + }, + { + "epoch": 0.8182803712988684, + "grad_norm": 5.443954944610596, + "learning_rate": 8.44944896981313e-05, + "loss": 0.7782, + "step": 12077 + }, + { + "epoch": 0.8183481265668405, + "grad_norm": 5.463802814483643, + "learning_rate": 8.449312067903348e-05, + "loss": 0.8792, + "step": 12078 + }, + { + "epoch": 0.8184158818348126, + "grad_norm": 6.221611022949219, + "learning_rate": 8.449175165993566e-05, + "loss": 0.786, + "step": 12079 + }, + { + "epoch": 0.8184836371027847, + "grad_norm": 5.399687767028809, + "learning_rate": 8.449038264083784e-05, + "loss": 0.7112, + "step": 12080 + }, + { + "epoch": 0.8185513923707568, + "grad_norm": 6.230489730834961, + "learning_rate": 8.448901362174003e-05, + "loss": 0.6785, + "step": 12081 + }, + { + "epoch": 0.8186191476387289, + "grad_norm": 6.096298694610596, + "learning_rate": 8.448764460264221e-05, + "loss": 0.641, + "step": 12082 + }, + { + "epoch": 0.818686902906701, + "grad_norm": 5.790489673614502, + "learning_rate": 8.448627558354439e-05, + "loss": 0.8582, + "step": 12083 + }, + { + "epoch": 0.8187546581746731, + "grad_norm": 5.866037845611572, + "learning_rate": 8.448490656444657e-05, + "loss": 0.8527, + "step": 12084 + }, + { + "epoch": 0.8188224134426452, + "grad_norm": 6.1855854988098145, + "learning_rate": 8.448353754534875e-05, + "loss": 0.8359, + "step": 12085 + }, + { + "epoch": 0.8188901687106173, + "grad_norm": 6.506312370300293, + "learning_rate": 8.448216852625095e-05, + "loss": 0.6803, + "step": 12086 + }, + { + "epoch": 0.8189579239785894, + "grad_norm": 8.091931343078613, + "learning_rate": 8.448079950715313e-05, + "loss": 1.0264, + "step": 12087 + }, + { + "epoch": 0.8190256792465614, + "grad_norm": 6.240049362182617, + "learning_rate": 8.447943048805531e-05, + "loss": 0.7127, + "step": 12088 + }, + { + "epoch": 0.8190934345145335, + "grad_norm": 6.7533650398254395, + "learning_rate": 8.447806146895749e-05, + "loss": 1.0634, + "step": 12089 + }, + { + "epoch": 0.8191611897825056, + "grad_norm": 5.064426422119141, + "learning_rate": 8.447669244985968e-05, + "loss": 0.6213, + "step": 12090 + }, + { + "epoch": 0.8192289450504777, + "grad_norm": 5.491628646850586, + "learning_rate": 8.447532343076186e-05, + "loss": 0.5707, + "step": 12091 + }, + { + "epoch": 0.8192967003184498, + "grad_norm": 7.105623722076416, + "learning_rate": 8.447395441166404e-05, + "loss": 0.9926, + "step": 12092 + }, + { + "epoch": 0.8193644555864218, + "grad_norm": 7.82690954208374, + "learning_rate": 8.447258539256622e-05, + "loss": 0.7902, + "step": 12093 + }, + { + "epoch": 0.8194322108543939, + "grad_norm": 5.6463494300842285, + "learning_rate": 8.44712163734684e-05, + "loss": 0.8812, + "step": 12094 + }, + { + "epoch": 0.819499966122366, + "grad_norm": 7.104325771331787, + "learning_rate": 8.44698473543706e-05, + "loss": 0.6544, + "step": 12095 + }, + { + "epoch": 0.8195677213903381, + "grad_norm": 5.303103923797607, + "learning_rate": 8.446847833527278e-05, + "loss": 0.8304, + "step": 12096 + }, + { + "epoch": 0.8196354766583102, + "grad_norm": 6.566699504852295, + "learning_rate": 8.446710931617496e-05, + "loss": 0.8582, + "step": 12097 + }, + { + "epoch": 0.8197032319262822, + "grad_norm": 7.369137287139893, + "learning_rate": 8.446574029707715e-05, + "loss": 0.7852, + "step": 12098 + }, + { + "epoch": 0.8197709871942543, + "grad_norm": 6.183825492858887, + "learning_rate": 8.446437127797933e-05, + "loss": 0.8196, + "step": 12099 + }, + { + "epoch": 0.8198387424622264, + "grad_norm": 4.837382793426514, + "learning_rate": 8.446300225888151e-05, + "loss": 0.6597, + "step": 12100 + }, + { + "epoch": 0.8199064977301985, + "grad_norm": 6.405309200286865, + "learning_rate": 8.44616332397837e-05, + "loss": 0.7934, + "step": 12101 + }, + { + "epoch": 0.8199742529981706, + "grad_norm": 6.547097682952881, + "learning_rate": 8.446026422068589e-05, + "loss": 0.8375, + "step": 12102 + }, + { + "epoch": 0.8200420082661427, + "grad_norm": 6.001138687133789, + "learning_rate": 8.445889520158807e-05, + "loss": 0.7713, + "step": 12103 + }, + { + "epoch": 0.8201097635341148, + "grad_norm": 5.217280864715576, + "learning_rate": 8.445752618249026e-05, + "loss": 0.605, + "step": 12104 + }, + { + "epoch": 0.8201775188020869, + "grad_norm": 5.498340129852295, + "learning_rate": 8.445615716339244e-05, + "loss": 0.7789, + "step": 12105 + }, + { + "epoch": 0.820245274070059, + "grad_norm": 9.02701187133789, + "learning_rate": 8.445478814429462e-05, + "loss": 0.8801, + "step": 12106 + }, + { + "epoch": 0.8203130293380311, + "grad_norm": 7.654047966003418, + "learning_rate": 8.44534191251968e-05, + "loss": 0.7072, + "step": 12107 + }, + { + "epoch": 0.8203807846060032, + "grad_norm": 5.503271102905273, + "learning_rate": 8.445205010609898e-05, + "loss": 0.7304, + "step": 12108 + }, + { + "epoch": 0.8204485398739751, + "grad_norm": 5.020559310913086, + "learning_rate": 8.445068108700118e-05, + "loss": 0.5477, + "step": 12109 + }, + { + "epoch": 0.8205162951419472, + "grad_norm": 6.803164958953857, + "learning_rate": 8.444931206790336e-05, + "loss": 0.922, + "step": 12110 + }, + { + "epoch": 0.8205840504099193, + "grad_norm": 5.567500114440918, + "learning_rate": 8.444794304880554e-05, + "loss": 0.5649, + "step": 12111 + }, + { + "epoch": 0.8206518056778914, + "grad_norm": 6.515629291534424, + "learning_rate": 8.444657402970772e-05, + "loss": 0.5943, + "step": 12112 + }, + { + "epoch": 0.8207195609458635, + "grad_norm": 6.220151424407959, + "learning_rate": 8.444520501060991e-05, + "loss": 0.7958, + "step": 12113 + }, + { + "epoch": 0.8207873162138356, + "grad_norm": 6.00366735458374, + "learning_rate": 8.444383599151209e-05, + "loss": 0.6698, + "step": 12114 + }, + { + "epoch": 0.8208550714818077, + "grad_norm": 7.6385955810546875, + "learning_rate": 8.444246697241427e-05, + "loss": 0.7866, + "step": 12115 + }, + { + "epoch": 0.8209228267497798, + "grad_norm": 4.94298791885376, + "learning_rate": 8.444109795331645e-05, + "loss": 0.6862, + "step": 12116 + }, + { + "epoch": 0.8209905820177519, + "grad_norm": 4.838351726531982, + "learning_rate": 8.443972893421863e-05, + "loss": 0.6723, + "step": 12117 + }, + { + "epoch": 0.821058337285724, + "grad_norm": 5.836402893066406, + "learning_rate": 8.443835991512083e-05, + "loss": 0.6631, + "step": 12118 + }, + { + "epoch": 0.8211260925536961, + "grad_norm": 6.094921588897705, + "learning_rate": 8.4436990896023e-05, + "loss": 0.6569, + "step": 12119 + }, + { + "epoch": 0.8211938478216682, + "grad_norm": 6.998372554779053, + "learning_rate": 8.443562187692519e-05, + "loss": 0.8489, + "step": 12120 + }, + { + "epoch": 0.8212616030896402, + "grad_norm": 8.239564895629883, + "learning_rate": 8.443425285782737e-05, + "loss": 0.9627, + "step": 12121 + }, + { + "epoch": 0.8213293583576123, + "grad_norm": 5.836661338806152, + "learning_rate": 8.443288383872956e-05, + "loss": 0.6482, + "step": 12122 + }, + { + "epoch": 0.8213971136255844, + "grad_norm": 5.142320156097412, + "learning_rate": 8.443151481963174e-05, + "loss": 0.5763, + "step": 12123 + }, + { + "epoch": 0.8214648688935565, + "grad_norm": 5.773630619049072, + "learning_rate": 8.443014580053392e-05, + "loss": 0.7495, + "step": 12124 + }, + { + "epoch": 0.8215326241615286, + "grad_norm": 6.265152931213379, + "learning_rate": 8.44287767814361e-05, + "loss": 0.732, + "step": 12125 + }, + { + "epoch": 0.8216003794295006, + "grad_norm": 5.69442081451416, + "learning_rate": 8.442740776233828e-05, + "loss": 0.7479, + "step": 12126 + }, + { + "epoch": 0.8216681346974727, + "grad_norm": 5.151772499084473, + "learning_rate": 8.442603874324048e-05, + "loss": 0.9404, + "step": 12127 + }, + { + "epoch": 0.8217358899654448, + "grad_norm": 5.117092609405518, + "learning_rate": 8.442466972414266e-05, + "loss": 0.8087, + "step": 12128 + }, + { + "epoch": 0.8218036452334169, + "grad_norm": 6.638974666595459, + "learning_rate": 8.442330070504484e-05, + "loss": 1.018, + "step": 12129 + }, + { + "epoch": 0.821871400501389, + "grad_norm": 5.713891983032227, + "learning_rate": 8.442193168594702e-05, + "loss": 0.7464, + "step": 12130 + }, + { + "epoch": 0.821939155769361, + "grad_norm": 5.336922645568848, + "learning_rate": 8.44205626668492e-05, + "loss": 0.7876, + "step": 12131 + }, + { + "epoch": 0.8220069110373331, + "grad_norm": 6.789290904998779, + "learning_rate": 8.441919364775139e-05, + "loss": 0.8613, + "step": 12132 + }, + { + "epoch": 0.8220746663053052, + "grad_norm": 5.347286701202393, + "learning_rate": 8.441782462865357e-05, + "loss": 0.899, + "step": 12133 + }, + { + "epoch": 0.8221424215732773, + "grad_norm": 5.311189651489258, + "learning_rate": 8.441645560955575e-05, + "loss": 0.6064, + "step": 12134 + }, + { + "epoch": 0.8222101768412494, + "grad_norm": 5.930995464324951, + "learning_rate": 8.441508659045793e-05, + "loss": 0.7213, + "step": 12135 + }, + { + "epoch": 0.8222779321092215, + "grad_norm": 7.024041652679443, + "learning_rate": 8.441371757136013e-05, + "loss": 0.7926, + "step": 12136 + }, + { + "epoch": 0.8223456873771936, + "grad_norm": 5.4607768058776855, + "learning_rate": 8.44123485522623e-05, + "loss": 0.752, + "step": 12137 + }, + { + "epoch": 0.8224134426451657, + "grad_norm": 6.063724994659424, + "learning_rate": 8.441097953316449e-05, + "loss": 0.6209, + "step": 12138 + }, + { + "epoch": 0.8224811979131378, + "grad_norm": 7.265159606933594, + "learning_rate": 8.440961051406667e-05, + "loss": 0.65, + "step": 12139 + }, + { + "epoch": 0.8225489531811099, + "grad_norm": 7.551514148712158, + "learning_rate": 8.440824149496885e-05, + "loss": 1.0007, + "step": 12140 + }, + { + "epoch": 0.822616708449082, + "grad_norm": 4.997889995574951, + "learning_rate": 8.440687247587104e-05, + "loss": 0.6301, + "step": 12141 + }, + { + "epoch": 0.822684463717054, + "grad_norm": 5.728504657745361, + "learning_rate": 8.440550345677322e-05, + "loss": 0.8397, + "step": 12142 + }, + { + "epoch": 0.822752218985026, + "grad_norm": 6.213530540466309, + "learning_rate": 8.44041344376754e-05, + "loss": 0.7488, + "step": 12143 + }, + { + "epoch": 0.8228199742529981, + "grad_norm": 8.768404006958008, + "learning_rate": 8.44027654185776e-05, + "loss": 0.9469, + "step": 12144 + }, + { + "epoch": 0.8228877295209702, + "grad_norm": 7.084804534912109, + "learning_rate": 8.440139639947978e-05, + "loss": 0.697, + "step": 12145 + }, + { + "epoch": 0.8229554847889423, + "grad_norm": 4.6725239753723145, + "learning_rate": 8.440002738038196e-05, + "loss": 0.6902, + "step": 12146 + }, + { + "epoch": 0.8230232400569144, + "grad_norm": 5.927494049072266, + "learning_rate": 8.439865836128415e-05, + "loss": 0.7912, + "step": 12147 + }, + { + "epoch": 0.8230909953248865, + "grad_norm": 5.5850419998168945, + "learning_rate": 8.439728934218633e-05, + "loss": 0.6394, + "step": 12148 + }, + { + "epoch": 0.8231587505928586, + "grad_norm": 6.570766448974609, + "learning_rate": 8.439592032308851e-05, + "loss": 1.004, + "step": 12149 + }, + { + "epoch": 0.8232265058608307, + "grad_norm": 6.343209266662598, + "learning_rate": 8.43945513039907e-05, + "loss": 0.5969, + "step": 12150 + }, + { + "epoch": 0.8232942611288028, + "grad_norm": 7.24255895614624, + "learning_rate": 8.439318228489289e-05, + "loss": 0.9409, + "step": 12151 + }, + { + "epoch": 0.8233620163967749, + "grad_norm": 5.600708961486816, + "learning_rate": 8.439181326579507e-05, + "loss": 1.0606, + "step": 12152 + }, + { + "epoch": 0.823429771664747, + "grad_norm": 7.920993804931641, + "learning_rate": 8.439044424669725e-05, + "loss": 0.8452, + "step": 12153 + }, + { + "epoch": 0.823497526932719, + "grad_norm": 6.625662326812744, + "learning_rate": 8.438907522759944e-05, + "loss": 0.8304, + "step": 12154 + }, + { + "epoch": 0.8235652822006911, + "grad_norm": 7.728579998016357, + "learning_rate": 8.438770620850162e-05, + "loss": 0.7777, + "step": 12155 + }, + { + "epoch": 0.8236330374686632, + "grad_norm": 7.7875752449035645, + "learning_rate": 8.43863371894038e-05, + "loss": 0.8318, + "step": 12156 + }, + { + "epoch": 0.8237007927366353, + "grad_norm": 5.524309158325195, + "learning_rate": 8.438496817030598e-05, + "loss": 0.7924, + "step": 12157 + }, + { + "epoch": 0.8237685480046073, + "grad_norm": 4.976869106292725, + "learning_rate": 8.438359915120816e-05, + "loss": 0.7819, + "step": 12158 + }, + { + "epoch": 0.8238363032725794, + "grad_norm": 7.557702541351318, + "learning_rate": 8.438223013211036e-05, + "loss": 0.6508, + "step": 12159 + }, + { + "epoch": 0.8239040585405515, + "grad_norm": 6.236000061035156, + "learning_rate": 8.438086111301254e-05, + "loss": 0.9233, + "step": 12160 + }, + { + "epoch": 0.8239718138085236, + "grad_norm": 4.986820220947266, + "learning_rate": 8.437949209391472e-05, + "loss": 0.7341, + "step": 12161 + }, + { + "epoch": 0.8240395690764957, + "grad_norm": 6.939243793487549, + "learning_rate": 8.43781230748169e-05, + "loss": 1.0755, + "step": 12162 + }, + { + "epoch": 0.8241073243444678, + "grad_norm": 5.3395843505859375, + "learning_rate": 8.437675405571908e-05, + "loss": 0.6432, + "step": 12163 + }, + { + "epoch": 0.8241750796124399, + "grad_norm": 5.462789058685303, + "learning_rate": 8.437538503662127e-05, + "loss": 0.7004, + "step": 12164 + }, + { + "epoch": 0.824242834880412, + "grad_norm": 7.232882022857666, + "learning_rate": 8.437401601752345e-05, + "loss": 0.7802, + "step": 12165 + }, + { + "epoch": 0.824310590148384, + "grad_norm": 4.660044193267822, + "learning_rate": 8.437264699842563e-05, + "loss": 0.6253, + "step": 12166 + }, + { + "epoch": 0.8243783454163561, + "grad_norm": 6.779306888580322, + "learning_rate": 8.437127797932781e-05, + "loss": 0.9231, + "step": 12167 + }, + { + "epoch": 0.8244461006843282, + "grad_norm": 5.5832905769348145, + "learning_rate": 8.436990896023e-05, + "loss": 0.8409, + "step": 12168 + }, + { + "epoch": 0.8245138559523003, + "grad_norm": 6.603589057922363, + "learning_rate": 8.436853994113219e-05, + "loss": 0.897, + "step": 12169 + }, + { + "epoch": 0.8245816112202724, + "grad_norm": 7.075769424438477, + "learning_rate": 8.436717092203437e-05, + "loss": 0.6168, + "step": 12170 + }, + { + "epoch": 0.8246493664882445, + "grad_norm": 5.7542948722839355, + "learning_rate": 8.436580190293655e-05, + "loss": 0.7924, + "step": 12171 + }, + { + "epoch": 0.8247171217562166, + "grad_norm": 6.221360683441162, + "learning_rate": 8.436443288383873e-05, + "loss": 0.8444, + "step": 12172 + }, + { + "epoch": 0.8247848770241887, + "grad_norm": 5.970016002655029, + "learning_rate": 8.436306386474092e-05, + "loss": 0.8126, + "step": 12173 + }, + { + "epoch": 0.8248526322921608, + "grad_norm": 5.611728668212891, + "learning_rate": 8.43616948456431e-05, + "loss": 0.8069, + "step": 12174 + }, + { + "epoch": 0.8249203875601328, + "grad_norm": 5.260378360748291, + "learning_rate": 8.436032582654528e-05, + "loss": 0.7535, + "step": 12175 + }, + { + "epoch": 0.8249881428281048, + "grad_norm": 11.273239135742188, + "learning_rate": 8.435895680744746e-05, + "loss": 0.6942, + "step": 12176 + }, + { + "epoch": 0.8250558980960769, + "grad_norm": 5.525880813598633, + "learning_rate": 8.435758778834966e-05, + "loss": 0.8583, + "step": 12177 + }, + { + "epoch": 0.825123653364049, + "grad_norm": 8.410612106323242, + "learning_rate": 8.435621876925184e-05, + "loss": 0.852, + "step": 12178 + }, + { + "epoch": 0.8251914086320211, + "grad_norm": 6.29329252243042, + "learning_rate": 8.435484975015402e-05, + "loss": 0.7961, + "step": 12179 + }, + { + "epoch": 0.8252591638999932, + "grad_norm": 5.848037242889404, + "learning_rate": 8.43534807310562e-05, + "loss": 0.7964, + "step": 12180 + }, + { + "epoch": 0.8253269191679653, + "grad_norm": 5.5981669425964355, + "learning_rate": 8.435211171195838e-05, + "loss": 0.7474, + "step": 12181 + }, + { + "epoch": 0.8253946744359374, + "grad_norm": 7.036654949188232, + "learning_rate": 8.435074269286057e-05, + "loss": 0.9101, + "step": 12182 + }, + { + "epoch": 0.8254624297039095, + "grad_norm": 6.488468170166016, + "learning_rate": 8.434937367376275e-05, + "loss": 1.1791, + "step": 12183 + }, + { + "epoch": 0.8255301849718816, + "grad_norm": 6.368350505828857, + "learning_rate": 8.434800465466493e-05, + "loss": 0.7757, + "step": 12184 + }, + { + "epoch": 0.8255979402398537, + "grad_norm": 5.316969394683838, + "learning_rate": 8.434663563556711e-05, + "loss": 0.7681, + "step": 12185 + }, + { + "epoch": 0.8256656955078258, + "grad_norm": 6.011645793914795, + "learning_rate": 8.434526661646929e-05, + "loss": 0.7887, + "step": 12186 + }, + { + "epoch": 0.8257334507757979, + "grad_norm": 6.3625664710998535, + "learning_rate": 8.434389759737149e-05, + "loss": 0.7103, + "step": 12187 + }, + { + "epoch": 0.8258012060437699, + "grad_norm": 5.478143215179443, + "learning_rate": 8.434252857827367e-05, + "loss": 0.6532, + "step": 12188 + }, + { + "epoch": 0.825868961311742, + "grad_norm": 6.654770851135254, + "learning_rate": 8.434115955917585e-05, + "loss": 0.7328, + "step": 12189 + }, + { + "epoch": 0.8259367165797141, + "grad_norm": 5.170722007751465, + "learning_rate": 8.433979054007804e-05, + "loss": 0.7592, + "step": 12190 + }, + { + "epoch": 0.8260044718476861, + "grad_norm": 5.70284366607666, + "learning_rate": 8.433842152098022e-05, + "loss": 0.7273, + "step": 12191 + }, + { + "epoch": 0.8260722271156582, + "grad_norm": 5.822709560394287, + "learning_rate": 8.43370525018824e-05, + "loss": 0.9407, + "step": 12192 + }, + { + "epoch": 0.8261399823836303, + "grad_norm": 7.529512882232666, + "learning_rate": 8.43356834827846e-05, + "loss": 0.9318, + "step": 12193 + }, + { + "epoch": 0.8262077376516024, + "grad_norm": 5.989007949829102, + "learning_rate": 8.433431446368678e-05, + "loss": 0.6854, + "step": 12194 + }, + { + "epoch": 0.8262754929195745, + "grad_norm": 5.390767574310303, + "learning_rate": 8.433294544458896e-05, + "loss": 0.9111, + "step": 12195 + }, + { + "epoch": 0.8263432481875466, + "grad_norm": 6.274799346923828, + "learning_rate": 8.433157642549115e-05, + "loss": 0.5621, + "step": 12196 + }, + { + "epoch": 0.8264110034555187, + "grad_norm": 6.217073917388916, + "learning_rate": 8.433020740639333e-05, + "loss": 0.8127, + "step": 12197 + }, + { + "epoch": 0.8264787587234907, + "grad_norm": 5.437521457672119, + "learning_rate": 8.432883838729551e-05, + "loss": 0.6947, + "step": 12198 + }, + { + "epoch": 0.8265465139914628, + "grad_norm": 6.302811145782471, + "learning_rate": 8.432746936819769e-05, + "loss": 0.855, + "step": 12199 + }, + { + "epoch": 0.8266142692594349, + "grad_norm": 6.268338203430176, + "learning_rate": 8.432610034909988e-05, + "loss": 0.8669, + "step": 12200 + }, + { + "epoch": 0.826682024527407, + "grad_norm": 6.356218338012695, + "learning_rate": 8.432473133000207e-05, + "loss": 0.7883, + "step": 12201 + }, + { + "epoch": 0.8267497797953791, + "grad_norm": 7.442529201507568, + "learning_rate": 8.432336231090425e-05, + "loss": 1.159, + "step": 12202 + }, + { + "epoch": 0.8268175350633512, + "grad_norm": 6.274961471557617, + "learning_rate": 8.432199329180643e-05, + "loss": 1.0073, + "step": 12203 + }, + { + "epoch": 0.8268852903313233, + "grad_norm": 5.910490036010742, + "learning_rate": 8.43206242727086e-05, + "loss": 0.781, + "step": 12204 + }, + { + "epoch": 0.8269530455992954, + "grad_norm": 8.246582984924316, + "learning_rate": 8.43192552536108e-05, + "loss": 0.9337, + "step": 12205 + }, + { + "epoch": 0.8270208008672675, + "grad_norm": 6.15574312210083, + "learning_rate": 8.431788623451298e-05, + "loss": 0.7707, + "step": 12206 + }, + { + "epoch": 0.8270885561352395, + "grad_norm": 5.993718147277832, + "learning_rate": 8.431651721541516e-05, + "loss": 0.73, + "step": 12207 + }, + { + "epoch": 0.8271563114032116, + "grad_norm": 6.5491533279418945, + "learning_rate": 8.431514819631734e-05, + "loss": 0.6576, + "step": 12208 + }, + { + "epoch": 0.8272240666711836, + "grad_norm": 5.608559608459473, + "learning_rate": 8.431377917721953e-05, + "loss": 0.7199, + "step": 12209 + }, + { + "epoch": 0.8272918219391557, + "grad_norm": 6.178080081939697, + "learning_rate": 8.431241015812172e-05, + "loss": 0.5885, + "step": 12210 + }, + { + "epoch": 0.8273595772071278, + "grad_norm": 6.405505657196045, + "learning_rate": 8.43110411390239e-05, + "loss": 0.7247, + "step": 12211 + }, + { + "epoch": 0.8274273324750999, + "grad_norm": 4.907893180847168, + "learning_rate": 8.430967211992608e-05, + "loss": 0.6837, + "step": 12212 + }, + { + "epoch": 0.827495087743072, + "grad_norm": 4.041346549987793, + "learning_rate": 8.430830310082826e-05, + "loss": 0.5236, + "step": 12213 + }, + { + "epoch": 0.8275628430110441, + "grad_norm": 5.496605396270752, + "learning_rate": 8.430693408173045e-05, + "loss": 0.6765, + "step": 12214 + }, + { + "epoch": 0.8276305982790162, + "grad_norm": 5.595060348510742, + "learning_rate": 8.430556506263263e-05, + "loss": 1.0309, + "step": 12215 + }, + { + "epoch": 0.8276983535469883, + "grad_norm": 5.200067043304443, + "learning_rate": 8.430419604353481e-05, + "loss": 0.6944, + "step": 12216 + }, + { + "epoch": 0.8277661088149604, + "grad_norm": 6.38013219833374, + "learning_rate": 8.430282702443699e-05, + "loss": 1.1399, + "step": 12217 + }, + { + "epoch": 0.8278338640829325, + "grad_norm": 5.848254203796387, + "learning_rate": 8.430145800533917e-05, + "loss": 0.7259, + "step": 12218 + }, + { + "epoch": 0.8279016193509046, + "grad_norm": 7.698366641998291, + "learning_rate": 8.430008898624137e-05, + "loss": 0.8736, + "step": 12219 + }, + { + "epoch": 0.8279693746188767, + "grad_norm": 6.256243705749512, + "learning_rate": 8.429871996714355e-05, + "loss": 0.8018, + "step": 12220 + }, + { + "epoch": 0.8280371298868487, + "grad_norm": 7.630728721618652, + "learning_rate": 8.429735094804573e-05, + "loss": 0.6287, + "step": 12221 + }, + { + "epoch": 0.8281048851548208, + "grad_norm": 5.663332939147949, + "learning_rate": 8.42959819289479e-05, + "loss": 0.7079, + "step": 12222 + }, + { + "epoch": 0.8281726404227929, + "grad_norm": 9.478702545166016, + "learning_rate": 8.42946129098501e-05, + "loss": 0.9036, + "step": 12223 + }, + { + "epoch": 0.8282403956907649, + "grad_norm": 5.983333110809326, + "learning_rate": 8.429324389075228e-05, + "loss": 0.777, + "step": 12224 + }, + { + "epoch": 0.828308150958737, + "grad_norm": 5.364030361175537, + "learning_rate": 8.429187487165446e-05, + "loss": 0.8591, + "step": 12225 + }, + { + "epoch": 0.8283759062267091, + "grad_norm": 6.575251579284668, + "learning_rate": 8.429050585255664e-05, + "loss": 0.8834, + "step": 12226 + }, + { + "epoch": 0.8284436614946812, + "grad_norm": 6.744650840759277, + "learning_rate": 8.428913683345882e-05, + "loss": 0.8544, + "step": 12227 + }, + { + "epoch": 0.8285114167626533, + "grad_norm": 7.059446334838867, + "learning_rate": 8.428776781436102e-05, + "loss": 0.7385, + "step": 12228 + }, + { + "epoch": 0.8285791720306254, + "grad_norm": 5.541356086730957, + "learning_rate": 8.42863987952632e-05, + "loss": 0.7784, + "step": 12229 + }, + { + "epoch": 0.8286469272985975, + "grad_norm": 7.66465425491333, + "learning_rate": 8.428502977616538e-05, + "loss": 0.7394, + "step": 12230 + }, + { + "epoch": 0.8287146825665695, + "grad_norm": 5.495204448699951, + "learning_rate": 8.428366075706756e-05, + "loss": 0.6893, + "step": 12231 + }, + { + "epoch": 0.8287824378345416, + "grad_norm": 5.460160255432129, + "learning_rate": 8.428229173796975e-05, + "loss": 0.8543, + "step": 12232 + }, + { + "epoch": 0.8288501931025137, + "grad_norm": 5.867201805114746, + "learning_rate": 8.428092271887193e-05, + "loss": 0.7469, + "step": 12233 + }, + { + "epoch": 0.8289179483704858, + "grad_norm": 5.914271354675293, + "learning_rate": 8.427955369977411e-05, + "loss": 0.9328, + "step": 12234 + }, + { + "epoch": 0.8289857036384579, + "grad_norm": 7.290322303771973, + "learning_rate": 8.427818468067629e-05, + "loss": 0.936, + "step": 12235 + }, + { + "epoch": 0.82905345890643, + "grad_norm": 7.530186653137207, + "learning_rate": 8.427681566157849e-05, + "loss": 0.7333, + "step": 12236 + }, + { + "epoch": 0.8291212141744021, + "grad_norm": 7.260166168212891, + "learning_rate": 8.427544664248067e-05, + "loss": 0.7697, + "step": 12237 + }, + { + "epoch": 0.8291889694423742, + "grad_norm": 5.8253703117370605, + "learning_rate": 8.427407762338285e-05, + "loss": 0.8838, + "step": 12238 + }, + { + "epoch": 0.8292567247103463, + "grad_norm": 6.672026634216309, + "learning_rate": 8.427270860428504e-05, + "loss": 0.5896, + "step": 12239 + }, + { + "epoch": 0.8293244799783183, + "grad_norm": 5.347162246704102, + "learning_rate": 8.427133958518722e-05, + "loss": 0.7497, + "step": 12240 + }, + { + "epoch": 0.8293922352462904, + "grad_norm": 8.071971893310547, + "learning_rate": 8.42699705660894e-05, + "loss": 0.9985, + "step": 12241 + }, + { + "epoch": 0.8294599905142624, + "grad_norm": 5.0009636878967285, + "learning_rate": 8.42686015469916e-05, + "loss": 0.8166, + "step": 12242 + }, + { + "epoch": 0.8295277457822345, + "grad_norm": 5.593808174133301, + "learning_rate": 8.426723252789377e-05, + "loss": 0.6928, + "step": 12243 + }, + { + "epoch": 0.8295955010502066, + "grad_norm": 5.249474048614502, + "learning_rate": 8.426586350879596e-05, + "loss": 0.7625, + "step": 12244 + }, + { + "epoch": 0.8296632563181787, + "grad_norm": 5.920688152313232, + "learning_rate": 8.426449448969814e-05, + "loss": 0.7011, + "step": 12245 + }, + { + "epoch": 0.8297310115861508, + "grad_norm": 9.127151489257812, + "learning_rate": 8.426312547060033e-05, + "loss": 0.917, + "step": 12246 + }, + { + "epoch": 0.8297987668541229, + "grad_norm": 6.6722822189331055, + "learning_rate": 8.426175645150251e-05, + "loss": 0.7894, + "step": 12247 + }, + { + "epoch": 0.829866522122095, + "grad_norm": 7.910020351409912, + "learning_rate": 8.426038743240469e-05, + "loss": 0.6994, + "step": 12248 + }, + { + "epoch": 0.8299342773900671, + "grad_norm": 7.736839294433594, + "learning_rate": 8.425901841330687e-05, + "loss": 0.6598, + "step": 12249 + }, + { + "epoch": 0.8300020326580392, + "grad_norm": 5.101329803466797, + "learning_rate": 8.425764939420905e-05, + "loss": 0.8365, + "step": 12250 + }, + { + "epoch": 0.8300697879260113, + "grad_norm": 6.494842052459717, + "learning_rate": 8.425628037511124e-05, + "loss": 0.8936, + "step": 12251 + }, + { + "epoch": 0.8301375431939834, + "grad_norm": 5.946401596069336, + "learning_rate": 8.425491135601343e-05, + "loss": 0.8427, + "step": 12252 + }, + { + "epoch": 0.8302052984619555, + "grad_norm": 5.827462673187256, + "learning_rate": 8.42535423369156e-05, + "loss": 0.7694, + "step": 12253 + }, + { + "epoch": 0.8302730537299275, + "grad_norm": 5.955854415893555, + "learning_rate": 8.425217331781779e-05, + "loss": 0.8644, + "step": 12254 + }, + { + "epoch": 0.8303408089978996, + "grad_norm": 5.749096870422363, + "learning_rate": 8.425080429871998e-05, + "loss": 0.7225, + "step": 12255 + }, + { + "epoch": 0.8304085642658716, + "grad_norm": 5.03810453414917, + "learning_rate": 8.424943527962216e-05, + "loss": 0.5006, + "step": 12256 + }, + { + "epoch": 0.8304763195338437, + "grad_norm": 7.007089614868164, + "learning_rate": 8.424806626052434e-05, + "loss": 0.9485, + "step": 12257 + }, + { + "epoch": 0.8305440748018158, + "grad_norm": 5.671133041381836, + "learning_rate": 8.424669724142652e-05, + "loss": 0.6842, + "step": 12258 + }, + { + "epoch": 0.8306118300697879, + "grad_norm": 4.326511383056641, + "learning_rate": 8.42453282223287e-05, + "loss": 0.7619, + "step": 12259 + }, + { + "epoch": 0.83067958533776, + "grad_norm": 4.897543907165527, + "learning_rate": 8.42439592032309e-05, + "loss": 0.6889, + "step": 12260 + }, + { + "epoch": 0.8307473406057321, + "grad_norm": 5.780319690704346, + "learning_rate": 8.424259018413308e-05, + "loss": 1.0366, + "step": 12261 + }, + { + "epoch": 0.8308150958737042, + "grad_norm": 6.754616737365723, + "learning_rate": 8.424122116503526e-05, + "loss": 0.9706, + "step": 12262 + }, + { + "epoch": 0.8308828511416763, + "grad_norm": 5.684625625610352, + "learning_rate": 8.423985214593744e-05, + "loss": 0.7783, + "step": 12263 + }, + { + "epoch": 0.8309506064096484, + "grad_norm": 5.692160129547119, + "learning_rate": 8.423848312683962e-05, + "loss": 0.6993, + "step": 12264 + }, + { + "epoch": 0.8310183616776204, + "grad_norm": 5.838659286499023, + "learning_rate": 8.423711410774181e-05, + "loss": 0.8698, + "step": 12265 + }, + { + "epoch": 0.8310861169455925, + "grad_norm": 5.119133472442627, + "learning_rate": 8.423574508864399e-05, + "loss": 0.8026, + "step": 12266 + }, + { + "epoch": 0.8311538722135646, + "grad_norm": 6.305530071258545, + "learning_rate": 8.423437606954617e-05, + "loss": 0.8705, + "step": 12267 + }, + { + "epoch": 0.8312216274815367, + "grad_norm": 5.659543037414551, + "learning_rate": 8.423300705044835e-05, + "loss": 0.7701, + "step": 12268 + }, + { + "epoch": 0.8312893827495088, + "grad_norm": 6.706612586975098, + "learning_rate": 8.423163803135055e-05, + "loss": 0.6325, + "step": 12269 + }, + { + "epoch": 0.8313571380174809, + "grad_norm": 6.535792827606201, + "learning_rate": 8.423026901225273e-05, + "loss": 0.7957, + "step": 12270 + }, + { + "epoch": 0.831424893285453, + "grad_norm": 5.140613079071045, + "learning_rate": 8.42288999931549e-05, + "loss": 0.7975, + "step": 12271 + }, + { + "epoch": 0.831492648553425, + "grad_norm": 5.579657554626465, + "learning_rate": 8.422753097405709e-05, + "loss": 0.6756, + "step": 12272 + }, + { + "epoch": 0.8315604038213971, + "grad_norm": 8.427629470825195, + "learning_rate": 8.422616195495927e-05, + "loss": 0.7253, + "step": 12273 + }, + { + "epoch": 0.8316281590893692, + "grad_norm": 5.536694526672363, + "learning_rate": 8.422479293586146e-05, + "loss": 0.8035, + "step": 12274 + }, + { + "epoch": 0.8316959143573412, + "grad_norm": 7.416363716125488, + "learning_rate": 8.422342391676364e-05, + "loss": 1.0227, + "step": 12275 + }, + { + "epoch": 0.8317636696253133, + "grad_norm": 8.551451683044434, + "learning_rate": 8.422205489766582e-05, + "loss": 0.9759, + "step": 12276 + }, + { + "epoch": 0.8318314248932854, + "grad_norm": 6.636923789978027, + "learning_rate": 8.4220685878568e-05, + "loss": 0.8017, + "step": 12277 + }, + { + "epoch": 0.8318991801612575, + "grad_norm": 5.85496711730957, + "learning_rate": 8.42193168594702e-05, + "loss": 0.7672, + "step": 12278 + }, + { + "epoch": 0.8319669354292296, + "grad_norm": 5.499538898468018, + "learning_rate": 8.421794784037238e-05, + "loss": 0.6539, + "step": 12279 + }, + { + "epoch": 0.8320346906972017, + "grad_norm": 5.9310150146484375, + "learning_rate": 8.421657882127456e-05, + "loss": 0.6745, + "step": 12280 + }, + { + "epoch": 0.8321024459651738, + "grad_norm": 5.379483222961426, + "learning_rate": 8.421520980217674e-05, + "loss": 0.7548, + "step": 12281 + }, + { + "epoch": 0.8321702012331459, + "grad_norm": 5.287676811218262, + "learning_rate": 8.421384078307892e-05, + "loss": 0.7051, + "step": 12282 + }, + { + "epoch": 0.832237956501118, + "grad_norm": 8.057753562927246, + "learning_rate": 8.421247176398111e-05, + "loss": 0.7589, + "step": 12283 + }, + { + "epoch": 0.8323057117690901, + "grad_norm": 6.534327983856201, + "learning_rate": 8.421110274488329e-05, + "loss": 0.7855, + "step": 12284 + }, + { + "epoch": 0.8323734670370622, + "grad_norm": 7.2967753410339355, + "learning_rate": 8.420973372578547e-05, + "loss": 0.7163, + "step": 12285 + }, + { + "epoch": 0.8324412223050343, + "grad_norm": 7.196539402008057, + "learning_rate": 8.420836470668767e-05, + "loss": 0.8134, + "step": 12286 + }, + { + "epoch": 0.8325089775730063, + "grad_norm": 5.886240005493164, + "learning_rate": 8.420699568758985e-05, + "loss": 0.7407, + "step": 12287 + }, + { + "epoch": 0.8325767328409784, + "grad_norm": 6.262639045715332, + "learning_rate": 8.420562666849204e-05, + "loss": 0.7359, + "step": 12288 + }, + { + "epoch": 0.8326444881089504, + "grad_norm": 6.491570949554443, + "learning_rate": 8.420425764939422e-05, + "loss": 0.7787, + "step": 12289 + }, + { + "epoch": 0.8327122433769225, + "grad_norm": 5.070222854614258, + "learning_rate": 8.42028886302964e-05, + "loss": 0.7032, + "step": 12290 + }, + { + "epoch": 0.8327799986448946, + "grad_norm": 5.188915252685547, + "learning_rate": 8.420151961119858e-05, + "loss": 0.7538, + "step": 12291 + }, + { + "epoch": 0.8328477539128667, + "grad_norm": 5.764748573303223, + "learning_rate": 8.420015059210077e-05, + "loss": 0.8324, + "step": 12292 + }, + { + "epoch": 0.8329155091808388, + "grad_norm": 5.611788272857666, + "learning_rate": 8.419878157300295e-05, + "loss": 0.6855, + "step": 12293 + }, + { + "epoch": 0.8329832644488109, + "grad_norm": 5.202719688415527, + "learning_rate": 8.419741255390513e-05, + "loss": 0.7333, + "step": 12294 + }, + { + "epoch": 0.833051019716783, + "grad_norm": 5.643661975860596, + "learning_rate": 8.419604353480732e-05, + "loss": 0.8347, + "step": 12295 + }, + { + "epoch": 0.8331187749847551, + "grad_norm": 5.144847869873047, + "learning_rate": 8.41946745157095e-05, + "loss": 0.6602, + "step": 12296 + }, + { + "epoch": 0.8331865302527272, + "grad_norm": 6.511287212371826, + "learning_rate": 8.419330549661169e-05, + "loss": 0.6691, + "step": 12297 + }, + { + "epoch": 0.8332542855206992, + "grad_norm": 6.4027276039123535, + "learning_rate": 8.419193647751387e-05, + "loss": 0.7781, + "step": 12298 + }, + { + "epoch": 0.8333220407886713, + "grad_norm": 6.630940914154053, + "learning_rate": 8.419056745841605e-05, + "loss": 0.8879, + "step": 12299 + }, + { + "epoch": 0.8333897960566434, + "grad_norm": 7.193549633026123, + "learning_rate": 8.418919843931823e-05, + "loss": 1.0063, + "step": 12300 + }, + { + "epoch": 0.8334575513246155, + "grad_norm": 5.909510612487793, + "learning_rate": 8.418782942022042e-05, + "loss": 0.8406, + "step": 12301 + }, + { + "epoch": 0.8335253065925876, + "grad_norm": 5.470668315887451, + "learning_rate": 8.41864604011226e-05, + "loss": 0.7256, + "step": 12302 + }, + { + "epoch": 0.8335930618605597, + "grad_norm": 5.57338809967041, + "learning_rate": 8.418509138202479e-05, + "loss": 0.6156, + "step": 12303 + }, + { + "epoch": 0.8336608171285318, + "grad_norm": 5.950277805328369, + "learning_rate": 8.418372236292697e-05, + "loss": 0.7232, + "step": 12304 + }, + { + "epoch": 0.8337285723965038, + "grad_norm": 4.642901420593262, + "learning_rate": 8.418235334382915e-05, + "loss": 0.634, + "step": 12305 + }, + { + "epoch": 0.8337963276644759, + "grad_norm": 7.471027374267578, + "learning_rate": 8.418098432473134e-05, + "loss": 1.0318, + "step": 12306 + }, + { + "epoch": 0.833864082932448, + "grad_norm": 5.720177173614502, + "learning_rate": 8.417961530563352e-05, + "loss": 0.806, + "step": 12307 + }, + { + "epoch": 0.83393183820042, + "grad_norm": 7.918213844299316, + "learning_rate": 8.41782462865357e-05, + "loss": 0.8564, + "step": 12308 + }, + { + "epoch": 0.8339995934683921, + "grad_norm": 6.492531776428223, + "learning_rate": 8.417687726743788e-05, + "loss": 1.0221, + "step": 12309 + }, + { + "epoch": 0.8340673487363642, + "grad_norm": 5.253319263458252, + "learning_rate": 8.417550824834007e-05, + "loss": 0.7599, + "step": 12310 + }, + { + "epoch": 0.8341351040043363, + "grad_norm": 6.8917975425720215, + "learning_rate": 8.417413922924225e-05, + "loss": 0.7486, + "step": 12311 + }, + { + "epoch": 0.8342028592723084, + "grad_norm": 7.639297962188721, + "learning_rate": 8.417277021014444e-05, + "loss": 0.9052, + "step": 12312 + }, + { + "epoch": 0.8342706145402805, + "grad_norm": 6.974343776702881, + "learning_rate": 8.417140119104662e-05, + "loss": 0.6297, + "step": 12313 + }, + { + "epoch": 0.8343383698082526, + "grad_norm": 5.5928802490234375, + "learning_rate": 8.41700321719488e-05, + "loss": 0.8208, + "step": 12314 + }, + { + "epoch": 0.8344061250762247, + "grad_norm": 5.503357410430908, + "learning_rate": 8.416866315285099e-05, + "loss": 0.7472, + "step": 12315 + }, + { + "epoch": 0.8344738803441968, + "grad_norm": 6.915671348571777, + "learning_rate": 8.416729413375317e-05, + "loss": 0.7894, + "step": 12316 + }, + { + "epoch": 0.8345416356121689, + "grad_norm": 8.041422843933105, + "learning_rate": 8.416592511465535e-05, + "loss": 0.9854, + "step": 12317 + }, + { + "epoch": 0.834609390880141, + "grad_norm": 6.027585506439209, + "learning_rate": 8.416455609555753e-05, + "loss": 1.0832, + "step": 12318 + }, + { + "epoch": 0.8346771461481131, + "grad_norm": 7.771341800689697, + "learning_rate": 8.416318707645971e-05, + "loss": 0.9197, + "step": 12319 + }, + { + "epoch": 0.8347449014160851, + "grad_norm": 5.460988521575928, + "learning_rate": 8.41618180573619e-05, + "loss": 0.7626, + "step": 12320 + }, + { + "epoch": 0.8348126566840571, + "grad_norm": 5.957553863525391, + "learning_rate": 8.416044903826409e-05, + "loss": 0.5958, + "step": 12321 + }, + { + "epoch": 0.8348804119520292, + "grad_norm": 6.164322853088379, + "learning_rate": 8.415908001916627e-05, + "loss": 0.7692, + "step": 12322 + }, + { + "epoch": 0.8349481672200013, + "grad_norm": 6.864661693572998, + "learning_rate": 8.415771100006845e-05, + "loss": 0.8181, + "step": 12323 + }, + { + "epoch": 0.8350159224879734, + "grad_norm": 4.985629558563232, + "learning_rate": 8.415634198097064e-05, + "loss": 0.8815, + "step": 12324 + }, + { + "epoch": 0.8350836777559455, + "grad_norm": 6.169389724731445, + "learning_rate": 8.415497296187282e-05, + "loss": 0.7982, + "step": 12325 + }, + { + "epoch": 0.8351514330239176, + "grad_norm": 7.641390800476074, + "learning_rate": 8.4153603942775e-05, + "loss": 0.7122, + "step": 12326 + }, + { + "epoch": 0.8352191882918897, + "grad_norm": 5.388051509857178, + "learning_rate": 8.415223492367718e-05, + "loss": 0.9924, + "step": 12327 + }, + { + "epoch": 0.8352869435598618, + "grad_norm": 6.870946407318115, + "learning_rate": 8.415086590457936e-05, + "loss": 0.5997, + "step": 12328 + }, + { + "epoch": 0.8353546988278339, + "grad_norm": 7.626512050628662, + "learning_rate": 8.414949688548156e-05, + "loss": 0.7789, + "step": 12329 + }, + { + "epoch": 0.835422454095806, + "grad_norm": 5.378123760223389, + "learning_rate": 8.414812786638374e-05, + "loss": 0.8172, + "step": 12330 + }, + { + "epoch": 0.835490209363778, + "grad_norm": 6.5015716552734375, + "learning_rate": 8.414675884728592e-05, + "loss": 0.805, + "step": 12331 + }, + { + "epoch": 0.8355579646317501, + "grad_norm": 5.173733711242676, + "learning_rate": 8.414538982818811e-05, + "loss": 0.6039, + "step": 12332 + }, + { + "epoch": 0.8356257198997222, + "grad_norm": 5.68528413772583, + "learning_rate": 8.414402080909029e-05, + "loss": 0.8144, + "step": 12333 + }, + { + "epoch": 0.8356934751676943, + "grad_norm": 6.874687671661377, + "learning_rate": 8.414265178999247e-05, + "loss": 0.889, + "step": 12334 + }, + { + "epoch": 0.8357612304356664, + "grad_norm": 8.360238075256348, + "learning_rate": 8.414128277089466e-05, + "loss": 0.9033, + "step": 12335 + }, + { + "epoch": 0.8358289857036385, + "grad_norm": 5.4390106201171875, + "learning_rate": 8.413991375179684e-05, + "loss": 0.6946, + "step": 12336 + }, + { + "epoch": 0.8358967409716106, + "grad_norm": 9.848974227905273, + "learning_rate": 8.413854473269903e-05, + "loss": 0.7939, + "step": 12337 + }, + { + "epoch": 0.8359644962395826, + "grad_norm": 7.111310958862305, + "learning_rate": 8.413717571360122e-05, + "loss": 0.739, + "step": 12338 + }, + { + "epoch": 0.8360322515075547, + "grad_norm": 6.9361443519592285, + "learning_rate": 8.41358066945034e-05, + "loss": 0.8542, + "step": 12339 + }, + { + "epoch": 0.8361000067755268, + "grad_norm": 8.055535316467285, + "learning_rate": 8.413443767540558e-05, + "loss": 0.7256, + "step": 12340 + }, + { + "epoch": 0.8361677620434989, + "grad_norm": 5.564542770385742, + "learning_rate": 8.413306865630776e-05, + "loss": 0.7472, + "step": 12341 + }, + { + "epoch": 0.8362355173114709, + "grad_norm": 5.300485134124756, + "learning_rate": 8.413169963720995e-05, + "loss": 0.5295, + "step": 12342 + }, + { + "epoch": 0.836303272579443, + "grad_norm": 5.996912479400635, + "learning_rate": 8.413033061811213e-05, + "loss": 0.9896, + "step": 12343 + }, + { + "epoch": 0.8363710278474151, + "grad_norm": 5.013984680175781, + "learning_rate": 8.412896159901431e-05, + "loss": 0.8257, + "step": 12344 + }, + { + "epoch": 0.8364387831153872, + "grad_norm": 6.624052047729492, + "learning_rate": 8.41275925799165e-05, + "loss": 0.9886, + "step": 12345 + }, + { + "epoch": 0.8365065383833593, + "grad_norm": 6.04857063293457, + "learning_rate": 8.412622356081868e-05, + "loss": 0.8513, + "step": 12346 + }, + { + "epoch": 0.8365742936513314, + "grad_norm": 4.840993404388428, + "learning_rate": 8.412485454172087e-05, + "loss": 0.8743, + "step": 12347 + }, + { + "epoch": 0.8366420489193035, + "grad_norm": 7.008707046508789, + "learning_rate": 8.412348552262305e-05, + "loss": 0.85, + "step": 12348 + }, + { + "epoch": 0.8367098041872756, + "grad_norm": 7.408865451812744, + "learning_rate": 8.412211650352523e-05, + "loss": 0.9099, + "step": 12349 + }, + { + "epoch": 0.8367775594552477, + "grad_norm": 6.566858291625977, + "learning_rate": 8.412074748442741e-05, + "loss": 0.9804, + "step": 12350 + }, + { + "epoch": 0.8368453147232198, + "grad_norm": 5.375271797180176, + "learning_rate": 8.411937846532959e-05, + "loss": 0.6209, + "step": 12351 + }, + { + "epoch": 0.8369130699911919, + "grad_norm": 6.087400436401367, + "learning_rate": 8.411800944623178e-05, + "loss": 0.6413, + "step": 12352 + }, + { + "epoch": 0.836980825259164, + "grad_norm": 6.311927318572998, + "learning_rate": 8.411664042713396e-05, + "loss": 0.7192, + "step": 12353 + }, + { + "epoch": 0.8370485805271359, + "grad_norm": 7.907698154449463, + "learning_rate": 8.411527140803615e-05, + "loss": 0.9493, + "step": 12354 + }, + { + "epoch": 0.837116335795108, + "grad_norm": 5.864373683929443, + "learning_rate": 8.411390238893833e-05, + "loss": 0.921, + "step": 12355 + }, + { + "epoch": 0.8371840910630801, + "grad_norm": 5.77023458480835, + "learning_rate": 8.411253336984052e-05, + "loss": 0.8094, + "step": 12356 + }, + { + "epoch": 0.8372518463310522, + "grad_norm": 7.120599746704102, + "learning_rate": 8.41111643507427e-05, + "loss": 0.7822, + "step": 12357 + }, + { + "epoch": 0.8373196015990243, + "grad_norm": 6.42577600479126, + "learning_rate": 8.410979533164488e-05, + "loss": 0.9123, + "step": 12358 + }, + { + "epoch": 0.8373873568669964, + "grad_norm": 6.504154205322266, + "learning_rate": 8.410842631254706e-05, + "loss": 0.7072, + "step": 12359 + }, + { + "epoch": 0.8374551121349685, + "grad_norm": 6.982132434844971, + "learning_rate": 8.410705729344924e-05, + "loss": 0.9333, + "step": 12360 + }, + { + "epoch": 0.8375228674029406, + "grad_norm": 5.410412311553955, + "learning_rate": 8.410568827435143e-05, + "loss": 0.7905, + "step": 12361 + }, + { + "epoch": 0.8375906226709127, + "grad_norm": 5.881120681762695, + "learning_rate": 8.410431925525361e-05, + "loss": 0.7371, + "step": 12362 + }, + { + "epoch": 0.8376583779388848, + "grad_norm": 6.792932033538818, + "learning_rate": 8.41029502361558e-05, + "loss": 0.6885, + "step": 12363 + }, + { + "epoch": 0.8377261332068568, + "grad_norm": 6.715199947357178, + "learning_rate": 8.410158121705798e-05, + "loss": 0.8673, + "step": 12364 + }, + { + "epoch": 0.8377938884748289, + "grad_norm": 6.608368873596191, + "learning_rate": 8.410021219796017e-05, + "loss": 1.0625, + "step": 12365 + }, + { + "epoch": 0.837861643742801, + "grad_norm": 8.170140266418457, + "learning_rate": 8.409884317886235e-05, + "loss": 0.8656, + "step": 12366 + }, + { + "epoch": 0.8379293990107731, + "grad_norm": 7.180694103240967, + "learning_rate": 8.409747415976453e-05, + "loss": 0.7619, + "step": 12367 + }, + { + "epoch": 0.8379971542787452, + "grad_norm": 5.306380748748779, + "learning_rate": 8.409610514066671e-05, + "loss": 0.6672, + "step": 12368 + }, + { + "epoch": 0.8380649095467173, + "grad_norm": 9.249306678771973, + "learning_rate": 8.409473612156889e-05, + "loss": 0.759, + "step": 12369 + }, + { + "epoch": 0.8381326648146893, + "grad_norm": 6.1703572273254395, + "learning_rate": 8.409336710247108e-05, + "loss": 0.661, + "step": 12370 + }, + { + "epoch": 0.8382004200826614, + "grad_norm": 4.662862300872803, + "learning_rate": 8.409199808337327e-05, + "loss": 0.6401, + "step": 12371 + }, + { + "epoch": 0.8382681753506335, + "grad_norm": 4.667969226837158, + "learning_rate": 8.409062906427545e-05, + "loss": 0.6579, + "step": 12372 + }, + { + "epoch": 0.8383359306186056, + "grad_norm": 6.247222900390625, + "learning_rate": 8.408926004517763e-05, + "loss": 0.7907, + "step": 12373 + }, + { + "epoch": 0.8384036858865777, + "grad_norm": 6.933483123779297, + "learning_rate": 8.40878910260798e-05, + "loss": 0.8402, + "step": 12374 + }, + { + "epoch": 0.8384714411545497, + "grad_norm": 5.13164758682251, + "learning_rate": 8.4086522006982e-05, + "loss": 0.7144, + "step": 12375 + }, + { + "epoch": 0.8385391964225218, + "grad_norm": 5.230576515197754, + "learning_rate": 8.408515298788418e-05, + "loss": 0.6836, + "step": 12376 + }, + { + "epoch": 0.8386069516904939, + "grad_norm": 6.535162448883057, + "learning_rate": 8.408378396878636e-05, + "loss": 0.9004, + "step": 12377 + }, + { + "epoch": 0.838674706958466, + "grad_norm": 7.608928203582764, + "learning_rate": 8.408241494968855e-05, + "loss": 0.9791, + "step": 12378 + }, + { + "epoch": 0.8387424622264381, + "grad_norm": 6.712092399597168, + "learning_rate": 8.408104593059073e-05, + "loss": 0.6513, + "step": 12379 + }, + { + "epoch": 0.8388102174944102, + "grad_norm": 6.102575778961182, + "learning_rate": 8.407967691149292e-05, + "loss": 0.8751, + "step": 12380 + }, + { + "epoch": 0.8388779727623823, + "grad_norm": 5.246453285217285, + "learning_rate": 8.407830789239511e-05, + "loss": 0.6477, + "step": 12381 + }, + { + "epoch": 0.8389457280303544, + "grad_norm": 6.3806633949279785, + "learning_rate": 8.407693887329729e-05, + "loss": 0.7549, + "step": 12382 + }, + { + "epoch": 0.8390134832983265, + "grad_norm": 6.284943103790283, + "learning_rate": 8.407556985419947e-05, + "loss": 0.7873, + "step": 12383 + }, + { + "epoch": 0.8390812385662986, + "grad_norm": 6.552889823913574, + "learning_rate": 8.407420083510166e-05, + "loss": 0.8152, + "step": 12384 + }, + { + "epoch": 0.8391489938342707, + "grad_norm": 7.694222927093506, + "learning_rate": 8.407283181600384e-05, + "loss": 0.6777, + "step": 12385 + }, + { + "epoch": 0.8392167491022428, + "grad_norm": 6.1887125968933105, + "learning_rate": 8.407146279690602e-05, + "loss": 0.9402, + "step": 12386 + }, + { + "epoch": 0.8392845043702147, + "grad_norm": 6.24276065826416, + "learning_rate": 8.40700937778082e-05, + "loss": 1.0976, + "step": 12387 + }, + { + "epoch": 0.8393522596381868, + "grad_norm": 5.816521167755127, + "learning_rate": 8.40687247587104e-05, + "loss": 0.8681, + "step": 12388 + }, + { + "epoch": 0.8394200149061589, + "grad_norm": 5.3127923011779785, + "learning_rate": 8.406735573961258e-05, + "loss": 0.7598, + "step": 12389 + }, + { + "epoch": 0.839487770174131, + "grad_norm": 7.444540500640869, + "learning_rate": 8.406598672051476e-05, + "loss": 0.8451, + "step": 12390 + }, + { + "epoch": 0.8395555254421031, + "grad_norm": 8.680950164794922, + "learning_rate": 8.406461770141694e-05, + "loss": 1.0441, + "step": 12391 + }, + { + "epoch": 0.8396232807100752, + "grad_norm": 6.707736968994141, + "learning_rate": 8.406324868231912e-05, + "loss": 0.7834, + "step": 12392 + }, + { + "epoch": 0.8396910359780473, + "grad_norm": 5.440469741821289, + "learning_rate": 8.406187966322131e-05, + "loss": 0.7424, + "step": 12393 + }, + { + "epoch": 0.8397587912460194, + "grad_norm": 9.19237995147705, + "learning_rate": 8.40605106441235e-05, + "loss": 0.7367, + "step": 12394 + }, + { + "epoch": 0.8398265465139915, + "grad_norm": 6.017999172210693, + "learning_rate": 8.405914162502567e-05, + "loss": 0.822, + "step": 12395 + }, + { + "epoch": 0.8398943017819636, + "grad_norm": 6.79027795791626, + "learning_rate": 8.405777260592785e-05, + "loss": 0.9386, + "step": 12396 + }, + { + "epoch": 0.8399620570499357, + "grad_norm": 5.836680889129639, + "learning_rate": 8.405640358683004e-05, + "loss": 0.7095, + "step": 12397 + }, + { + "epoch": 0.8400298123179077, + "grad_norm": 5.866933345794678, + "learning_rate": 8.405503456773223e-05, + "loss": 0.7168, + "step": 12398 + }, + { + "epoch": 0.8400975675858798, + "grad_norm": 6.663938999176025, + "learning_rate": 8.405366554863441e-05, + "loss": 0.867, + "step": 12399 + }, + { + "epoch": 0.8401653228538519, + "grad_norm": 5.113254547119141, + "learning_rate": 8.405229652953659e-05, + "loss": 0.6785, + "step": 12400 + }, + { + "epoch": 0.840233078121824, + "grad_norm": 6.106493949890137, + "learning_rate": 8.405092751043877e-05, + "loss": 0.8815, + "step": 12401 + }, + { + "epoch": 0.8403008333897961, + "grad_norm": 5.644641399383545, + "learning_rate": 8.404955849134096e-05, + "loss": 0.6902, + "step": 12402 + }, + { + "epoch": 0.8403685886577681, + "grad_norm": 5.706465244293213, + "learning_rate": 8.404818947224314e-05, + "loss": 0.7725, + "step": 12403 + }, + { + "epoch": 0.8404363439257402, + "grad_norm": 8.362403869628906, + "learning_rate": 8.404682045314532e-05, + "loss": 0.9757, + "step": 12404 + }, + { + "epoch": 0.8405040991937123, + "grad_norm": 5.341566562652588, + "learning_rate": 8.40454514340475e-05, + "loss": 0.8645, + "step": 12405 + }, + { + "epoch": 0.8405718544616844, + "grad_norm": 6.9801859855651855, + "learning_rate": 8.404408241494969e-05, + "loss": 0.7933, + "step": 12406 + }, + { + "epoch": 0.8406396097296565, + "grad_norm": 7.076079845428467, + "learning_rate": 8.404271339585188e-05, + "loss": 0.9387, + "step": 12407 + }, + { + "epoch": 0.8407073649976285, + "grad_norm": 5.684225082397461, + "learning_rate": 8.404134437675406e-05, + "loss": 0.6575, + "step": 12408 + }, + { + "epoch": 0.8407751202656006, + "grad_norm": 6.203160285949707, + "learning_rate": 8.403997535765624e-05, + "loss": 0.6188, + "step": 12409 + }, + { + "epoch": 0.8408428755335727, + "grad_norm": 7.039827346801758, + "learning_rate": 8.403860633855842e-05, + "loss": 0.8741, + "step": 12410 + }, + { + "epoch": 0.8409106308015448, + "grad_norm": 5.340671062469482, + "learning_rate": 8.403723731946061e-05, + "loss": 0.6957, + "step": 12411 + }, + { + "epoch": 0.8409783860695169, + "grad_norm": 6.316356182098389, + "learning_rate": 8.40358683003628e-05, + "loss": 0.8684, + "step": 12412 + }, + { + "epoch": 0.841046141337489, + "grad_norm": 5.906060218811035, + "learning_rate": 8.403449928126497e-05, + "loss": 0.9206, + "step": 12413 + }, + { + "epoch": 0.8411138966054611, + "grad_norm": 7.139042854309082, + "learning_rate": 8.403313026216716e-05, + "loss": 0.862, + "step": 12414 + }, + { + "epoch": 0.8411816518734332, + "grad_norm": 6.347969055175781, + "learning_rate": 8.403176124306934e-05, + "loss": 0.7029, + "step": 12415 + }, + { + "epoch": 0.8412494071414053, + "grad_norm": 7.3854780197143555, + "learning_rate": 8.403039222397153e-05, + "loss": 0.9811, + "step": 12416 + }, + { + "epoch": 0.8413171624093774, + "grad_norm": 5.81329870223999, + "learning_rate": 8.402902320487371e-05, + "loss": 0.9048, + "step": 12417 + }, + { + "epoch": 0.8413849176773495, + "grad_norm": 5.879884719848633, + "learning_rate": 8.402765418577589e-05, + "loss": 0.8854, + "step": 12418 + }, + { + "epoch": 0.8414526729453214, + "grad_norm": 5.32490348815918, + "learning_rate": 8.402628516667807e-05, + "loss": 1.0044, + "step": 12419 + }, + { + "epoch": 0.8415204282132935, + "grad_norm": 6.529613018035889, + "learning_rate": 8.402491614758026e-05, + "loss": 0.7893, + "step": 12420 + }, + { + "epoch": 0.8415881834812656, + "grad_norm": 6.274640083312988, + "learning_rate": 8.402354712848244e-05, + "loss": 1.0186, + "step": 12421 + }, + { + "epoch": 0.8416559387492377, + "grad_norm": 6.493834972381592, + "learning_rate": 8.402217810938463e-05, + "loss": 0.9347, + "step": 12422 + }, + { + "epoch": 0.8417236940172098, + "grad_norm": 5.429368495941162, + "learning_rate": 8.40208090902868e-05, + "loss": 0.7862, + "step": 12423 + }, + { + "epoch": 0.8417914492851819, + "grad_norm": 5.897449970245361, + "learning_rate": 8.4019440071189e-05, + "loss": 0.7278, + "step": 12424 + }, + { + "epoch": 0.841859204553154, + "grad_norm": 5.485132694244385, + "learning_rate": 8.401807105209118e-05, + "loss": 0.5984, + "step": 12425 + }, + { + "epoch": 0.8419269598211261, + "grad_norm": 6.194313049316406, + "learning_rate": 8.401670203299336e-05, + "loss": 0.8396, + "step": 12426 + }, + { + "epoch": 0.8419947150890982, + "grad_norm": 6.459996700286865, + "learning_rate": 8.401533301389555e-05, + "loss": 0.8757, + "step": 12427 + }, + { + "epoch": 0.8420624703570703, + "grad_norm": 5.770567417144775, + "learning_rate": 8.401396399479773e-05, + "loss": 0.6524, + "step": 12428 + }, + { + "epoch": 0.8421302256250424, + "grad_norm": 5.737586975097656, + "learning_rate": 8.401259497569991e-05, + "loss": 0.7936, + "step": 12429 + }, + { + "epoch": 0.8421979808930145, + "grad_norm": 5.61273717880249, + "learning_rate": 8.401122595660211e-05, + "loss": 0.8404, + "step": 12430 + }, + { + "epoch": 0.8422657361609865, + "grad_norm": 5.813633918762207, + "learning_rate": 8.400985693750429e-05, + "loss": 0.8042, + "step": 12431 + }, + { + "epoch": 0.8423334914289586, + "grad_norm": 10.310413360595703, + "learning_rate": 8.400848791840647e-05, + "loss": 0.7039, + "step": 12432 + }, + { + "epoch": 0.8424012466969307, + "grad_norm": 5.826198577880859, + "learning_rate": 8.400711889930865e-05, + "loss": 0.7276, + "step": 12433 + }, + { + "epoch": 0.8424690019649028, + "grad_norm": 6.436339855194092, + "learning_rate": 8.400574988021084e-05, + "loss": 0.7409, + "step": 12434 + }, + { + "epoch": 0.8425367572328749, + "grad_norm": 5.5825700759887695, + "learning_rate": 8.400438086111302e-05, + "loss": 0.6429, + "step": 12435 + }, + { + "epoch": 0.8426045125008469, + "grad_norm": 5.8885297775268555, + "learning_rate": 8.40030118420152e-05, + "loss": 0.8675, + "step": 12436 + }, + { + "epoch": 0.842672267768819, + "grad_norm": 6.454819679260254, + "learning_rate": 8.400164282291738e-05, + "loss": 0.6461, + "step": 12437 + }, + { + "epoch": 0.8427400230367911, + "grad_norm": 6.442966938018799, + "learning_rate": 8.400027380381956e-05, + "loss": 0.9699, + "step": 12438 + }, + { + "epoch": 0.8428077783047632, + "grad_norm": 5.841514587402344, + "learning_rate": 8.399890478472176e-05, + "loss": 0.734, + "step": 12439 + }, + { + "epoch": 0.8428755335727353, + "grad_norm": 5.0523858070373535, + "learning_rate": 8.399753576562394e-05, + "loss": 0.7477, + "step": 12440 + }, + { + "epoch": 0.8429432888407073, + "grad_norm": 6.7047810554504395, + "learning_rate": 8.399616674652612e-05, + "loss": 0.8632, + "step": 12441 + }, + { + "epoch": 0.8430110441086794, + "grad_norm": 5.654105186462402, + "learning_rate": 8.39947977274283e-05, + "loss": 0.6073, + "step": 12442 + }, + { + "epoch": 0.8430787993766515, + "grad_norm": 6.039260387420654, + "learning_rate": 8.39934287083305e-05, + "loss": 0.7417, + "step": 12443 + }, + { + "epoch": 0.8431465546446236, + "grad_norm": 6.465257167816162, + "learning_rate": 8.399205968923267e-05, + "loss": 0.8625, + "step": 12444 + }, + { + "epoch": 0.8432143099125957, + "grad_norm": 7.922903060913086, + "learning_rate": 8.399069067013485e-05, + "loss": 0.8621, + "step": 12445 + }, + { + "epoch": 0.8432820651805678, + "grad_norm": 6.696178436279297, + "learning_rate": 8.398932165103703e-05, + "loss": 0.7575, + "step": 12446 + }, + { + "epoch": 0.8433498204485399, + "grad_norm": 7.259800910949707, + "learning_rate": 8.398795263193921e-05, + "loss": 0.9376, + "step": 12447 + }, + { + "epoch": 0.843417575716512, + "grad_norm": 5.065820217132568, + "learning_rate": 8.398658361284141e-05, + "loss": 0.8156, + "step": 12448 + }, + { + "epoch": 0.8434853309844841, + "grad_norm": 6.689807415008545, + "learning_rate": 8.398521459374359e-05, + "loss": 0.6496, + "step": 12449 + }, + { + "epoch": 0.8435530862524562, + "grad_norm": 7.013186454772949, + "learning_rate": 8.398384557464577e-05, + "loss": 1.0973, + "step": 12450 + }, + { + "epoch": 0.8436208415204283, + "grad_norm": 6.942663669586182, + "learning_rate": 8.398247655554795e-05, + "loss": 0.9876, + "step": 12451 + }, + { + "epoch": 0.8436885967884002, + "grad_norm": 6.426051139831543, + "learning_rate": 8.398110753645013e-05, + "loss": 0.6516, + "step": 12452 + }, + { + "epoch": 0.8437563520563723, + "grad_norm": 5.993021488189697, + "learning_rate": 8.397973851735232e-05, + "loss": 0.5495, + "step": 12453 + }, + { + "epoch": 0.8438241073243444, + "grad_norm": 7.648578643798828, + "learning_rate": 8.39783694982545e-05, + "loss": 1.3047, + "step": 12454 + }, + { + "epoch": 0.8438918625923165, + "grad_norm": 6.795345306396484, + "learning_rate": 8.397700047915668e-05, + "loss": 0.8851, + "step": 12455 + }, + { + "epoch": 0.8439596178602886, + "grad_norm": 9.486870765686035, + "learning_rate": 8.397563146005887e-05, + "loss": 0.7838, + "step": 12456 + }, + { + "epoch": 0.8440273731282607, + "grad_norm": 6.156589031219482, + "learning_rate": 8.397426244096106e-05, + "loss": 0.7293, + "step": 12457 + }, + { + "epoch": 0.8440951283962328, + "grad_norm": 6.535582542419434, + "learning_rate": 8.397289342186324e-05, + "loss": 0.853, + "step": 12458 + }, + { + "epoch": 0.8441628836642049, + "grad_norm": 6.530213832855225, + "learning_rate": 8.397152440276542e-05, + "loss": 1.0661, + "step": 12459 + }, + { + "epoch": 0.844230638932177, + "grad_norm": 5.963207721710205, + "learning_rate": 8.39701553836676e-05, + "loss": 0.6558, + "step": 12460 + }, + { + "epoch": 0.8442983942001491, + "grad_norm": 6.132920742034912, + "learning_rate": 8.396878636456978e-05, + "loss": 0.7386, + "step": 12461 + }, + { + "epoch": 0.8443661494681212, + "grad_norm": 4.533850193023682, + "learning_rate": 8.396741734547197e-05, + "loss": 0.6762, + "step": 12462 + }, + { + "epoch": 0.8444339047360933, + "grad_norm": 5.42271614074707, + "learning_rate": 8.396604832637415e-05, + "loss": 0.9324, + "step": 12463 + }, + { + "epoch": 0.8445016600040653, + "grad_norm": 7.894658088684082, + "learning_rate": 8.396467930727633e-05, + "loss": 1.0492, + "step": 12464 + }, + { + "epoch": 0.8445694152720374, + "grad_norm": 6.1358137130737305, + "learning_rate": 8.396331028817852e-05, + "loss": 0.6701, + "step": 12465 + }, + { + "epoch": 0.8446371705400095, + "grad_norm": 6.7011871337890625, + "learning_rate": 8.396194126908071e-05, + "loss": 0.9623, + "step": 12466 + }, + { + "epoch": 0.8447049258079816, + "grad_norm": 6.27651834487915, + "learning_rate": 8.396057224998289e-05, + "loss": 0.8553, + "step": 12467 + }, + { + "epoch": 0.8447726810759536, + "grad_norm": 6.427924156188965, + "learning_rate": 8.395920323088507e-05, + "loss": 0.9759, + "step": 12468 + }, + { + "epoch": 0.8448404363439257, + "grad_norm": 6.353015422821045, + "learning_rate": 8.395783421178725e-05, + "loss": 0.7357, + "step": 12469 + }, + { + "epoch": 0.8449081916118978, + "grad_norm": 6.444865703582764, + "learning_rate": 8.395646519268944e-05, + "loss": 0.6417, + "step": 12470 + }, + { + "epoch": 0.8449759468798699, + "grad_norm": 6.375645160675049, + "learning_rate": 8.395509617359162e-05, + "loss": 1.0158, + "step": 12471 + }, + { + "epoch": 0.845043702147842, + "grad_norm": 4.871046543121338, + "learning_rate": 8.39537271544938e-05, + "loss": 0.7491, + "step": 12472 + }, + { + "epoch": 0.8451114574158141, + "grad_norm": 5.615902423858643, + "learning_rate": 8.3952358135396e-05, + "loss": 0.6925, + "step": 12473 + }, + { + "epoch": 0.8451792126837862, + "grad_norm": 6.724735260009766, + "learning_rate": 8.395098911629818e-05, + "loss": 0.6802, + "step": 12474 + }, + { + "epoch": 0.8452469679517582, + "grad_norm": 7.350205421447754, + "learning_rate": 8.394962009720036e-05, + "loss": 1.1414, + "step": 12475 + }, + { + "epoch": 0.8453147232197303, + "grad_norm": 5.377062797546387, + "learning_rate": 8.394825107810255e-05, + "loss": 0.8551, + "step": 12476 + }, + { + "epoch": 0.8453824784877024, + "grad_norm": 5.765392303466797, + "learning_rate": 8.394688205900473e-05, + "loss": 0.6638, + "step": 12477 + }, + { + "epoch": 0.8454502337556745, + "grad_norm": 5.007123947143555, + "learning_rate": 8.394551303990691e-05, + "loss": 0.7236, + "step": 12478 + }, + { + "epoch": 0.8455179890236466, + "grad_norm": 6.559772491455078, + "learning_rate": 8.39441440208091e-05, + "loss": 0.8623, + "step": 12479 + }, + { + "epoch": 0.8455857442916187, + "grad_norm": 6.448508262634277, + "learning_rate": 8.394277500171129e-05, + "loss": 0.7342, + "step": 12480 + }, + { + "epoch": 0.8456534995595908, + "grad_norm": 6.105748653411865, + "learning_rate": 8.394140598261347e-05, + "loss": 0.7703, + "step": 12481 + }, + { + "epoch": 0.8457212548275629, + "grad_norm": 6.34128999710083, + "learning_rate": 8.394003696351565e-05, + "loss": 0.8306, + "step": 12482 + }, + { + "epoch": 0.845789010095535, + "grad_norm": 6.052567005157471, + "learning_rate": 8.393866794441783e-05, + "loss": 0.7125, + "step": 12483 + }, + { + "epoch": 0.845856765363507, + "grad_norm": 5.930888652801514, + "learning_rate": 8.393729892532001e-05, + "loss": 0.9559, + "step": 12484 + }, + { + "epoch": 0.845924520631479, + "grad_norm": 5.138692378997803, + "learning_rate": 8.39359299062222e-05, + "loss": 0.6747, + "step": 12485 + }, + { + "epoch": 0.8459922758994511, + "grad_norm": 6.211801528930664, + "learning_rate": 8.393456088712438e-05, + "loss": 0.7852, + "step": 12486 + }, + { + "epoch": 0.8460600311674232, + "grad_norm": 6.416335105895996, + "learning_rate": 8.393319186802656e-05, + "loss": 0.9246, + "step": 12487 + }, + { + "epoch": 0.8461277864353953, + "grad_norm": 5.819685459136963, + "learning_rate": 8.393182284892874e-05, + "loss": 0.9555, + "step": 12488 + }, + { + "epoch": 0.8461955417033674, + "grad_norm": 6.297514915466309, + "learning_rate": 8.393045382983094e-05, + "loss": 0.8673, + "step": 12489 + }, + { + "epoch": 0.8462632969713395, + "grad_norm": 11.973522186279297, + "learning_rate": 8.392908481073312e-05, + "loss": 0.9805, + "step": 12490 + }, + { + "epoch": 0.8463310522393116, + "grad_norm": 7.551254749298096, + "learning_rate": 8.39277157916353e-05, + "loss": 0.855, + "step": 12491 + }, + { + "epoch": 0.8463988075072837, + "grad_norm": 5.7769389152526855, + "learning_rate": 8.392634677253748e-05, + "loss": 0.6064, + "step": 12492 + }, + { + "epoch": 0.8464665627752558, + "grad_norm": 4.919633388519287, + "learning_rate": 8.392497775343966e-05, + "loss": 0.7027, + "step": 12493 + }, + { + "epoch": 0.8465343180432279, + "grad_norm": 5.183913230895996, + "learning_rate": 8.392360873434185e-05, + "loss": 0.8747, + "step": 12494 + }, + { + "epoch": 0.8466020733112, + "grad_norm": 5.794585704803467, + "learning_rate": 8.392223971524403e-05, + "loss": 0.7961, + "step": 12495 + }, + { + "epoch": 0.8466698285791721, + "grad_norm": 6.117403030395508, + "learning_rate": 8.392087069614621e-05, + "loss": 0.5892, + "step": 12496 + }, + { + "epoch": 0.8467375838471441, + "grad_norm": 7.63482666015625, + "learning_rate": 8.39195016770484e-05, + "loss": 0.8159, + "step": 12497 + }, + { + "epoch": 0.8468053391151162, + "grad_norm": 5.1039347648620605, + "learning_rate": 8.391813265795059e-05, + "loss": 0.5934, + "step": 12498 + }, + { + "epoch": 0.8468730943830883, + "grad_norm": 6.369871616363525, + "learning_rate": 8.391676363885277e-05, + "loss": 0.9178, + "step": 12499 + }, + { + "epoch": 0.8469408496510604, + "grad_norm": 6.773191928863525, + "learning_rate": 8.391539461975495e-05, + "loss": 1.0055, + "step": 12500 + }, + { + "epoch": 0.8470086049190324, + "grad_norm": 5.994389057159424, + "learning_rate": 8.391402560065713e-05, + "loss": 0.8812, + "step": 12501 + }, + { + "epoch": 0.8470763601870045, + "grad_norm": 5.712553977966309, + "learning_rate": 8.391265658155931e-05, + "loss": 0.7008, + "step": 12502 + }, + { + "epoch": 0.8471441154549766, + "grad_norm": 5.810184955596924, + "learning_rate": 8.39112875624615e-05, + "loss": 0.7936, + "step": 12503 + }, + { + "epoch": 0.8472118707229487, + "grad_norm": 6.312936782836914, + "learning_rate": 8.390991854336368e-05, + "loss": 0.8583, + "step": 12504 + }, + { + "epoch": 0.8472796259909208, + "grad_norm": 6.539886474609375, + "learning_rate": 8.390854952426586e-05, + "loss": 0.7788, + "step": 12505 + }, + { + "epoch": 0.8473473812588929, + "grad_norm": 7.018226146697998, + "learning_rate": 8.390718050516804e-05, + "loss": 0.7205, + "step": 12506 + }, + { + "epoch": 0.847415136526865, + "grad_norm": 6.536552429199219, + "learning_rate": 8.390581148607023e-05, + "loss": 0.9422, + "step": 12507 + }, + { + "epoch": 0.847482891794837, + "grad_norm": 6.521510601043701, + "learning_rate": 8.390444246697242e-05, + "loss": 0.7813, + "step": 12508 + }, + { + "epoch": 0.8475506470628091, + "grad_norm": 5.533815383911133, + "learning_rate": 8.39030734478746e-05, + "loss": 0.8097, + "step": 12509 + }, + { + "epoch": 0.8476184023307812, + "grad_norm": 5.778811931610107, + "learning_rate": 8.390170442877678e-05, + "loss": 0.7295, + "step": 12510 + }, + { + "epoch": 0.8476861575987533, + "grad_norm": 5.98907995223999, + "learning_rate": 8.390033540967896e-05, + "loss": 0.7462, + "step": 12511 + }, + { + "epoch": 0.8477539128667254, + "grad_norm": 5.19685697555542, + "learning_rate": 8.389896639058115e-05, + "loss": 0.735, + "step": 12512 + }, + { + "epoch": 0.8478216681346975, + "grad_norm": 5.980901718139648, + "learning_rate": 8.389759737148333e-05, + "loss": 0.5655, + "step": 12513 + }, + { + "epoch": 0.8478894234026696, + "grad_norm": 4.931701183319092, + "learning_rate": 8.389622835238551e-05, + "loss": 0.6426, + "step": 12514 + }, + { + "epoch": 0.8479571786706417, + "grad_norm": 5.470427513122559, + "learning_rate": 8.38948593332877e-05, + "loss": 0.8698, + "step": 12515 + }, + { + "epoch": 0.8480249339386138, + "grad_norm": 8.906782150268555, + "learning_rate": 8.389349031418989e-05, + "loss": 0.9118, + "step": 12516 + }, + { + "epoch": 0.8480926892065858, + "grad_norm": 7.61644172668457, + "learning_rate": 8.389212129509207e-05, + "loss": 1.0529, + "step": 12517 + }, + { + "epoch": 0.8481604444745579, + "grad_norm": 6.867774963378906, + "learning_rate": 8.389075227599425e-05, + "loss": 0.6104, + "step": 12518 + }, + { + "epoch": 0.8482281997425299, + "grad_norm": 7.476839065551758, + "learning_rate": 8.388938325689644e-05, + "loss": 1.036, + "step": 12519 + }, + { + "epoch": 0.848295955010502, + "grad_norm": 6.415992259979248, + "learning_rate": 8.388801423779862e-05, + "loss": 0.9505, + "step": 12520 + }, + { + "epoch": 0.8483637102784741, + "grad_norm": 6.049834251403809, + "learning_rate": 8.38866452187008e-05, + "loss": 0.7437, + "step": 12521 + }, + { + "epoch": 0.8484314655464462, + "grad_norm": 5.392312526702881, + "learning_rate": 8.3885276199603e-05, + "loss": 0.7305, + "step": 12522 + }, + { + "epoch": 0.8484992208144183, + "grad_norm": 6.002782821655273, + "learning_rate": 8.388390718050518e-05, + "loss": 0.6241, + "step": 12523 + }, + { + "epoch": 0.8485669760823904, + "grad_norm": 7.464806079864502, + "learning_rate": 8.388253816140736e-05, + "loss": 1.0126, + "step": 12524 + }, + { + "epoch": 0.8486347313503625, + "grad_norm": 6.660724639892578, + "learning_rate": 8.388116914230954e-05, + "loss": 0.8958, + "step": 12525 + }, + { + "epoch": 0.8487024866183346, + "grad_norm": 6.646303176879883, + "learning_rate": 8.387980012321173e-05, + "loss": 0.951, + "step": 12526 + }, + { + "epoch": 0.8487702418863067, + "grad_norm": 6.518253803253174, + "learning_rate": 8.387843110411391e-05, + "loss": 0.8785, + "step": 12527 + }, + { + "epoch": 0.8488379971542788, + "grad_norm": 4.314852237701416, + "learning_rate": 8.38770620850161e-05, + "loss": 0.8498, + "step": 12528 + }, + { + "epoch": 0.8489057524222509, + "grad_norm": 5.918713092803955, + "learning_rate": 8.387569306591827e-05, + "loss": 0.9178, + "step": 12529 + }, + { + "epoch": 0.848973507690223, + "grad_norm": 7.2677788734436035, + "learning_rate": 8.387432404682045e-05, + "loss": 0.8692, + "step": 12530 + }, + { + "epoch": 0.849041262958195, + "grad_norm": 5.739150524139404, + "learning_rate": 8.387295502772265e-05, + "loss": 0.7924, + "step": 12531 + }, + { + "epoch": 0.8491090182261671, + "grad_norm": 5.6296000480651855, + "learning_rate": 8.387158600862483e-05, + "loss": 0.7735, + "step": 12532 + }, + { + "epoch": 0.8491767734941391, + "grad_norm": 5.1605000495910645, + "learning_rate": 8.387021698952701e-05, + "loss": 0.7029, + "step": 12533 + }, + { + "epoch": 0.8492445287621112, + "grad_norm": 6.645575046539307, + "learning_rate": 8.386884797042919e-05, + "loss": 1.0384, + "step": 12534 + }, + { + "epoch": 0.8493122840300833, + "grad_norm": 6.592693328857422, + "learning_rate": 8.386747895133138e-05, + "loss": 0.7987, + "step": 12535 + }, + { + "epoch": 0.8493800392980554, + "grad_norm": 7.588740825653076, + "learning_rate": 8.386610993223356e-05, + "loss": 0.6359, + "step": 12536 + }, + { + "epoch": 0.8494477945660275, + "grad_norm": 4.178662300109863, + "learning_rate": 8.386474091313574e-05, + "loss": 0.7081, + "step": 12537 + }, + { + "epoch": 0.8495155498339996, + "grad_norm": 5.266997814178467, + "learning_rate": 8.386337189403792e-05, + "loss": 0.688, + "step": 12538 + }, + { + "epoch": 0.8495833051019717, + "grad_norm": 5.189149379730225, + "learning_rate": 8.38620028749401e-05, + "loss": 0.8713, + "step": 12539 + }, + { + "epoch": 0.8496510603699438, + "grad_norm": 5.311150074005127, + "learning_rate": 8.38606338558423e-05, + "loss": 0.7709, + "step": 12540 + }, + { + "epoch": 0.8497188156379158, + "grad_norm": 4.873614311218262, + "learning_rate": 8.385926483674448e-05, + "loss": 0.8112, + "step": 12541 + }, + { + "epoch": 0.8497865709058879, + "grad_norm": 6.431467056274414, + "learning_rate": 8.385789581764666e-05, + "loss": 0.9818, + "step": 12542 + }, + { + "epoch": 0.84985432617386, + "grad_norm": 7.026850700378418, + "learning_rate": 8.385652679854884e-05, + "loss": 0.7773, + "step": 12543 + }, + { + "epoch": 0.8499220814418321, + "grad_norm": 5.634081840515137, + "learning_rate": 8.385515777945103e-05, + "loss": 0.8703, + "step": 12544 + }, + { + "epoch": 0.8499898367098042, + "grad_norm": 5.535772800445557, + "learning_rate": 8.385378876035321e-05, + "loss": 0.6963, + "step": 12545 + }, + { + "epoch": 0.8500575919777763, + "grad_norm": 6.581893444061279, + "learning_rate": 8.38524197412554e-05, + "loss": 0.7044, + "step": 12546 + }, + { + "epoch": 0.8501253472457484, + "grad_norm": 5.496252536773682, + "learning_rate": 8.385105072215757e-05, + "loss": 0.7298, + "step": 12547 + }, + { + "epoch": 0.8501931025137205, + "grad_norm": 6.205685615539551, + "learning_rate": 8.384968170305975e-05, + "loss": 0.7728, + "step": 12548 + }, + { + "epoch": 0.8502608577816926, + "grad_norm": 7.986889362335205, + "learning_rate": 8.384831268396195e-05, + "loss": 0.767, + "step": 12549 + }, + { + "epoch": 0.8503286130496646, + "grad_norm": 6.026723384857178, + "learning_rate": 8.384694366486413e-05, + "loss": 0.9097, + "step": 12550 + }, + { + "epoch": 0.8503963683176367, + "grad_norm": 5.750411510467529, + "learning_rate": 8.384557464576631e-05, + "loss": 0.8567, + "step": 12551 + }, + { + "epoch": 0.8504641235856087, + "grad_norm": 6.5039896965026855, + "learning_rate": 8.384420562666849e-05, + "loss": 0.882, + "step": 12552 + }, + { + "epoch": 0.8505318788535808, + "grad_norm": 6.847061634063721, + "learning_rate": 8.384283660757068e-05, + "loss": 0.5942, + "step": 12553 + }, + { + "epoch": 0.8505996341215529, + "grad_norm": 4.668815612792969, + "learning_rate": 8.384146758847286e-05, + "loss": 0.5723, + "step": 12554 + }, + { + "epoch": 0.850667389389525, + "grad_norm": 7.477560043334961, + "learning_rate": 8.384009856937504e-05, + "loss": 0.9085, + "step": 12555 + }, + { + "epoch": 0.8507351446574971, + "grad_norm": 5.242186069488525, + "learning_rate": 8.383872955027722e-05, + "loss": 0.5811, + "step": 12556 + }, + { + "epoch": 0.8508028999254692, + "grad_norm": 6.460277080535889, + "learning_rate": 8.38373605311794e-05, + "loss": 0.6622, + "step": 12557 + }, + { + "epoch": 0.8508706551934413, + "grad_norm": 8.43552017211914, + "learning_rate": 8.38359915120816e-05, + "loss": 0.6877, + "step": 12558 + }, + { + "epoch": 0.8509384104614134, + "grad_norm": 5.636725902557373, + "learning_rate": 8.383462249298378e-05, + "loss": 0.7697, + "step": 12559 + }, + { + "epoch": 0.8510061657293855, + "grad_norm": 7.544033050537109, + "learning_rate": 8.383325347388596e-05, + "loss": 1.031, + "step": 12560 + }, + { + "epoch": 0.8510739209973576, + "grad_norm": 10.117722511291504, + "learning_rate": 8.383188445478814e-05, + "loss": 1.1007, + "step": 12561 + }, + { + "epoch": 0.8511416762653297, + "grad_norm": 5.651546955108643, + "learning_rate": 8.383051543569032e-05, + "loss": 0.6959, + "step": 12562 + }, + { + "epoch": 0.8512094315333018, + "grad_norm": 7.87699556350708, + "learning_rate": 8.382914641659251e-05, + "loss": 0.873, + "step": 12563 + }, + { + "epoch": 0.8512771868012738, + "grad_norm": 5.291513442993164, + "learning_rate": 8.38277773974947e-05, + "loss": 0.6965, + "step": 12564 + }, + { + "epoch": 0.8513449420692459, + "grad_norm": 5.983782768249512, + "learning_rate": 8.382640837839687e-05, + "loss": 0.908, + "step": 12565 + }, + { + "epoch": 0.8514126973372179, + "grad_norm": 4.902809143066406, + "learning_rate": 8.382503935929907e-05, + "loss": 0.6924, + "step": 12566 + }, + { + "epoch": 0.85148045260519, + "grad_norm": 4.983574390411377, + "learning_rate": 8.382367034020125e-05, + "loss": 0.7984, + "step": 12567 + }, + { + "epoch": 0.8515482078731621, + "grad_norm": 6.039658069610596, + "learning_rate": 8.382230132110343e-05, + "loss": 0.683, + "step": 12568 + }, + { + "epoch": 0.8516159631411342, + "grad_norm": 5.577428340911865, + "learning_rate": 8.382093230200562e-05, + "loss": 0.8412, + "step": 12569 + }, + { + "epoch": 0.8516837184091063, + "grad_norm": 6.592475891113281, + "learning_rate": 8.38195632829078e-05, + "loss": 0.6641, + "step": 12570 + }, + { + "epoch": 0.8517514736770784, + "grad_norm": 5.534562587738037, + "learning_rate": 8.381819426380998e-05, + "loss": 0.9195, + "step": 12571 + }, + { + "epoch": 0.8518192289450505, + "grad_norm": 5.535101413726807, + "learning_rate": 8.381682524471218e-05, + "loss": 0.723, + "step": 12572 + }, + { + "epoch": 0.8518869842130226, + "grad_norm": 6.00454044342041, + "learning_rate": 8.381545622561436e-05, + "loss": 0.6079, + "step": 12573 + }, + { + "epoch": 0.8519547394809946, + "grad_norm": 6.607147216796875, + "learning_rate": 8.381408720651654e-05, + "loss": 0.9253, + "step": 12574 + }, + { + "epoch": 0.8520224947489667, + "grad_norm": 5.750394821166992, + "learning_rate": 8.381271818741872e-05, + "loss": 0.689, + "step": 12575 + }, + { + "epoch": 0.8520902500169388, + "grad_norm": 6.017903804779053, + "learning_rate": 8.381134916832091e-05, + "loss": 0.8317, + "step": 12576 + }, + { + "epoch": 0.8521580052849109, + "grad_norm": 6.839371681213379, + "learning_rate": 8.380998014922309e-05, + "loss": 0.9688, + "step": 12577 + }, + { + "epoch": 0.852225760552883, + "grad_norm": 5.614028453826904, + "learning_rate": 8.380861113012527e-05, + "loss": 0.5495, + "step": 12578 + }, + { + "epoch": 0.8522935158208551, + "grad_norm": 6.759237766265869, + "learning_rate": 8.380724211102745e-05, + "loss": 0.7634, + "step": 12579 + }, + { + "epoch": 0.8523612710888272, + "grad_norm": 5.667263031005859, + "learning_rate": 8.380587309192963e-05, + "loss": 0.8539, + "step": 12580 + }, + { + "epoch": 0.8524290263567993, + "grad_norm": 5.995064735412598, + "learning_rate": 8.380450407283183e-05, + "loss": 0.7092, + "step": 12581 + }, + { + "epoch": 0.8524967816247713, + "grad_norm": 6.612915515899658, + "learning_rate": 8.380313505373401e-05, + "loss": 0.822, + "step": 12582 + }, + { + "epoch": 0.8525645368927434, + "grad_norm": 5.579259395599365, + "learning_rate": 8.380176603463619e-05, + "loss": 0.682, + "step": 12583 + }, + { + "epoch": 0.8526322921607155, + "grad_norm": 6.590492248535156, + "learning_rate": 8.380039701553837e-05, + "loss": 0.7243, + "step": 12584 + }, + { + "epoch": 0.8527000474286875, + "grad_norm": 4.9304680824279785, + "learning_rate": 8.379902799644055e-05, + "loss": 0.8079, + "step": 12585 + }, + { + "epoch": 0.8527678026966596, + "grad_norm": 8.054821014404297, + "learning_rate": 8.379765897734274e-05, + "loss": 0.9659, + "step": 12586 + }, + { + "epoch": 0.8528355579646317, + "grad_norm": 5.432389736175537, + "learning_rate": 8.379628995824492e-05, + "loss": 0.7316, + "step": 12587 + }, + { + "epoch": 0.8529033132326038, + "grad_norm": 5.523181915283203, + "learning_rate": 8.37949209391471e-05, + "loss": 0.9273, + "step": 12588 + }, + { + "epoch": 0.8529710685005759, + "grad_norm": 6.227122783660889, + "learning_rate": 8.379355192004928e-05, + "loss": 0.7196, + "step": 12589 + }, + { + "epoch": 0.853038823768548, + "grad_norm": 6.5586371421813965, + "learning_rate": 8.379218290095148e-05, + "loss": 0.793, + "step": 12590 + }, + { + "epoch": 0.8531065790365201, + "grad_norm": 6.881863594055176, + "learning_rate": 8.379081388185366e-05, + "loss": 0.6888, + "step": 12591 + }, + { + "epoch": 0.8531743343044922, + "grad_norm": 6.569089889526367, + "learning_rate": 8.378944486275584e-05, + "loss": 1.0791, + "step": 12592 + }, + { + "epoch": 0.8532420895724643, + "grad_norm": 4.835020542144775, + "learning_rate": 8.378807584365802e-05, + "loss": 0.866, + "step": 12593 + }, + { + "epoch": 0.8533098448404364, + "grad_norm": 5.276635646820068, + "learning_rate": 8.37867068245602e-05, + "loss": 0.5689, + "step": 12594 + }, + { + "epoch": 0.8533776001084085, + "grad_norm": 5.557784080505371, + "learning_rate": 8.378533780546239e-05, + "loss": 0.8455, + "step": 12595 + }, + { + "epoch": 0.8534453553763806, + "grad_norm": 6.33231258392334, + "learning_rate": 8.378396878636457e-05, + "loss": 0.6639, + "step": 12596 + }, + { + "epoch": 0.8535131106443526, + "grad_norm": 6.476678371429443, + "learning_rate": 8.378259976726675e-05, + "loss": 0.6786, + "step": 12597 + }, + { + "epoch": 0.8535808659123247, + "grad_norm": 5.376199245452881, + "learning_rate": 8.378123074816893e-05, + "loss": 0.7915, + "step": 12598 + }, + { + "epoch": 0.8536486211802967, + "grad_norm": 6.826034069061279, + "learning_rate": 8.377986172907113e-05, + "loss": 0.7511, + "step": 12599 + }, + { + "epoch": 0.8537163764482688, + "grad_norm": 5.847662925720215, + "learning_rate": 8.377849270997331e-05, + "loss": 0.7095, + "step": 12600 + }, + { + "epoch": 0.8537841317162409, + "grad_norm": 5.112993240356445, + "learning_rate": 8.377712369087549e-05, + "loss": 0.5066, + "step": 12601 + }, + { + "epoch": 0.853851886984213, + "grad_norm": 8.529839515686035, + "learning_rate": 8.377575467177767e-05, + "loss": 0.8007, + "step": 12602 + }, + { + "epoch": 0.8539196422521851, + "grad_norm": 5.600361347198486, + "learning_rate": 8.377438565267985e-05, + "loss": 0.847, + "step": 12603 + }, + { + "epoch": 0.8539873975201572, + "grad_norm": 7.553412914276123, + "learning_rate": 8.377301663358204e-05, + "loss": 0.8847, + "step": 12604 + }, + { + "epoch": 0.8540551527881293, + "grad_norm": 4.93065881729126, + "learning_rate": 8.377164761448422e-05, + "loss": 0.5527, + "step": 12605 + }, + { + "epoch": 0.8541229080561014, + "grad_norm": 6.301657676696777, + "learning_rate": 8.37702785953864e-05, + "loss": 0.9723, + "step": 12606 + }, + { + "epoch": 0.8541906633240735, + "grad_norm": 6.898577690124512, + "learning_rate": 8.376890957628858e-05, + "loss": 0.6861, + "step": 12607 + }, + { + "epoch": 0.8542584185920455, + "grad_norm": 6.174037933349609, + "learning_rate": 8.376754055719078e-05, + "loss": 0.8906, + "step": 12608 + }, + { + "epoch": 0.8543261738600176, + "grad_norm": 5.768240928649902, + "learning_rate": 8.376617153809296e-05, + "loss": 0.8712, + "step": 12609 + }, + { + "epoch": 0.8543939291279897, + "grad_norm": 5.855865001678467, + "learning_rate": 8.376480251899514e-05, + "loss": 0.8333, + "step": 12610 + }, + { + "epoch": 0.8544616843959618, + "grad_norm": 5.274192810058594, + "learning_rate": 8.376343349989732e-05, + "loss": 0.6786, + "step": 12611 + }, + { + "epoch": 0.8545294396639339, + "grad_norm": 5.603989601135254, + "learning_rate": 8.376206448079951e-05, + "loss": 0.7428, + "step": 12612 + }, + { + "epoch": 0.854597194931906, + "grad_norm": 4.8484883308410645, + "learning_rate": 8.37606954617017e-05, + "loss": 0.7456, + "step": 12613 + }, + { + "epoch": 0.8546649501998781, + "grad_norm": 5.10874605178833, + "learning_rate": 8.375932644260387e-05, + "loss": 0.6326, + "step": 12614 + }, + { + "epoch": 0.8547327054678501, + "grad_norm": 4.878294944763184, + "learning_rate": 8.375795742350607e-05, + "loss": 0.6428, + "step": 12615 + }, + { + "epoch": 0.8548004607358222, + "grad_norm": 7.003088474273682, + "learning_rate": 8.375658840440825e-05, + "loss": 0.8246, + "step": 12616 + }, + { + "epoch": 0.8548682160037943, + "grad_norm": 7.047825336456299, + "learning_rate": 8.375521938531043e-05, + "loss": 0.833, + "step": 12617 + }, + { + "epoch": 0.8549359712717663, + "grad_norm": 9.071378707885742, + "learning_rate": 8.375385036621262e-05, + "loss": 0.8094, + "step": 12618 + }, + { + "epoch": 0.8550037265397384, + "grad_norm": 5.345335960388184, + "learning_rate": 8.37524813471148e-05, + "loss": 0.79, + "step": 12619 + }, + { + "epoch": 0.8550714818077105, + "grad_norm": 5.579035758972168, + "learning_rate": 8.375111232801698e-05, + "loss": 0.7526, + "step": 12620 + }, + { + "epoch": 0.8551392370756826, + "grad_norm": 6.771716117858887, + "learning_rate": 8.374974330891916e-05, + "loss": 0.7558, + "step": 12621 + }, + { + "epoch": 0.8552069923436547, + "grad_norm": 7.533701419830322, + "learning_rate": 8.374837428982136e-05, + "loss": 1.1432, + "step": 12622 + }, + { + "epoch": 0.8552747476116268, + "grad_norm": 7.239196300506592, + "learning_rate": 8.374700527072354e-05, + "loss": 1.1609, + "step": 12623 + }, + { + "epoch": 0.8553425028795989, + "grad_norm": 5.9439544677734375, + "learning_rate": 8.374563625162572e-05, + "loss": 0.8908, + "step": 12624 + }, + { + "epoch": 0.855410258147571, + "grad_norm": 7.268623352050781, + "learning_rate": 8.37442672325279e-05, + "loss": 1.2071, + "step": 12625 + }, + { + "epoch": 0.8554780134155431, + "grad_norm": 8.557291984558105, + "learning_rate": 8.374289821343008e-05, + "loss": 1.0076, + "step": 12626 + }, + { + "epoch": 0.8555457686835152, + "grad_norm": 5.16387939453125, + "learning_rate": 8.374152919433227e-05, + "loss": 0.7862, + "step": 12627 + }, + { + "epoch": 0.8556135239514873, + "grad_norm": 5.909477710723877, + "learning_rate": 8.374016017523445e-05, + "loss": 0.8143, + "step": 12628 + }, + { + "epoch": 0.8556812792194594, + "grad_norm": 6.213109016418457, + "learning_rate": 8.373879115613663e-05, + "loss": 0.8753, + "step": 12629 + }, + { + "epoch": 0.8557490344874314, + "grad_norm": 6.664327621459961, + "learning_rate": 8.373742213703881e-05, + "loss": 0.7626, + "step": 12630 + }, + { + "epoch": 0.8558167897554034, + "grad_norm": 5.750275135040283, + "learning_rate": 8.373605311794101e-05, + "loss": 0.8285, + "step": 12631 + }, + { + "epoch": 0.8558845450233755, + "grad_norm": 6.6249237060546875, + "learning_rate": 8.373468409884319e-05, + "loss": 0.9556, + "step": 12632 + }, + { + "epoch": 0.8559523002913476, + "grad_norm": 5.466978549957275, + "learning_rate": 8.373331507974537e-05, + "loss": 0.9843, + "step": 12633 + }, + { + "epoch": 0.8560200555593197, + "grad_norm": 7.178730487823486, + "learning_rate": 8.373194606064755e-05, + "loss": 0.9921, + "step": 12634 + }, + { + "epoch": 0.8560878108272918, + "grad_norm": 5.387322902679443, + "learning_rate": 8.373057704154973e-05, + "loss": 0.8924, + "step": 12635 + }, + { + "epoch": 0.8561555660952639, + "grad_norm": 6.278534889221191, + "learning_rate": 8.372920802245192e-05, + "loss": 1.05, + "step": 12636 + }, + { + "epoch": 0.856223321363236, + "grad_norm": 6.567953586578369, + "learning_rate": 8.37278390033541e-05, + "loss": 0.9592, + "step": 12637 + }, + { + "epoch": 0.8562910766312081, + "grad_norm": 4.848402976989746, + "learning_rate": 8.372646998425628e-05, + "loss": 0.7223, + "step": 12638 + }, + { + "epoch": 0.8563588318991802, + "grad_norm": 5.792343616485596, + "learning_rate": 8.372510096515846e-05, + "loss": 0.8339, + "step": 12639 + }, + { + "epoch": 0.8564265871671523, + "grad_norm": 7.278899669647217, + "learning_rate": 8.372373194606064e-05, + "loss": 0.9728, + "step": 12640 + }, + { + "epoch": 0.8564943424351243, + "grad_norm": 6.241243839263916, + "learning_rate": 8.372236292696284e-05, + "loss": 0.6706, + "step": 12641 + }, + { + "epoch": 0.8565620977030964, + "grad_norm": 4.546154022216797, + "learning_rate": 8.372099390786502e-05, + "loss": 0.6978, + "step": 12642 + }, + { + "epoch": 0.8566298529710685, + "grad_norm": 5.440952777862549, + "learning_rate": 8.37196248887672e-05, + "loss": 0.5912, + "step": 12643 + }, + { + "epoch": 0.8566976082390406, + "grad_norm": 5.9767656326293945, + "learning_rate": 8.371825586966938e-05, + "loss": 0.8739, + "step": 12644 + }, + { + "epoch": 0.8567653635070127, + "grad_norm": 5.622544288635254, + "learning_rate": 8.371688685057157e-05, + "loss": 0.9851, + "step": 12645 + }, + { + "epoch": 0.8568331187749848, + "grad_norm": 4.423412322998047, + "learning_rate": 8.371551783147375e-05, + "loss": 0.578, + "step": 12646 + }, + { + "epoch": 0.8569008740429569, + "grad_norm": 7.680568695068359, + "learning_rate": 8.371414881237593e-05, + "loss": 0.8078, + "step": 12647 + }, + { + "epoch": 0.8569686293109289, + "grad_norm": 5.702794075012207, + "learning_rate": 8.371277979327811e-05, + "loss": 0.734, + "step": 12648 + }, + { + "epoch": 0.857036384578901, + "grad_norm": 6.302651882171631, + "learning_rate": 8.37114107741803e-05, + "loss": 0.7974, + "step": 12649 + }, + { + "epoch": 0.8571041398468731, + "grad_norm": 6.76057243347168, + "learning_rate": 8.371004175508249e-05, + "loss": 0.8451, + "step": 12650 + }, + { + "epoch": 0.8571718951148452, + "grad_norm": 5.101822376251221, + "learning_rate": 8.370867273598467e-05, + "loss": 0.7528, + "step": 12651 + }, + { + "epoch": 0.8572396503828172, + "grad_norm": 5.3127641677856445, + "learning_rate": 8.370730371688685e-05, + "loss": 0.7881, + "step": 12652 + }, + { + "epoch": 0.8573074056507893, + "grad_norm": 6.015966892242432, + "learning_rate": 8.370593469778903e-05, + "loss": 0.6502, + "step": 12653 + }, + { + "epoch": 0.8573751609187614, + "grad_norm": 4.573479175567627, + "learning_rate": 8.370456567869122e-05, + "loss": 0.5229, + "step": 12654 + }, + { + "epoch": 0.8574429161867335, + "grad_norm": 7.478061676025391, + "learning_rate": 8.37031966595934e-05, + "loss": 1.1038, + "step": 12655 + }, + { + "epoch": 0.8575106714547056, + "grad_norm": 6.543705940246582, + "learning_rate": 8.370182764049558e-05, + "loss": 0.733, + "step": 12656 + }, + { + "epoch": 0.8575784267226777, + "grad_norm": 5.073485851287842, + "learning_rate": 8.370045862139776e-05, + "loss": 0.8401, + "step": 12657 + }, + { + "epoch": 0.8576461819906498, + "grad_norm": 6.257830619812012, + "learning_rate": 8.369908960229996e-05, + "loss": 0.709, + "step": 12658 + }, + { + "epoch": 0.8577139372586219, + "grad_norm": 7.744876384735107, + "learning_rate": 8.369772058320214e-05, + "loss": 0.7146, + "step": 12659 + }, + { + "epoch": 0.857781692526594, + "grad_norm": 6.404613971710205, + "learning_rate": 8.369635156410432e-05, + "loss": 0.7723, + "step": 12660 + }, + { + "epoch": 0.8578494477945661, + "grad_norm": 7.350032806396484, + "learning_rate": 8.369498254500651e-05, + "loss": 0.9622, + "step": 12661 + }, + { + "epoch": 0.8579172030625382, + "grad_norm": 7.5004191398620605, + "learning_rate": 8.369361352590869e-05, + "loss": 0.8981, + "step": 12662 + }, + { + "epoch": 0.8579849583305103, + "grad_norm": 6.804741382598877, + "learning_rate": 8.369224450681087e-05, + "loss": 0.7108, + "step": 12663 + }, + { + "epoch": 0.8580527135984822, + "grad_norm": 6.440323829650879, + "learning_rate": 8.369087548771307e-05, + "loss": 0.7311, + "step": 12664 + }, + { + "epoch": 0.8581204688664543, + "grad_norm": 8.581847190856934, + "learning_rate": 8.368950646861525e-05, + "loss": 0.9141, + "step": 12665 + }, + { + "epoch": 0.8581882241344264, + "grad_norm": 9.157425880432129, + "learning_rate": 8.368813744951743e-05, + "loss": 0.8747, + "step": 12666 + }, + { + "epoch": 0.8582559794023985, + "grad_norm": 5.952433109283447, + "learning_rate": 8.368676843041961e-05, + "loss": 0.6971, + "step": 12667 + }, + { + "epoch": 0.8583237346703706, + "grad_norm": 7.463055610656738, + "learning_rate": 8.36853994113218e-05, + "loss": 0.8685, + "step": 12668 + }, + { + "epoch": 0.8583914899383427, + "grad_norm": 6.422219753265381, + "learning_rate": 8.368403039222398e-05, + "loss": 0.7494, + "step": 12669 + }, + { + "epoch": 0.8584592452063148, + "grad_norm": 7.621025085449219, + "learning_rate": 8.368266137312616e-05, + "loss": 0.7981, + "step": 12670 + }, + { + "epoch": 0.8585270004742869, + "grad_norm": 5.8451690673828125, + "learning_rate": 8.368129235402834e-05, + "loss": 0.6134, + "step": 12671 + }, + { + "epoch": 0.858594755742259, + "grad_norm": 6.098859786987305, + "learning_rate": 8.367992333493052e-05, + "loss": 0.7698, + "step": 12672 + }, + { + "epoch": 0.858662511010231, + "grad_norm": 7.19790506362915, + "learning_rate": 8.367855431583272e-05, + "loss": 0.8783, + "step": 12673 + }, + { + "epoch": 0.8587302662782031, + "grad_norm": 7.8366899490356445, + "learning_rate": 8.36771852967349e-05, + "loss": 0.8574, + "step": 12674 + }, + { + "epoch": 0.8587980215461752, + "grad_norm": 6.070743083953857, + "learning_rate": 8.367581627763708e-05, + "loss": 0.9785, + "step": 12675 + }, + { + "epoch": 0.8588657768141473, + "grad_norm": 7.112607002258301, + "learning_rate": 8.367444725853926e-05, + "loss": 0.8502, + "step": 12676 + }, + { + "epoch": 0.8589335320821194, + "grad_norm": 5.572093963623047, + "learning_rate": 8.367307823944145e-05, + "loss": 0.9328, + "step": 12677 + }, + { + "epoch": 0.8590012873500915, + "grad_norm": 5.674850940704346, + "learning_rate": 8.367170922034363e-05, + "loss": 0.8259, + "step": 12678 + }, + { + "epoch": 0.8590690426180636, + "grad_norm": 5.8405890464782715, + "learning_rate": 8.367034020124581e-05, + "loss": 0.6563, + "step": 12679 + }, + { + "epoch": 0.8591367978860356, + "grad_norm": 5.2222981452941895, + "learning_rate": 8.366897118214799e-05, + "loss": 0.7247, + "step": 12680 + }, + { + "epoch": 0.8592045531540077, + "grad_norm": 5.854241847991943, + "learning_rate": 8.366760216305017e-05, + "loss": 0.5706, + "step": 12681 + }, + { + "epoch": 0.8592723084219798, + "grad_norm": 6.589792251586914, + "learning_rate": 8.366623314395237e-05, + "loss": 0.6776, + "step": 12682 + }, + { + "epoch": 0.8593400636899519, + "grad_norm": 5.009568691253662, + "learning_rate": 8.366486412485455e-05, + "loss": 0.7079, + "step": 12683 + }, + { + "epoch": 0.859407818957924, + "grad_norm": 5.921725749969482, + "learning_rate": 8.366349510575673e-05, + "loss": 0.9658, + "step": 12684 + }, + { + "epoch": 0.859475574225896, + "grad_norm": 7.070530414581299, + "learning_rate": 8.366212608665891e-05, + "loss": 0.7979, + "step": 12685 + }, + { + "epoch": 0.8595433294938681, + "grad_norm": 8.296648979187012, + "learning_rate": 8.36607570675611e-05, + "loss": 0.8313, + "step": 12686 + }, + { + "epoch": 0.8596110847618402, + "grad_norm": 5.253691673278809, + "learning_rate": 8.365938804846328e-05, + "loss": 0.7979, + "step": 12687 + }, + { + "epoch": 0.8596788400298123, + "grad_norm": 6.794055461883545, + "learning_rate": 8.365801902936546e-05, + "loss": 0.8337, + "step": 12688 + }, + { + "epoch": 0.8597465952977844, + "grad_norm": 5.086427211761475, + "learning_rate": 8.365665001026764e-05, + "loss": 0.4667, + "step": 12689 + }, + { + "epoch": 0.8598143505657565, + "grad_norm": 5.407829761505127, + "learning_rate": 8.365528099116982e-05, + "loss": 0.7093, + "step": 12690 + }, + { + "epoch": 0.8598821058337286, + "grad_norm": 5.971084117889404, + "learning_rate": 8.365391197207202e-05, + "loss": 1.0033, + "step": 12691 + }, + { + "epoch": 0.8599498611017007, + "grad_norm": 5.477384090423584, + "learning_rate": 8.36525429529742e-05, + "loss": 0.5806, + "step": 12692 + }, + { + "epoch": 0.8600176163696728, + "grad_norm": 7.340150833129883, + "learning_rate": 8.365117393387638e-05, + "loss": 0.7215, + "step": 12693 + }, + { + "epoch": 0.8600853716376449, + "grad_norm": 5.252416133880615, + "learning_rate": 8.364980491477856e-05, + "loss": 0.7346, + "step": 12694 + }, + { + "epoch": 0.860153126905617, + "grad_norm": 7.507197380065918, + "learning_rate": 8.364843589568074e-05, + "loss": 1.0633, + "step": 12695 + }, + { + "epoch": 0.860220882173589, + "grad_norm": 4.731564998626709, + "learning_rate": 8.364706687658293e-05, + "loss": 0.5687, + "step": 12696 + }, + { + "epoch": 0.860288637441561, + "grad_norm": 6.081350803375244, + "learning_rate": 8.364569785748511e-05, + "loss": 0.7199, + "step": 12697 + }, + { + "epoch": 0.8603563927095331, + "grad_norm": 6.91575288772583, + "learning_rate": 8.36443288383873e-05, + "loss": 0.9717, + "step": 12698 + }, + { + "epoch": 0.8604241479775052, + "grad_norm": 5.101013660430908, + "learning_rate": 8.364295981928947e-05, + "loss": 0.772, + "step": 12699 + }, + { + "epoch": 0.8604919032454773, + "grad_norm": 6.185006141662598, + "learning_rate": 8.364159080019167e-05, + "loss": 0.601, + "step": 12700 + }, + { + "epoch": 0.8605596585134494, + "grad_norm": 6.696321487426758, + "learning_rate": 8.364022178109385e-05, + "loss": 0.8923, + "step": 12701 + }, + { + "epoch": 0.8606274137814215, + "grad_norm": 4.8003129959106445, + "learning_rate": 8.363885276199603e-05, + "loss": 0.656, + "step": 12702 + }, + { + "epoch": 0.8606951690493936, + "grad_norm": 8.93622875213623, + "learning_rate": 8.363748374289821e-05, + "loss": 0.8062, + "step": 12703 + }, + { + "epoch": 0.8607629243173657, + "grad_norm": 5.659854412078857, + "learning_rate": 8.36361147238004e-05, + "loss": 0.8201, + "step": 12704 + }, + { + "epoch": 0.8608306795853378, + "grad_norm": 6.121464729309082, + "learning_rate": 8.363474570470258e-05, + "loss": 0.8329, + "step": 12705 + }, + { + "epoch": 0.8608984348533099, + "grad_norm": 5.242605686187744, + "learning_rate": 8.363337668560476e-05, + "loss": 0.7151, + "step": 12706 + }, + { + "epoch": 0.860966190121282, + "grad_norm": 5.1735382080078125, + "learning_rate": 8.363200766650696e-05, + "loss": 0.7501, + "step": 12707 + }, + { + "epoch": 0.861033945389254, + "grad_norm": 7.21480655670166, + "learning_rate": 8.363063864740914e-05, + "loss": 0.9973, + "step": 12708 + }, + { + "epoch": 0.8611017006572261, + "grad_norm": 6.435115814208984, + "learning_rate": 8.362926962831132e-05, + "loss": 0.7217, + "step": 12709 + }, + { + "epoch": 0.8611694559251982, + "grad_norm": 6.879031658172607, + "learning_rate": 8.362790060921351e-05, + "loss": 0.9322, + "step": 12710 + }, + { + "epoch": 0.8612372111931703, + "grad_norm": 5.913161277770996, + "learning_rate": 8.362653159011569e-05, + "loss": 0.6451, + "step": 12711 + }, + { + "epoch": 0.8613049664611424, + "grad_norm": 4.3174920082092285, + "learning_rate": 8.362516257101787e-05, + "loss": 0.682, + "step": 12712 + }, + { + "epoch": 0.8613727217291144, + "grad_norm": 5.966519832611084, + "learning_rate": 8.362379355192005e-05, + "loss": 0.6362, + "step": 12713 + }, + { + "epoch": 0.8614404769970865, + "grad_norm": 6.3389410972595215, + "learning_rate": 8.362242453282225e-05, + "loss": 0.7318, + "step": 12714 + }, + { + "epoch": 0.8615082322650586, + "grad_norm": 8.78852653503418, + "learning_rate": 8.362105551372443e-05, + "loss": 0.7471, + "step": 12715 + }, + { + "epoch": 0.8615759875330307, + "grad_norm": 5.232541084289551, + "learning_rate": 8.361968649462661e-05, + "loss": 0.7463, + "step": 12716 + }, + { + "epoch": 0.8616437428010028, + "grad_norm": 6.598004341125488, + "learning_rate": 8.361831747552879e-05, + "loss": 0.7651, + "step": 12717 + }, + { + "epoch": 0.8617114980689748, + "grad_norm": 6.944410800933838, + "learning_rate": 8.361694845643097e-05, + "loss": 0.8616, + "step": 12718 + }, + { + "epoch": 0.8617792533369469, + "grad_norm": 5.001755714416504, + "learning_rate": 8.361557943733316e-05, + "loss": 0.6435, + "step": 12719 + }, + { + "epoch": 0.861847008604919, + "grad_norm": 5.759768486022949, + "learning_rate": 8.361421041823534e-05, + "loss": 0.8236, + "step": 12720 + }, + { + "epoch": 0.8619147638728911, + "grad_norm": 7.86954927444458, + "learning_rate": 8.361284139913752e-05, + "loss": 0.9191, + "step": 12721 + }, + { + "epoch": 0.8619825191408632, + "grad_norm": 7.7088541984558105, + "learning_rate": 8.36114723800397e-05, + "loss": 0.874, + "step": 12722 + }, + { + "epoch": 0.8620502744088353, + "grad_norm": 5.526132583618164, + "learning_rate": 8.36101033609419e-05, + "loss": 0.8083, + "step": 12723 + }, + { + "epoch": 0.8621180296768074, + "grad_norm": 6.153379440307617, + "learning_rate": 8.360873434184408e-05, + "loss": 0.8097, + "step": 12724 + }, + { + "epoch": 0.8621857849447795, + "grad_norm": 6.498526573181152, + "learning_rate": 8.360736532274626e-05, + "loss": 0.8441, + "step": 12725 + }, + { + "epoch": 0.8622535402127516, + "grad_norm": 6.869676113128662, + "learning_rate": 8.360599630364844e-05, + "loss": 0.8353, + "step": 12726 + }, + { + "epoch": 0.8623212954807237, + "grad_norm": 5.733700275421143, + "learning_rate": 8.360462728455062e-05, + "loss": 0.6627, + "step": 12727 + }, + { + "epoch": 0.8623890507486958, + "grad_norm": 5.4007415771484375, + "learning_rate": 8.360325826545281e-05, + "loss": 0.5945, + "step": 12728 + }, + { + "epoch": 0.8624568060166677, + "grad_norm": 5.028225898742676, + "learning_rate": 8.360188924635499e-05, + "loss": 0.7625, + "step": 12729 + }, + { + "epoch": 0.8625245612846398, + "grad_norm": 6.950478553771973, + "learning_rate": 8.360052022725717e-05, + "loss": 0.9458, + "step": 12730 + }, + { + "epoch": 0.8625923165526119, + "grad_norm": 5.532278060913086, + "learning_rate": 8.359915120815935e-05, + "loss": 0.7427, + "step": 12731 + }, + { + "epoch": 0.862660071820584, + "grad_norm": 6.450150966644287, + "learning_rate": 8.359778218906155e-05, + "loss": 0.7621, + "step": 12732 + }, + { + "epoch": 0.8627278270885561, + "grad_norm": 8.56269645690918, + "learning_rate": 8.359641316996373e-05, + "loss": 1.0904, + "step": 12733 + }, + { + "epoch": 0.8627955823565282, + "grad_norm": 5.204941272735596, + "learning_rate": 8.359504415086591e-05, + "loss": 0.6651, + "step": 12734 + }, + { + "epoch": 0.8628633376245003, + "grad_norm": 6.284374237060547, + "learning_rate": 8.359367513176809e-05, + "loss": 0.7062, + "step": 12735 + }, + { + "epoch": 0.8629310928924724, + "grad_norm": 5.432925224304199, + "learning_rate": 8.359230611267027e-05, + "loss": 0.7484, + "step": 12736 + }, + { + "epoch": 0.8629988481604445, + "grad_norm": 8.31678581237793, + "learning_rate": 8.359093709357246e-05, + "loss": 0.7803, + "step": 12737 + }, + { + "epoch": 0.8630666034284166, + "grad_norm": 5.666910171508789, + "learning_rate": 8.358956807447464e-05, + "loss": 0.6977, + "step": 12738 + }, + { + "epoch": 0.8631343586963887, + "grad_norm": 5.904063701629639, + "learning_rate": 8.358819905537682e-05, + "loss": 0.9111, + "step": 12739 + }, + { + "epoch": 0.8632021139643608, + "grad_norm": 5.75467586517334, + "learning_rate": 8.3586830036279e-05, + "loss": 0.8442, + "step": 12740 + }, + { + "epoch": 0.8632698692323328, + "grad_norm": 6.118785381317139, + "learning_rate": 8.35854610171812e-05, + "loss": 0.8948, + "step": 12741 + }, + { + "epoch": 0.8633376245003049, + "grad_norm": 6.068502426147461, + "learning_rate": 8.358409199808338e-05, + "loss": 0.6608, + "step": 12742 + }, + { + "epoch": 0.863405379768277, + "grad_norm": 6.425013542175293, + "learning_rate": 8.358272297898556e-05, + "loss": 0.6886, + "step": 12743 + }, + { + "epoch": 0.8634731350362491, + "grad_norm": 6.034631729125977, + "learning_rate": 8.358135395988774e-05, + "loss": 0.7105, + "step": 12744 + }, + { + "epoch": 0.8635408903042211, + "grad_norm": 5.688156604766846, + "learning_rate": 8.357998494078992e-05, + "loss": 0.7705, + "step": 12745 + }, + { + "epoch": 0.8636086455721932, + "grad_norm": 6.057868480682373, + "learning_rate": 8.357861592169211e-05, + "loss": 0.7228, + "step": 12746 + }, + { + "epoch": 0.8636764008401653, + "grad_norm": 9.307186126708984, + "learning_rate": 8.357724690259429e-05, + "loss": 0.8587, + "step": 12747 + }, + { + "epoch": 0.8637441561081374, + "grad_norm": 4.899465560913086, + "learning_rate": 8.357587788349647e-05, + "loss": 0.78, + "step": 12748 + }, + { + "epoch": 0.8638119113761095, + "grad_norm": 5.833337783813477, + "learning_rate": 8.357450886439865e-05, + "loss": 0.8024, + "step": 12749 + }, + { + "epoch": 0.8638796666440816, + "grad_norm": 5.353335380554199, + "learning_rate": 8.357313984530085e-05, + "loss": 0.9984, + "step": 12750 + }, + { + "epoch": 0.8639474219120536, + "grad_norm": 5.963881969451904, + "learning_rate": 8.357177082620303e-05, + "loss": 0.6835, + "step": 12751 + }, + { + "epoch": 0.8640151771800257, + "grad_norm": 5.2977166175842285, + "learning_rate": 8.357040180710521e-05, + "loss": 0.8039, + "step": 12752 + }, + { + "epoch": 0.8640829324479978, + "grad_norm": 5.586292743682861, + "learning_rate": 8.35690327880074e-05, + "loss": 0.8733, + "step": 12753 + }, + { + "epoch": 0.8641506877159699, + "grad_norm": 8.01198959350586, + "learning_rate": 8.356766376890958e-05, + "loss": 1.0226, + "step": 12754 + }, + { + "epoch": 0.864218442983942, + "grad_norm": 5.4859089851379395, + "learning_rate": 8.356629474981176e-05, + "loss": 0.9956, + "step": 12755 + }, + { + "epoch": 0.8642861982519141, + "grad_norm": 6.660008907318115, + "learning_rate": 8.356492573071396e-05, + "loss": 0.8324, + "step": 12756 + }, + { + "epoch": 0.8643539535198862, + "grad_norm": 4.699826717376709, + "learning_rate": 8.356355671161614e-05, + "loss": 0.6846, + "step": 12757 + }, + { + "epoch": 0.8644217087878583, + "grad_norm": 5.154027938842773, + "learning_rate": 8.356218769251832e-05, + "loss": 0.7131, + "step": 12758 + }, + { + "epoch": 0.8644894640558304, + "grad_norm": 6.356171131134033, + "learning_rate": 8.35608186734205e-05, + "loss": 0.9622, + "step": 12759 + }, + { + "epoch": 0.8645572193238025, + "grad_norm": 4.833785057067871, + "learning_rate": 8.355944965432269e-05, + "loss": 0.7694, + "step": 12760 + }, + { + "epoch": 0.8646249745917746, + "grad_norm": 5.7593889236450195, + "learning_rate": 8.355808063522487e-05, + "loss": 0.5833, + "step": 12761 + }, + { + "epoch": 0.8646927298597465, + "grad_norm": 5.679107189178467, + "learning_rate": 8.355671161612705e-05, + "loss": 0.7964, + "step": 12762 + }, + { + "epoch": 0.8647604851277186, + "grad_norm": 7.541195869445801, + "learning_rate": 8.355534259702923e-05, + "loss": 0.7552, + "step": 12763 + }, + { + "epoch": 0.8648282403956907, + "grad_norm": 7.550341606140137, + "learning_rate": 8.355397357793143e-05, + "loss": 0.9926, + "step": 12764 + }, + { + "epoch": 0.8648959956636628, + "grad_norm": 5.840928554534912, + "learning_rate": 8.35526045588336e-05, + "loss": 0.7363, + "step": 12765 + }, + { + "epoch": 0.8649637509316349, + "grad_norm": 5.607713222503662, + "learning_rate": 8.355123553973579e-05, + "loss": 0.7449, + "step": 12766 + }, + { + "epoch": 0.865031506199607, + "grad_norm": 4.359575271606445, + "learning_rate": 8.354986652063797e-05, + "loss": 0.7983, + "step": 12767 + }, + { + "epoch": 0.8650992614675791, + "grad_norm": 5.7104363441467285, + "learning_rate": 8.354849750154015e-05, + "loss": 0.7692, + "step": 12768 + }, + { + "epoch": 0.8651670167355512, + "grad_norm": 6.246327877044678, + "learning_rate": 8.354712848244234e-05, + "loss": 0.8406, + "step": 12769 + }, + { + "epoch": 0.8652347720035233, + "grad_norm": 5.536504745483398, + "learning_rate": 8.354575946334452e-05, + "loss": 0.8198, + "step": 12770 + }, + { + "epoch": 0.8653025272714954, + "grad_norm": 6.570160388946533, + "learning_rate": 8.35443904442467e-05, + "loss": 0.4944, + "step": 12771 + }, + { + "epoch": 0.8653702825394675, + "grad_norm": 4.442549705505371, + "learning_rate": 8.354302142514888e-05, + "loss": 0.6964, + "step": 12772 + }, + { + "epoch": 0.8654380378074396, + "grad_norm": 5.418501853942871, + "learning_rate": 8.354165240605106e-05, + "loss": 0.7211, + "step": 12773 + }, + { + "epoch": 0.8655057930754116, + "grad_norm": 8.48850154876709, + "learning_rate": 8.354028338695326e-05, + "loss": 0.7467, + "step": 12774 + }, + { + "epoch": 0.8655735483433837, + "grad_norm": 7.069798946380615, + "learning_rate": 8.353891436785544e-05, + "loss": 0.6164, + "step": 12775 + }, + { + "epoch": 0.8656413036113558, + "grad_norm": 9.349570274353027, + "learning_rate": 8.353754534875762e-05, + "loss": 0.7237, + "step": 12776 + }, + { + "epoch": 0.8657090588793279, + "grad_norm": 6.17117977142334, + "learning_rate": 8.35361763296598e-05, + "loss": 0.9118, + "step": 12777 + }, + { + "epoch": 0.8657768141472999, + "grad_norm": 6.606432914733887, + "learning_rate": 8.353480731056199e-05, + "loss": 0.7991, + "step": 12778 + }, + { + "epoch": 0.865844569415272, + "grad_norm": 8.163229942321777, + "learning_rate": 8.353343829146417e-05, + "loss": 1.0244, + "step": 12779 + }, + { + "epoch": 0.8659123246832441, + "grad_norm": 8.662758827209473, + "learning_rate": 8.353206927236635e-05, + "loss": 0.9709, + "step": 12780 + }, + { + "epoch": 0.8659800799512162, + "grad_norm": 5.596740245819092, + "learning_rate": 8.353070025326853e-05, + "loss": 0.6034, + "step": 12781 + }, + { + "epoch": 0.8660478352191883, + "grad_norm": 6.353254795074463, + "learning_rate": 8.352933123417071e-05, + "loss": 0.8139, + "step": 12782 + }, + { + "epoch": 0.8661155904871604, + "grad_norm": 6.522273540496826, + "learning_rate": 8.352796221507291e-05, + "loss": 0.7098, + "step": 12783 + }, + { + "epoch": 0.8661833457551324, + "grad_norm": 6.948729038238525, + "learning_rate": 8.352659319597509e-05, + "loss": 0.8503, + "step": 12784 + }, + { + "epoch": 0.8662511010231045, + "grad_norm": 4.999991416931152, + "learning_rate": 8.352522417687727e-05, + "loss": 0.8581, + "step": 12785 + }, + { + "epoch": 0.8663188562910766, + "grad_norm": 5.613487243652344, + "learning_rate": 8.352385515777945e-05, + "loss": 0.7931, + "step": 12786 + }, + { + "epoch": 0.8663866115590487, + "grad_norm": 4.850801944732666, + "learning_rate": 8.352248613868164e-05, + "loss": 0.8297, + "step": 12787 + }, + { + "epoch": 0.8664543668270208, + "grad_norm": 6.223756313323975, + "learning_rate": 8.352111711958382e-05, + "loss": 0.7722, + "step": 12788 + }, + { + "epoch": 0.8665221220949929, + "grad_norm": 6.001883029937744, + "learning_rate": 8.3519748100486e-05, + "loss": 0.7325, + "step": 12789 + }, + { + "epoch": 0.866589877362965, + "grad_norm": 6.25042724609375, + "learning_rate": 8.351837908138818e-05, + "loss": 0.5647, + "step": 12790 + }, + { + "epoch": 0.8666576326309371, + "grad_norm": 5.111363410949707, + "learning_rate": 8.351701006229036e-05, + "loss": 0.6094, + "step": 12791 + }, + { + "epoch": 0.8667253878989092, + "grad_norm": 6.161365985870361, + "learning_rate": 8.351564104319256e-05, + "loss": 0.9958, + "step": 12792 + }, + { + "epoch": 0.8667931431668813, + "grad_norm": 7.109573841094971, + "learning_rate": 8.351427202409474e-05, + "loss": 0.8822, + "step": 12793 + }, + { + "epoch": 0.8668608984348533, + "grad_norm": 5.816695690155029, + "learning_rate": 8.351290300499692e-05, + "loss": 0.7643, + "step": 12794 + }, + { + "epoch": 0.8669286537028253, + "grad_norm": 6.266007423400879, + "learning_rate": 8.35115339858991e-05, + "loss": 0.6835, + "step": 12795 + }, + { + "epoch": 0.8669964089707974, + "grad_norm": 5.289031505584717, + "learning_rate": 8.351016496680129e-05, + "loss": 0.6873, + "step": 12796 + }, + { + "epoch": 0.8670641642387695, + "grad_norm": 6.6583251953125, + "learning_rate": 8.350879594770347e-05, + "loss": 0.9201, + "step": 12797 + }, + { + "epoch": 0.8671319195067416, + "grad_norm": 8.268150329589844, + "learning_rate": 8.350742692860565e-05, + "loss": 1.0113, + "step": 12798 + }, + { + "epoch": 0.8671996747747137, + "grad_norm": 5.9207377433776855, + "learning_rate": 8.350605790950783e-05, + "loss": 0.8132, + "step": 12799 + }, + { + "epoch": 0.8672674300426858, + "grad_norm": 7.960853099822998, + "learning_rate": 8.350468889041003e-05, + "loss": 0.8865, + "step": 12800 + }, + { + "epoch": 0.8673351853106579, + "grad_norm": 7.3755364418029785, + "learning_rate": 8.350331987131221e-05, + "loss": 1.0102, + "step": 12801 + }, + { + "epoch": 0.86740294057863, + "grad_norm": 5.826414108276367, + "learning_rate": 8.35019508522144e-05, + "loss": 0.6119, + "step": 12802 + }, + { + "epoch": 0.8674706958466021, + "grad_norm": 4.6021857261657715, + "learning_rate": 8.350058183311658e-05, + "loss": 0.7426, + "step": 12803 + }, + { + "epoch": 0.8675384511145742, + "grad_norm": 6.744377136230469, + "learning_rate": 8.349921281401876e-05, + "loss": 0.6373, + "step": 12804 + }, + { + "epoch": 0.8676062063825463, + "grad_norm": 5.894944190979004, + "learning_rate": 8.349784379492094e-05, + "loss": 0.8591, + "step": 12805 + }, + { + "epoch": 0.8676739616505184, + "grad_norm": 6.757943630218506, + "learning_rate": 8.349647477582314e-05, + "loss": 0.8015, + "step": 12806 + }, + { + "epoch": 0.8677417169184904, + "grad_norm": 4.994389057159424, + "learning_rate": 8.349510575672532e-05, + "loss": 0.8663, + "step": 12807 + }, + { + "epoch": 0.8678094721864625, + "grad_norm": 6.01758337020874, + "learning_rate": 8.34937367376275e-05, + "loss": 0.8132, + "step": 12808 + }, + { + "epoch": 0.8678772274544346, + "grad_norm": 5.205808639526367, + "learning_rate": 8.349236771852968e-05, + "loss": 0.7275, + "step": 12809 + }, + { + "epoch": 0.8679449827224067, + "grad_norm": 6.259244441986084, + "learning_rate": 8.349099869943187e-05, + "loss": 0.9726, + "step": 12810 + }, + { + "epoch": 0.8680127379903787, + "grad_norm": 9.059528350830078, + "learning_rate": 8.348962968033405e-05, + "loss": 0.9158, + "step": 12811 + }, + { + "epoch": 0.8680804932583508, + "grad_norm": 4.859555244445801, + "learning_rate": 8.348826066123623e-05, + "loss": 1.0459, + "step": 12812 + }, + { + "epoch": 0.8681482485263229, + "grad_norm": 5.262827396392822, + "learning_rate": 8.348689164213841e-05, + "loss": 0.6795, + "step": 12813 + }, + { + "epoch": 0.868216003794295, + "grad_norm": 7.7913818359375, + "learning_rate": 8.348552262304059e-05, + "loss": 0.9334, + "step": 12814 + }, + { + "epoch": 0.8682837590622671, + "grad_norm": 6.01145601272583, + "learning_rate": 8.348415360394279e-05, + "loss": 0.8221, + "step": 12815 + }, + { + "epoch": 0.8683515143302392, + "grad_norm": 6.34686279296875, + "learning_rate": 8.348278458484497e-05, + "loss": 0.7442, + "step": 12816 + }, + { + "epoch": 0.8684192695982113, + "grad_norm": 6.925653457641602, + "learning_rate": 8.348141556574715e-05, + "loss": 0.9746, + "step": 12817 + }, + { + "epoch": 0.8684870248661833, + "grad_norm": 5.890630722045898, + "learning_rate": 8.348004654664933e-05, + "loss": 0.8322, + "step": 12818 + }, + { + "epoch": 0.8685547801341554, + "grad_norm": 5.628925800323486, + "learning_rate": 8.347867752755152e-05, + "loss": 0.7154, + "step": 12819 + }, + { + "epoch": 0.8686225354021275, + "grad_norm": 5.708745956420898, + "learning_rate": 8.34773085084537e-05, + "loss": 0.5718, + "step": 12820 + }, + { + "epoch": 0.8686902906700996, + "grad_norm": 5.534167289733887, + "learning_rate": 8.347593948935588e-05, + "loss": 0.84, + "step": 12821 + }, + { + "epoch": 0.8687580459380717, + "grad_norm": 6.858389854431152, + "learning_rate": 8.347457047025806e-05, + "loss": 0.7687, + "step": 12822 + }, + { + "epoch": 0.8688258012060438, + "grad_norm": 5.650959491729736, + "learning_rate": 8.347320145116024e-05, + "loss": 0.8645, + "step": 12823 + }, + { + "epoch": 0.8688935564740159, + "grad_norm": 5.0962324142456055, + "learning_rate": 8.347183243206244e-05, + "loss": 0.6123, + "step": 12824 + }, + { + "epoch": 0.868961311741988, + "grad_norm": 7.235836029052734, + "learning_rate": 8.347046341296462e-05, + "loss": 0.831, + "step": 12825 + }, + { + "epoch": 0.8690290670099601, + "grad_norm": 5.751955986022949, + "learning_rate": 8.34690943938668e-05, + "loss": 0.6362, + "step": 12826 + }, + { + "epoch": 0.8690968222779321, + "grad_norm": 6.159754276275635, + "learning_rate": 8.346772537476898e-05, + "loss": 0.6077, + "step": 12827 + }, + { + "epoch": 0.8691645775459041, + "grad_norm": 5.868882179260254, + "learning_rate": 8.346635635567116e-05, + "loss": 0.6687, + "step": 12828 + }, + { + "epoch": 0.8692323328138762, + "grad_norm": 6.147858142852783, + "learning_rate": 8.346498733657335e-05, + "loss": 0.7494, + "step": 12829 + }, + { + "epoch": 0.8693000880818483, + "grad_norm": 7.173940181732178, + "learning_rate": 8.346361831747553e-05, + "loss": 0.906, + "step": 12830 + }, + { + "epoch": 0.8693678433498204, + "grad_norm": 6.0628814697265625, + "learning_rate": 8.346224929837771e-05, + "loss": 0.6627, + "step": 12831 + }, + { + "epoch": 0.8694355986177925, + "grad_norm": 5.7082343101501465, + "learning_rate": 8.346088027927989e-05, + "loss": 0.7305, + "step": 12832 + }, + { + "epoch": 0.8695033538857646, + "grad_norm": 7.864633083343506, + "learning_rate": 8.345951126018209e-05, + "loss": 0.7189, + "step": 12833 + }, + { + "epoch": 0.8695711091537367, + "grad_norm": 6.37849235534668, + "learning_rate": 8.345814224108427e-05, + "loss": 0.6766, + "step": 12834 + }, + { + "epoch": 0.8696388644217088, + "grad_norm": 5.001320838928223, + "learning_rate": 8.345677322198645e-05, + "loss": 0.6396, + "step": 12835 + }, + { + "epoch": 0.8697066196896809, + "grad_norm": 5.3315606117248535, + "learning_rate": 8.345540420288863e-05, + "loss": 0.7312, + "step": 12836 + }, + { + "epoch": 0.869774374957653, + "grad_norm": 8.665613174438477, + "learning_rate": 8.345403518379081e-05, + "loss": 0.7123, + "step": 12837 + }, + { + "epoch": 0.8698421302256251, + "grad_norm": 6.103067874908447, + "learning_rate": 8.3452666164693e-05, + "loss": 0.6624, + "step": 12838 + }, + { + "epoch": 0.8699098854935972, + "grad_norm": 5.679276943206787, + "learning_rate": 8.345129714559518e-05, + "loss": 0.8496, + "step": 12839 + }, + { + "epoch": 0.8699776407615692, + "grad_norm": 5.381316184997559, + "learning_rate": 8.344992812649736e-05, + "loss": 0.5979, + "step": 12840 + }, + { + "epoch": 0.8700453960295413, + "grad_norm": 7.85809326171875, + "learning_rate": 8.344855910739954e-05, + "loss": 0.934, + "step": 12841 + }, + { + "epoch": 0.8701131512975134, + "grad_norm": 5.871927738189697, + "learning_rate": 8.344719008830174e-05, + "loss": 0.8423, + "step": 12842 + }, + { + "epoch": 0.8701809065654854, + "grad_norm": 7.4773268699646, + "learning_rate": 8.344582106920392e-05, + "loss": 0.9766, + "step": 12843 + }, + { + "epoch": 0.8702486618334575, + "grad_norm": 5.408422946929932, + "learning_rate": 8.34444520501061e-05, + "loss": 0.8514, + "step": 12844 + }, + { + "epoch": 0.8703164171014296, + "grad_norm": 5.86200475692749, + "learning_rate": 8.344308303100828e-05, + "loss": 0.7288, + "step": 12845 + }, + { + "epoch": 0.8703841723694017, + "grad_norm": 5.635140419006348, + "learning_rate": 8.344171401191047e-05, + "loss": 0.7645, + "step": 12846 + }, + { + "epoch": 0.8704519276373738, + "grad_norm": 4.9930524826049805, + "learning_rate": 8.344034499281265e-05, + "loss": 0.6171, + "step": 12847 + }, + { + "epoch": 0.8705196829053459, + "grad_norm": 7.424667835235596, + "learning_rate": 8.343897597371483e-05, + "loss": 0.6401, + "step": 12848 + }, + { + "epoch": 0.870587438173318, + "grad_norm": 6.124965667724609, + "learning_rate": 8.343760695461703e-05, + "loss": 0.8959, + "step": 12849 + }, + { + "epoch": 0.87065519344129, + "grad_norm": 7.060937881469727, + "learning_rate": 8.34362379355192e-05, + "loss": 0.9213, + "step": 12850 + }, + { + "epoch": 0.8707229487092621, + "grad_norm": 5.401867389678955, + "learning_rate": 8.343486891642139e-05, + "loss": 0.6287, + "step": 12851 + }, + { + "epoch": 0.8707907039772342, + "grad_norm": 6.958117485046387, + "learning_rate": 8.343349989732358e-05, + "loss": 0.9408, + "step": 12852 + }, + { + "epoch": 0.8708584592452063, + "grad_norm": 6.046300888061523, + "learning_rate": 8.343213087822576e-05, + "loss": 0.875, + "step": 12853 + }, + { + "epoch": 0.8709262145131784, + "grad_norm": 6.581618785858154, + "learning_rate": 8.343076185912794e-05, + "loss": 0.9302, + "step": 12854 + }, + { + "epoch": 0.8709939697811505, + "grad_norm": 6.795581817626953, + "learning_rate": 8.342939284003012e-05, + "loss": 0.8144, + "step": 12855 + }, + { + "epoch": 0.8710617250491226, + "grad_norm": 7.541666030883789, + "learning_rate": 8.342802382093232e-05, + "loss": 0.7349, + "step": 12856 + }, + { + "epoch": 0.8711294803170947, + "grad_norm": 7.175131320953369, + "learning_rate": 8.34266548018345e-05, + "loss": 0.9659, + "step": 12857 + }, + { + "epoch": 0.8711972355850668, + "grad_norm": 4.722870349884033, + "learning_rate": 8.342528578273668e-05, + "loss": 0.819, + "step": 12858 + }, + { + "epoch": 0.8712649908530389, + "grad_norm": 6.122445106506348, + "learning_rate": 8.342391676363886e-05, + "loss": 0.839, + "step": 12859 + }, + { + "epoch": 0.8713327461210109, + "grad_norm": 6.808223724365234, + "learning_rate": 8.342254774454104e-05, + "loss": 0.8034, + "step": 12860 + }, + { + "epoch": 0.871400501388983, + "grad_norm": 5.828917026519775, + "learning_rate": 8.342117872544323e-05, + "loss": 0.6161, + "step": 12861 + }, + { + "epoch": 0.871468256656955, + "grad_norm": 5.570794582366943, + "learning_rate": 8.341980970634541e-05, + "loss": 0.738, + "step": 12862 + }, + { + "epoch": 0.8715360119249271, + "grad_norm": 8.307096481323242, + "learning_rate": 8.341844068724759e-05, + "loss": 0.6682, + "step": 12863 + }, + { + "epoch": 0.8716037671928992, + "grad_norm": 5.596463203430176, + "learning_rate": 8.341707166814977e-05, + "loss": 0.7721, + "step": 12864 + }, + { + "epoch": 0.8716715224608713, + "grad_norm": 6.285654067993164, + "learning_rate": 8.341570264905197e-05, + "loss": 0.8688, + "step": 12865 + }, + { + "epoch": 0.8717392777288434, + "grad_norm": 5.791274070739746, + "learning_rate": 8.341433362995415e-05, + "loss": 0.6696, + "step": 12866 + }, + { + "epoch": 0.8718070329968155, + "grad_norm": 6.1828413009643555, + "learning_rate": 8.341296461085633e-05, + "loss": 0.6573, + "step": 12867 + }, + { + "epoch": 0.8718747882647876, + "grad_norm": 6.662983417510986, + "learning_rate": 8.341159559175851e-05, + "loss": 1.0418, + "step": 12868 + }, + { + "epoch": 0.8719425435327597, + "grad_norm": 6.25025749206543, + "learning_rate": 8.341022657266069e-05, + "loss": 0.8527, + "step": 12869 + }, + { + "epoch": 0.8720102988007318, + "grad_norm": 5.201798915863037, + "learning_rate": 8.340885755356288e-05, + "loss": 1.0302, + "step": 12870 + }, + { + "epoch": 0.8720780540687039, + "grad_norm": 7.105745315551758, + "learning_rate": 8.340748853446506e-05, + "loss": 0.8022, + "step": 12871 + }, + { + "epoch": 0.872145809336676, + "grad_norm": 5.172214031219482, + "learning_rate": 8.340611951536724e-05, + "loss": 0.7184, + "step": 12872 + }, + { + "epoch": 0.872213564604648, + "grad_norm": 4.862717151641846, + "learning_rate": 8.340475049626942e-05, + "loss": 0.5587, + "step": 12873 + }, + { + "epoch": 0.8722813198726201, + "grad_norm": 5.38953971862793, + "learning_rate": 8.340338147717162e-05, + "loss": 0.7139, + "step": 12874 + }, + { + "epoch": 0.8723490751405922, + "grad_norm": 6.497892379760742, + "learning_rate": 8.34020124580738e-05, + "loss": 0.7788, + "step": 12875 + }, + { + "epoch": 0.8724168304085642, + "grad_norm": 5.505675792694092, + "learning_rate": 8.340064343897598e-05, + "loss": 0.8322, + "step": 12876 + }, + { + "epoch": 0.8724845856765363, + "grad_norm": 6.221973419189453, + "learning_rate": 8.339927441987816e-05, + "loss": 0.9316, + "step": 12877 + }, + { + "epoch": 0.8725523409445084, + "grad_norm": 5.295744895935059, + "learning_rate": 8.339790540078034e-05, + "loss": 0.8424, + "step": 12878 + }, + { + "epoch": 0.8726200962124805, + "grad_norm": 8.020045280456543, + "learning_rate": 8.339653638168253e-05, + "loss": 0.7412, + "step": 12879 + }, + { + "epoch": 0.8726878514804526, + "grad_norm": 7.37377405166626, + "learning_rate": 8.339516736258471e-05, + "loss": 0.6915, + "step": 12880 + }, + { + "epoch": 0.8727556067484247, + "grad_norm": 4.970482349395752, + "learning_rate": 8.339379834348689e-05, + "loss": 0.6638, + "step": 12881 + }, + { + "epoch": 0.8728233620163968, + "grad_norm": 5.549528121948242, + "learning_rate": 8.339242932438907e-05, + "loss": 0.6753, + "step": 12882 + }, + { + "epoch": 0.8728911172843689, + "grad_norm": 6.604176044464111, + "learning_rate": 8.339106030529125e-05, + "loss": 0.8787, + "step": 12883 + }, + { + "epoch": 0.872958872552341, + "grad_norm": 5.274704933166504, + "learning_rate": 8.338969128619345e-05, + "loss": 0.7042, + "step": 12884 + }, + { + "epoch": 0.873026627820313, + "grad_norm": 6.73942232131958, + "learning_rate": 8.338832226709563e-05, + "loss": 0.9337, + "step": 12885 + }, + { + "epoch": 0.8730943830882851, + "grad_norm": 5.404533863067627, + "learning_rate": 8.338695324799781e-05, + "loss": 0.8108, + "step": 12886 + }, + { + "epoch": 0.8731621383562572, + "grad_norm": 5.6985087394714355, + "learning_rate": 8.338558422889999e-05, + "loss": 0.8906, + "step": 12887 + }, + { + "epoch": 0.8732298936242293, + "grad_norm": 6.844064235687256, + "learning_rate": 8.338421520980218e-05, + "loss": 0.8528, + "step": 12888 + }, + { + "epoch": 0.8732976488922014, + "grad_norm": 7.021403789520264, + "learning_rate": 8.338284619070436e-05, + "loss": 0.8084, + "step": 12889 + }, + { + "epoch": 0.8733654041601735, + "grad_norm": 7.886348247528076, + "learning_rate": 8.338147717160654e-05, + "loss": 0.6421, + "step": 12890 + }, + { + "epoch": 0.8734331594281456, + "grad_norm": 6.461071968078613, + "learning_rate": 8.338010815250872e-05, + "loss": 0.7556, + "step": 12891 + }, + { + "epoch": 0.8735009146961176, + "grad_norm": 5.232357501983643, + "learning_rate": 8.337873913341092e-05, + "loss": 0.7246, + "step": 12892 + }, + { + "epoch": 0.8735686699640897, + "grad_norm": 4.585625648498535, + "learning_rate": 8.33773701143131e-05, + "loss": 0.6826, + "step": 12893 + }, + { + "epoch": 0.8736364252320618, + "grad_norm": 5.381401538848877, + "learning_rate": 8.337600109521528e-05, + "loss": 0.8486, + "step": 12894 + }, + { + "epoch": 0.8737041805000338, + "grad_norm": 6.673171043395996, + "learning_rate": 8.337463207611747e-05, + "loss": 0.6716, + "step": 12895 + }, + { + "epoch": 0.8737719357680059, + "grad_norm": 9.136744499206543, + "learning_rate": 8.337326305701965e-05, + "loss": 1.0048, + "step": 12896 + }, + { + "epoch": 0.873839691035978, + "grad_norm": 7.344038963317871, + "learning_rate": 8.337189403792183e-05, + "loss": 1.0041, + "step": 12897 + }, + { + "epoch": 0.8739074463039501, + "grad_norm": 5.5072832107543945, + "learning_rate": 8.337052501882403e-05, + "loss": 0.6608, + "step": 12898 + }, + { + "epoch": 0.8739752015719222, + "grad_norm": 6.74569845199585, + "learning_rate": 8.33691559997262e-05, + "loss": 0.9765, + "step": 12899 + }, + { + "epoch": 0.8740429568398943, + "grad_norm": 6.700042247772217, + "learning_rate": 8.336778698062839e-05, + "loss": 0.8127, + "step": 12900 + }, + { + "epoch": 0.8741107121078664, + "grad_norm": 5.775038242340088, + "learning_rate": 8.336641796153057e-05, + "loss": 0.8717, + "step": 12901 + }, + { + "epoch": 0.8741784673758385, + "grad_norm": 6.800004005432129, + "learning_rate": 8.336504894243276e-05, + "loss": 0.9006, + "step": 12902 + }, + { + "epoch": 0.8742462226438106, + "grad_norm": 6.40112829208374, + "learning_rate": 8.336367992333494e-05, + "loss": 1.0521, + "step": 12903 + }, + { + "epoch": 0.8743139779117827, + "grad_norm": 8.33122444152832, + "learning_rate": 8.336231090423712e-05, + "loss": 0.8228, + "step": 12904 + }, + { + "epoch": 0.8743817331797548, + "grad_norm": 6.429588317871094, + "learning_rate": 8.33609418851393e-05, + "loss": 0.8859, + "step": 12905 + }, + { + "epoch": 0.8744494884477269, + "grad_norm": 6.242363452911377, + "learning_rate": 8.335957286604148e-05, + "loss": 0.7872, + "step": 12906 + }, + { + "epoch": 0.8745172437156989, + "grad_norm": 4.990901470184326, + "learning_rate": 8.335820384694368e-05, + "loss": 0.6444, + "step": 12907 + }, + { + "epoch": 0.874584998983671, + "grad_norm": 5.876582145690918, + "learning_rate": 8.335683482784586e-05, + "loss": 1.0041, + "step": 12908 + }, + { + "epoch": 0.874652754251643, + "grad_norm": 5.543231964111328, + "learning_rate": 8.335546580874804e-05, + "loss": 0.658, + "step": 12909 + }, + { + "epoch": 0.8747205095196151, + "grad_norm": 6.464064598083496, + "learning_rate": 8.335409678965022e-05, + "loss": 0.642, + "step": 12910 + }, + { + "epoch": 0.8747882647875872, + "grad_norm": 6.268253803253174, + "learning_rate": 8.335272777055241e-05, + "loss": 0.9617, + "step": 12911 + }, + { + "epoch": 0.8748560200555593, + "grad_norm": 6.467350482940674, + "learning_rate": 8.335135875145459e-05, + "loss": 0.7592, + "step": 12912 + }, + { + "epoch": 0.8749237753235314, + "grad_norm": 5.470207214355469, + "learning_rate": 8.334998973235677e-05, + "loss": 0.7742, + "step": 12913 + }, + { + "epoch": 0.8749915305915035, + "grad_norm": 6.371903419494629, + "learning_rate": 8.334862071325895e-05, + "loss": 0.6947, + "step": 12914 + }, + { + "epoch": 0.8750592858594756, + "grad_norm": 4.678818702697754, + "learning_rate": 8.334725169416113e-05, + "loss": 0.6938, + "step": 12915 + }, + { + "epoch": 0.8751270411274477, + "grad_norm": 9.119450569152832, + "learning_rate": 8.334588267506333e-05, + "loss": 1.0259, + "step": 12916 + }, + { + "epoch": 0.8751947963954197, + "grad_norm": 5.258639335632324, + "learning_rate": 8.33445136559655e-05, + "loss": 0.9058, + "step": 12917 + }, + { + "epoch": 0.8752625516633918, + "grad_norm": 6.689220905303955, + "learning_rate": 8.334314463686769e-05, + "loss": 0.9682, + "step": 12918 + }, + { + "epoch": 0.8753303069313639, + "grad_norm": 5.658355712890625, + "learning_rate": 8.334177561776987e-05, + "loss": 0.7693, + "step": 12919 + }, + { + "epoch": 0.875398062199336, + "grad_norm": 5.163204193115234, + "learning_rate": 8.334040659867206e-05, + "loss": 0.7801, + "step": 12920 + }, + { + "epoch": 0.8754658174673081, + "grad_norm": 5.308339595794678, + "learning_rate": 8.333903757957424e-05, + "loss": 0.9244, + "step": 12921 + }, + { + "epoch": 0.8755335727352802, + "grad_norm": 6.663207054138184, + "learning_rate": 8.333766856047642e-05, + "loss": 0.677, + "step": 12922 + }, + { + "epoch": 0.8756013280032523, + "grad_norm": 5.615358829498291, + "learning_rate": 8.33362995413786e-05, + "loss": 0.7344, + "step": 12923 + }, + { + "epoch": 0.8756690832712244, + "grad_norm": 6.694277763366699, + "learning_rate": 8.333493052228078e-05, + "loss": 0.8418, + "step": 12924 + }, + { + "epoch": 0.8757368385391964, + "grad_norm": 7.067589282989502, + "learning_rate": 8.333356150318298e-05, + "loss": 0.8682, + "step": 12925 + }, + { + "epoch": 0.8758045938071685, + "grad_norm": 6.071625709533691, + "learning_rate": 8.333219248408516e-05, + "loss": 1.0198, + "step": 12926 + }, + { + "epoch": 0.8758723490751406, + "grad_norm": 5.014382362365723, + "learning_rate": 8.333082346498734e-05, + "loss": 0.8123, + "step": 12927 + }, + { + "epoch": 0.8759401043431126, + "grad_norm": 7.893514156341553, + "learning_rate": 8.332945444588952e-05, + "loss": 0.7958, + "step": 12928 + }, + { + "epoch": 0.8760078596110847, + "grad_norm": 9.142471313476562, + "learning_rate": 8.332808542679171e-05, + "loss": 0.9072, + "step": 12929 + }, + { + "epoch": 0.8760756148790568, + "grad_norm": 4.87850284576416, + "learning_rate": 8.332671640769389e-05, + "loss": 0.6293, + "step": 12930 + }, + { + "epoch": 0.8761433701470289, + "grad_norm": 6.533822536468506, + "learning_rate": 8.332534738859607e-05, + "loss": 0.6865, + "step": 12931 + }, + { + "epoch": 0.876211125415001, + "grad_norm": 6.230561256408691, + "learning_rate": 8.332397836949825e-05, + "loss": 0.8655, + "step": 12932 + }, + { + "epoch": 0.8762788806829731, + "grad_norm": 5.474959373474121, + "learning_rate": 8.332260935040043e-05, + "loss": 0.7077, + "step": 12933 + }, + { + "epoch": 0.8763466359509452, + "grad_norm": 6.593677043914795, + "learning_rate": 8.332124033130263e-05, + "loss": 0.8323, + "step": 12934 + }, + { + "epoch": 0.8764143912189173, + "grad_norm": 6.031239986419678, + "learning_rate": 8.33198713122048e-05, + "loss": 0.8379, + "step": 12935 + }, + { + "epoch": 0.8764821464868894, + "grad_norm": 5.653693199157715, + "learning_rate": 8.331850229310699e-05, + "loss": 0.6913, + "step": 12936 + }, + { + "epoch": 0.8765499017548615, + "grad_norm": 4.6549296379089355, + "learning_rate": 8.331713327400917e-05, + "loss": 0.7085, + "step": 12937 + }, + { + "epoch": 0.8766176570228336, + "grad_norm": 7.875330448150635, + "learning_rate": 8.331576425491136e-05, + "loss": 0.7635, + "step": 12938 + }, + { + "epoch": 0.8766854122908057, + "grad_norm": 6.341701030731201, + "learning_rate": 8.331439523581354e-05, + "loss": 1.2747, + "step": 12939 + }, + { + "epoch": 0.8767531675587777, + "grad_norm": 5.573342323303223, + "learning_rate": 8.331302621671572e-05, + "loss": 0.8176, + "step": 12940 + }, + { + "epoch": 0.8768209228267497, + "grad_norm": 6.964317321777344, + "learning_rate": 8.331165719761792e-05, + "loss": 0.8216, + "step": 12941 + }, + { + "epoch": 0.8768886780947218, + "grad_norm": 5.422325611114502, + "learning_rate": 8.33102881785201e-05, + "loss": 0.6236, + "step": 12942 + }, + { + "epoch": 0.8769564333626939, + "grad_norm": 6.422939300537109, + "learning_rate": 8.330891915942228e-05, + "loss": 1.0188, + "step": 12943 + }, + { + "epoch": 0.877024188630666, + "grad_norm": 5.843238830566406, + "learning_rate": 8.330755014032447e-05, + "loss": 0.9579, + "step": 12944 + }, + { + "epoch": 0.8770919438986381, + "grad_norm": 5.675114631652832, + "learning_rate": 8.330618112122665e-05, + "loss": 0.7776, + "step": 12945 + }, + { + "epoch": 0.8771596991666102, + "grad_norm": 6.512584686279297, + "learning_rate": 8.330481210212883e-05, + "loss": 0.8351, + "step": 12946 + }, + { + "epoch": 0.8772274544345823, + "grad_norm": 7.719078063964844, + "learning_rate": 8.330344308303101e-05, + "loss": 1.0703, + "step": 12947 + }, + { + "epoch": 0.8772952097025544, + "grad_norm": 6.948828220367432, + "learning_rate": 8.33020740639332e-05, + "loss": 0.843, + "step": 12948 + }, + { + "epoch": 0.8773629649705265, + "grad_norm": 5.105550289154053, + "learning_rate": 8.330070504483539e-05, + "loss": 0.7685, + "step": 12949 + }, + { + "epoch": 0.8774307202384986, + "grad_norm": 4.994384288787842, + "learning_rate": 8.329933602573757e-05, + "loss": 0.6291, + "step": 12950 + }, + { + "epoch": 0.8774984755064706, + "grad_norm": 5.846680164337158, + "learning_rate": 8.329796700663975e-05, + "loss": 0.7701, + "step": 12951 + }, + { + "epoch": 0.8775662307744427, + "grad_norm": 8.621549606323242, + "learning_rate": 8.329659798754194e-05, + "loss": 0.7899, + "step": 12952 + }, + { + "epoch": 0.8776339860424148, + "grad_norm": 5.434492588043213, + "learning_rate": 8.329522896844412e-05, + "loss": 0.7942, + "step": 12953 + }, + { + "epoch": 0.8777017413103869, + "grad_norm": 8.437414169311523, + "learning_rate": 8.32938599493463e-05, + "loss": 0.8845, + "step": 12954 + }, + { + "epoch": 0.877769496578359, + "grad_norm": 5.602294921875, + "learning_rate": 8.329249093024848e-05, + "loss": 0.7475, + "step": 12955 + }, + { + "epoch": 0.8778372518463311, + "grad_norm": 6.249868392944336, + "learning_rate": 8.329112191115066e-05, + "loss": 0.6117, + "step": 12956 + }, + { + "epoch": 0.8779050071143031, + "grad_norm": 6.123753547668457, + "learning_rate": 8.328975289205286e-05, + "loss": 0.996, + "step": 12957 + }, + { + "epoch": 0.8779727623822752, + "grad_norm": 6.797160625457764, + "learning_rate": 8.328838387295504e-05, + "loss": 0.8562, + "step": 12958 + }, + { + "epoch": 0.8780405176502473, + "grad_norm": 6.009333610534668, + "learning_rate": 8.328701485385722e-05, + "loss": 0.7396, + "step": 12959 + }, + { + "epoch": 0.8781082729182194, + "grad_norm": 6.817856788635254, + "learning_rate": 8.32856458347594e-05, + "loss": 0.9193, + "step": 12960 + }, + { + "epoch": 0.8781760281861914, + "grad_norm": 6.004026412963867, + "learning_rate": 8.328427681566158e-05, + "loss": 0.7822, + "step": 12961 + }, + { + "epoch": 0.8782437834541635, + "grad_norm": 5.585984230041504, + "learning_rate": 8.328290779656377e-05, + "loss": 0.6767, + "step": 12962 + }, + { + "epoch": 0.8783115387221356, + "grad_norm": 9.209588050842285, + "learning_rate": 8.328153877746595e-05, + "loss": 0.9714, + "step": 12963 + }, + { + "epoch": 0.8783792939901077, + "grad_norm": 6.652541160583496, + "learning_rate": 8.328016975836813e-05, + "loss": 0.7528, + "step": 12964 + }, + { + "epoch": 0.8784470492580798, + "grad_norm": 6.689975261688232, + "learning_rate": 8.327880073927031e-05, + "loss": 0.7516, + "step": 12965 + }, + { + "epoch": 0.8785148045260519, + "grad_norm": 5.454050540924072, + "learning_rate": 8.32774317201725e-05, + "loss": 0.7394, + "step": 12966 + }, + { + "epoch": 0.878582559794024, + "grad_norm": 4.421219348907471, + "learning_rate": 8.327606270107469e-05, + "loss": 0.7704, + "step": 12967 + }, + { + "epoch": 0.8786503150619961, + "grad_norm": 4.957309246063232, + "learning_rate": 8.327469368197687e-05, + "loss": 0.7643, + "step": 12968 + }, + { + "epoch": 0.8787180703299682, + "grad_norm": 5.908604621887207, + "learning_rate": 8.327332466287905e-05, + "loss": 0.9247, + "step": 12969 + }, + { + "epoch": 0.8787858255979403, + "grad_norm": 7.434008598327637, + "learning_rate": 8.327195564378123e-05, + "loss": 0.9366, + "step": 12970 + }, + { + "epoch": 0.8788535808659124, + "grad_norm": 7.337964057922363, + "learning_rate": 8.327058662468342e-05, + "loss": 0.94, + "step": 12971 + }, + { + "epoch": 0.8789213361338845, + "grad_norm": 6.679739475250244, + "learning_rate": 8.32692176055856e-05, + "loss": 0.9922, + "step": 12972 + }, + { + "epoch": 0.8789890914018565, + "grad_norm": 5.989971160888672, + "learning_rate": 8.326784858648778e-05, + "loss": 0.8581, + "step": 12973 + }, + { + "epoch": 0.8790568466698285, + "grad_norm": 4.795865058898926, + "learning_rate": 8.326647956738996e-05, + "loss": 0.8417, + "step": 12974 + }, + { + "epoch": 0.8791246019378006, + "grad_norm": 6.574687957763672, + "learning_rate": 8.326511054829216e-05, + "loss": 0.9009, + "step": 12975 + }, + { + "epoch": 0.8791923572057727, + "grad_norm": 5.56545352935791, + "learning_rate": 8.326374152919434e-05, + "loss": 0.7627, + "step": 12976 + }, + { + "epoch": 0.8792601124737448, + "grad_norm": 7.365011215209961, + "learning_rate": 8.326237251009652e-05, + "loss": 1.0212, + "step": 12977 + }, + { + "epoch": 0.8793278677417169, + "grad_norm": 5.9187140464782715, + "learning_rate": 8.32610034909987e-05, + "loss": 0.804, + "step": 12978 + }, + { + "epoch": 0.879395623009689, + "grad_norm": 5.450742721557617, + "learning_rate": 8.325963447190088e-05, + "loss": 0.7946, + "step": 12979 + }, + { + "epoch": 0.8794633782776611, + "grad_norm": 5.892670154571533, + "learning_rate": 8.325826545280307e-05, + "loss": 0.7495, + "step": 12980 + }, + { + "epoch": 0.8795311335456332, + "grad_norm": 5.977981090545654, + "learning_rate": 8.325689643370525e-05, + "loss": 0.789, + "step": 12981 + }, + { + "epoch": 0.8795988888136053, + "grad_norm": 6.273918628692627, + "learning_rate": 8.325552741460743e-05, + "loss": 1.0261, + "step": 12982 + }, + { + "epoch": 0.8796666440815774, + "grad_norm": 5.750874996185303, + "learning_rate": 8.325415839550961e-05, + "loss": 0.8008, + "step": 12983 + }, + { + "epoch": 0.8797343993495494, + "grad_norm": 6.168051242828369, + "learning_rate": 8.32527893764118e-05, + "loss": 0.7802, + "step": 12984 + }, + { + "epoch": 0.8798021546175215, + "grad_norm": 6.125532627105713, + "learning_rate": 8.325142035731399e-05, + "loss": 0.8281, + "step": 12985 + }, + { + "epoch": 0.8798699098854936, + "grad_norm": 5.973419666290283, + "learning_rate": 8.325005133821617e-05, + "loss": 0.8108, + "step": 12986 + }, + { + "epoch": 0.8799376651534657, + "grad_norm": 5.780817031860352, + "learning_rate": 8.324868231911836e-05, + "loss": 0.6535, + "step": 12987 + }, + { + "epoch": 0.8800054204214378, + "grad_norm": 5.342846870422363, + "learning_rate": 8.324731330002054e-05, + "loss": 0.6626, + "step": 12988 + }, + { + "epoch": 0.8800731756894099, + "grad_norm": 5.165645599365234, + "learning_rate": 8.324594428092272e-05, + "loss": 0.8454, + "step": 12989 + }, + { + "epoch": 0.8801409309573819, + "grad_norm": 7.062921524047852, + "learning_rate": 8.324457526182492e-05, + "loss": 0.6814, + "step": 12990 + }, + { + "epoch": 0.880208686225354, + "grad_norm": 8.127230644226074, + "learning_rate": 8.32432062427271e-05, + "loss": 0.9044, + "step": 12991 + }, + { + "epoch": 0.8802764414933261, + "grad_norm": 7.321716785430908, + "learning_rate": 8.324183722362928e-05, + "loss": 0.8596, + "step": 12992 + }, + { + "epoch": 0.8803441967612982, + "grad_norm": 6.0520806312561035, + "learning_rate": 8.324046820453146e-05, + "loss": 0.7636, + "step": 12993 + }, + { + "epoch": 0.8804119520292703, + "grad_norm": 4.8115973472595215, + "learning_rate": 8.323909918543365e-05, + "loss": 0.5694, + "step": 12994 + }, + { + "epoch": 0.8804797072972423, + "grad_norm": 7.053106784820557, + "learning_rate": 8.323773016633583e-05, + "loss": 0.5812, + "step": 12995 + }, + { + "epoch": 0.8805474625652144, + "grad_norm": 5.414585590362549, + "learning_rate": 8.323636114723801e-05, + "loss": 0.6227, + "step": 12996 + }, + { + "epoch": 0.8806152178331865, + "grad_norm": 6.686954498291016, + "learning_rate": 8.323499212814019e-05, + "loss": 0.8822, + "step": 12997 + }, + { + "epoch": 0.8806829731011586, + "grad_norm": 5.530512809753418, + "learning_rate": 8.323362310904239e-05, + "loss": 0.745, + "step": 12998 + }, + { + "epoch": 0.8807507283691307, + "grad_norm": 7.229578495025635, + "learning_rate": 8.323225408994457e-05, + "loss": 0.7521, + "step": 12999 + }, + { + "epoch": 0.8808184836371028, + "grad_norm": 5.444945335388184, + "learning_rate": 8.323088507084675e-05, + "loss": 0.741, + "step": 13000 + }, + { + "epoch": 0.8808862389050749, + "grad_norm": 5.646410942077637, + "learning_rate": 8.322951605174893e-05, + "loss": 0.6562, + "step": 13001 + }, + { + "epoch": 0.880953994173047, + "grad_norm": 6.224180698394775, + "learning_rate": 8.32281470326511e-05, + "loss": 0.6087, + "step": 13002 + }, + { + "epoch": 0.8810217494410191, + "grad_norm": 5.3466997146606445, + "learning_rate": 8.32267780135533e-05, + "loss": 0.7514, + "step": 13003 + }, + { + "epoch": 0.8810895047089912, + "grad_norm": 8.124218940734863, + "learning_rate": 8.322540899445548e-05, + "loss": 0.8344, + "step": 13004 + }, + { + "epoch": 0.8811572599769633, + "grad_norm": 5.5622172355651855, + "learning_rate": 8.322403997535766e-05, + "loss": 0.8112, + "step": 13005 + }, + { + "epoch": 0.8812250152449352, + "grad_norm": 6.751789093017578, + "learning_rate": 8.322267095625984e-05, + "loss": 0.9156, + "step": 13006 + }, + { + "epoch": 0.8812927705129073, + "grad_norm": 6.383172512054443, + "learning_rate": 8.322130193716204e-05, + "loss": 0.8049, + "step": 13007 + }, + { + "epoch": 0.8813605257808794, + "grad_norm": 5.960168838500977, + "learning_rate": 8.321993291806422e-05, + "loss": 0.9027, + "step": 13008 + }, + { + "epoch": 0.8814282810488515, + "grad_norm": 5.624762535095215, + "learning_rate": 8.32185638989664e-05, + "loss": 0.6223, + "step": 13009 + }, + { + "epoch": 0.8814960363168236, + "grad_norm": 5.830256938934326, + "learning_rate": 8.321719487986858e-05, + "loss": 0.9301, + "step": 13010 + }, + { + "epoch": 0.8815637915847957, + "grad_norm": 11.064977645874023, + "learning_rate": 8.321582586077076e-05, + "loss": 0.8125, + "step": 13011 + }, + { + "epoch": 0.8816315468527678, + "grad_norm": 5.739172458648682, + "learning_rate": 8.321445684167295e-05, + "loss": 0.8507, + "step": 13012 + }, + { + "epoch": 0.8816993021207399, + "grad_norm": 5.400548458099365, + "learning_rate": 8.321308782257513e-05, + "loss": 0.7381, + "step": 13013 + }, + { + "epoch": 0.881767057388712, + "grad_norm": 5.973668098449707, + "learning_rate": 8.321171880347731e-05, + "loss": 0.7881, + "step": 13014 + }, + { + "epoch": 0.8818348126566841, + "grad_norm": 7.544906139373779, + "learning_rate": 8.321034978437949e-05, + "loss": 0.7593, + "step": 13015 + }, + { + "epoch": 0.8819025679246562, + "grad_norm": 5.305513381958008, + "learning_rate": 8.320898076528167e-05, + "loss": 0.7552, + "step": 13016 + }, + { + "epoch": 0.8819703231926282, + "grad_norm": 5.9566545486450195, + "learning_rate": 8.320761174618387e-05, + "loss": 0.8524, + "step": 13017 + }, + { + "epoch": 0.8820380784606003, + "grad_norm": 5.806572437286377, + "learning_rate": 8.320624272708605e-05, + "loss": 0.6832, + "step": 13018 + }, + { + "epoch": 0.8821058337285724, + "grad_norm": 5.694754600524902, + "learning_rate": 8.320487370798823e-05, + "loss": 0.8726, + "step": 13019 + }, + { + "epoch": 0.8821735889965445, + "grad_norm": 6.508894920349121, + "learning_rate": 8.32035046888904e-05, + "loss": 0.8468, + "step": 13020 + }, + { + "epoch": 0.8822413442645166, + "grad_norm": 5.450093746185303, + "learning_rate": 8.32021356697926e-05, + "loss": 0.715, + "step": 13021 + }, + { + "epoch": 0.8823090995324887, + "grad_norm": 7.250792026519775, + "learning_rate": 8.320076665069478e-05, + "loss": 0.8587, + "step": 13022 + }, + { + "epoch": 0.8823768548004607, + "grad_norm": 7.200087070465088, + "learning_rate": 8.319939763159696e-05, + "loss": 0.7188, + "step": 13023 + }, + { + "epoch": 0.8824446100684328, + "grad_norm": 7.003962993621826, + "learning_rate": 8.319802861249914e-05, + "loss": 0.7586, + "step": 13024 + }, + { + "epoch": 0.8825123653364049, + "grad_norm": 5.545292854309082, + "learning_rate": 8.319665959340132e-05, + "loss": 0.7623, + "step": 13025 + }, + { + "epoch": 0.882580120604377, + "grad_norm": 7.445784091949463, + "learning_rate": 8.319529057430352e-05, + "loss": 1.0729, + "step": 13026 + }, + { + "epoch": 0.882647875872349, + "grad_norm": 7.989095211029053, + "learning_rate": 8.31939215552057e-05, + "loss": 0.8033, + "step": 13027 + }, + { + "epoch": 0.8827156311403211, + "grad_norm": 5.488166809082031, + "learning_rate": 8.319255253610788e-05, + "loss": 0.7301, + "step": 13028 + }, + { + "epoch": 0.8827833864082932, + "grad_norm": 7.751863956451416, + "learning_rate": 8.319118351701006e-05, + "loss": 0.5935, + "step": 13029 + }, + { + "epoch": 0.8828511416762653, + "grad_norm": 7.284234046936035, + "learning_rate": 8.318981449791225e-05, + "loss": 0.8942, + "step": 13030 + }, + { + "epoch": 0.8829188969442374, + "grad_norm": 6.073469638824463, + "learning_rate": 8.318844547881443e-05, + "loss": 0.7782, + "step": 13031 + }, + { + "epoch": 0.8829866522122095, + "grad_norm": 5.341745853424072, + "learning_rate": 8.318707645971661e-05, + "loss": 0.8347, + "step": 13032 + }, + { + "epoch": 0.8830544074801816, + "grad_norm": 5.9635820388793945, + "learning_rate": 8.31857074406188e-05, + "loss": 0.9042, + "step": 13033 + }, + { + "epoch": 0.8831221627481537, + "grad_norm": 5.801054954528809, + "learning_rate": 8.318433842152099e-05, + "loss": 0.7764, + "step": 13034 + }, + { + "epoch": 0.8831899180161258, + "grad_norm": 7.042034149169922, + "learning_rate": 8.318296940242317e-05, + "loss": 1.0679, + "step": 13035 + }, + { + "epoch": 0.8832576732840979, + "grad_norm": 4.915299892425537, + "learning_rate": 8.318160038332536e-05, + "loss": 0.7412, + "step": 13036 + }, + { + "epoch": 0.88332542855207, + "grad_norm": 9.298592567443848, + "learning_rate": 8.318023136422754e-05, + "loss": 0.8991, + "step": 13037 + }, + { + "epoch": 0.8833931838200421, + "grad_norm": 4.614468097686768, + "learning_rate": 8.317886234512972e-05, + "loss": 0.6786, + "step": 13038 + }, + { + "epoch": 0.883460939088014, + "grad_norm": 5.394043445587158, + "learning_rate": 8.31774933260319e-05, + "loss": 0.7442, + "step": 13039 + }, + { + "epoch": 0.8835286943559861, + "grad_norm": 6.138361930847168, + "learning_rate": 8.31761243069341e-05, + "loss": 0.5959, + "step": 13040 + }, + { + "epoch": 0.8835964496239582, + "grad_norm": 6.376340866088867, + "learning_rate": 8.317475528783628e-05, + "loss": 1.0073, + "step": 13041 + }, + { + "epoch": 0.8836642048919303, + "grad_norm": 4.798174858093262, + "learning_rate": 8.317338626873846e-05, + "loss": 0.6251, + "step": 13042 + }, + { + "epoch": 0.8837319601599024, + "grad_norm": 6.462924957275391, + "learning_rate": 8.317201724964064e-05, + "loss": 0.9081, + "step": 13043 + }, + { + "epoch": 0.8837997154278745, + "grad_norm": 5.447483062744141, + "learning_rate": 8.317064823054283e-05, + "loss": 0.8058, + "step": 13044 + }, + { + "epoch": 0.8838674706958466, + "grad_norm": 6.656740188598633, + "learning_rate": 8.316927921144501e-05, + "loss": 0.8681, + "step": 13045 + }, + { + "epoch": 0.8839352259638187, + "grad_norm": 5.062714099884033, + "learning_rate": 8.316791019234719e-05, + "loss": 0.5941, + "step": 13046 + }, + { + "epoch": 0.8840029812317908, + "grad_norm": 6.954104900360107, + "learning_rate": 8.316654117324937e-05, + "loss": 0.9577, + "step": 13047 + }, + { + "epoch": 0.8840707364997629, + "grad_norm": 5.395656585693359, + "learning_rate": 8.316517215415155e-05, + "loss": 0.6636, + "step": 13048 + }, + { + "epoch": 0.884138491767735, + "grad_norm": 7.5406270027160645, + "learning_rate": 8.316380313505374e-05, + "loss": 0.8675, + "step": 13049 + }, + { + "epoch": 0.884206247035707, + "grad_norm": 4.966914653778076, + "learning_rate": 8.316243411595593e-05, + "loss": 0.7242, + "step": 13050 + }, + { + "epoch": 0.8842740023036791, + "grad_norm": 6.133572101593018, + "learning_rate": 8.31610650968581e-05, + "loss": 0.8462, + "step": 13051 + }, + { + "epoch": 0.8843417575716512, + "grad_norm": 4.549497127532959, + "learning_rate": 8.315969607776029e-05, + "loss": 0.7792, + "step": 13052 + }, + { + "epoch": 0.8844095128396233, + "grad_norm": 5.715592861175537, + "learning_rate": 8.315832705866248e-05, + "loss": 0.6261, + "step": 13053 + }, + { + "epoch": 0.8844772681075954, + "grad_norm": 7.551342010498047, + "learning_rate": 8.315695803956466e-05, + "loss": 0.8017, + "step": 13054 + }, + { + "epoch": 0.8845450233755674, + "grad_norm": 5.671367645263672, + "learning_rate": 8.315558902046684e-05, + "loss": 0.8092, + "step": 13055 + }, + { + "epoch": 0.8846127786435395, + "grad_norm": 5.61579704284668, + "learning_rate": 8.315422000136902e-05, + "loss": 0.7854, + "step": 13056 + }, + { + "epoch": 0.8846805339115116, + "grad_norm": 8.364435195922852, + "learning_rate": 8.31528509822712e-05, + "loss": 0.7893, + "step": 13057 + }, + { + "epoch": 0.8847482891794837, + "grad_norm": 7.481292724609375, + "learning_rate": 8.31514819631734e-05, + "loss": 0.7083, + "step": 13058 + }, + { + "epoch": 0.8848160444474558, + "grad_norm": 6.476237773895264, + "learning_rate": 8.315011294407558e-05, + "loss": 0.7749, + "step": 13059 + }, + { + "epoch": 0.8848837997154279, + "grad_norm": 6.060905456542969, + "learning_rate": 8.314874392497776e-05, + "loss": 0.8042, + "step": 13060 + }, + { + "epoch": 0.8849515549834, + "grad_norm": 5.672494888305664, + "learning_rate": 8.314737490587994e-05, + "loss": 0.5887, + "step": 13061 + }, + { + "epoch": 0.885019310251372, + "grad_norm": 8.22986888885498, + "learning_rate": 8.314600588678213e-05, + "loss": 1.1182, + "step": 13062 + }, + { + "epoch": 0.8850870655193441, + "grad_norm": 8.389451026916504, + "learning_rate": 8.314463686768431e-05, + "loss": 1.022, + "step": 13063 + }, + { + "epoch": 0.8851548207873162, + "grad_norm": 5.47923469543457, + "learning_rate": 8.314326784858649e-05, + "loss": 0.7902, + "step": 13064 + }, + { + "epoch": 0.8852225760552883, + "grad_norm": 5.775954723358154, + "learning_rate": 8.314189882948867e-05, + "loss": 0.8831, + "step": 13065 + }, + { + "epoch": 0.8852903313232604, + "grad_norm": 6.233015060424805, + "learning_rate": 8.314052981039085e-05, + "loss": 0.7723, + "step": 13066 + }, + { + "epoch": 0.8853580865912325, + "grad_norm": 6.433823585510254, + "learning_rate": 8.313916079129305e-05, + "loss": 0.7764, + "step": 13067 + }, + { + "epoch": 0.8854258418592046, + "grad_norm": 8.425026893615723, + "learning_rate": 8.313779177219523e-05, + "loss": 0.9493, + "step": 13068 + }, + { + "epoch": 0.8854935971271767, + "grad_norm": 5.355708599090576, + "learning_rate": 8.31364227530974e-05, + "loss": 0.7761, + "step": 13069 + }, + { + "epoch": 0.8855613523951488, + "grad_norm": 4.858754634857178, + "learning_rate": 8.313505373399959e-05, + "loss": 0.6928, + "step": 13070 + }, + { + "epoch": 0.8856291076631209, + "grad_norm": 6.304715633392334, + "learning_rate": 8.313368471490177e-05, + "loss": 0.7542, + "step": 13071 + }, + { + "epoch": 0.8856968629310928, + "grad_norm": 5.10394811630249, + "learning_rate": 8.313231569580396e-05, + "loss": 0.6926, + "step": 13072 + }, + { + "epoch": 0.8857646181990649, + "grad_norm": 5.681312561035156, + "learning_rate": 8.313094667670614e-05, + "loss": 0.5444, + "step": 13073 + }, + { + "epoch": 0.885832373467037, + "grad_norm": 5.030150890350342, + "learning_rate": 8.312957765760832e-05, + "loss": 0.7594, + "step": 13074 + }, + { + "epoch": 0.8859001287350091, + "grad_norm": 5.284333229064941, + "learning_rate": 8.31282086385105e-05, + "loss": 0.6214, + "step": 13075 + }, + { + "epoch": 0.8859678840029812, + "grad_norm": 6.305645942687988, + "learning_rate": 8.31268396194127e-05, + "loss": 0.5335, + "step": 13076 + }, + { + "epoch": 0.8860356392709533, + "grad_norm": 4.828126907348633, + "learning_rate": 8.312547060031488e-05, + "loss": 0.5699, + "step": 13077 + }, + { + "epoch": 0.8861033945389254, + "grad_norm": 6.154019355773926, + "learning_rate": 8.312410158121706e-05, + "loss": 0.8389, + "step": 13078 + }, + { + "epoch": 0.8861711498068975, + "grad_norm": 7.554732322692871, + "learning_rate": 8.312273256211924e-05, + "loss": 1.1726, + "step": 13079 + }, + { + "epoch": 0.8862389050748696, + "grad_norm": 7.06660270690918, + "learning_rate": 8.312136354302143e-05, + "loss": 0.7025, + "step": 13080 + }, + { + "epoch": 0.8863066603428417, + "grad_norm": 6.983936786651611, + "learning_rate": 8.311999452392361e-05, + "loss": 0.7986, + "step": 13081 + }, + { + "epoch": 0.8863744156108138, + "grad_norm": 6.566361427307129, + "learning_rate": 8.311862550482579e-05, + "loss": 0.6886, + "step": 13082 + }, + { + "epoch": 0.8864421708787859, + "grad_norm": 7.804087162017822, + "learning_rate": 8.311725648572798e-05, + "loss": 0.8087, + "step": 13083 + }, + { + "epoch": 0.8865099261467579, + "grad_norm": 7.015219211578369, + "learning_rate": 8.311588746663017e-05, + "loss": 0.7983, + "step": 13084 + }, + { + "epoch": 0.88657768141473, + "grad_norm": 7.756356239318848, + "learning_rate": 8.311451844753235e-05, + "loss": 0.865, + "step": 13085 + }, + { + "epoch": 0.8866454366827021, + "grad_norm": 4.957078456878662, + "learning_rate": 8.311314942843454e-05, + "loss": 0.7427, + "step": 13086 + }, + { + "epoch": 0.8867131919506742, + "grad_norm": 7.29202127456665, + "learning_rate": 8.311178040933672e-05, + "loss": 0.9095, + "step": 13087 + }, + { + "epoch": 0.8867809472186462, + "grad_norm": 6.850350856781006, + "learning_rate": 8.31104113902389e-05, + "loss": 0.6982, + "step": 13088 + }, + { + "epoch": 0.8868487024866183, + "grad_norm": 7.409617900848389, + "learning_rate": 8.310904237114108e-05, + "loss": 0.7422, + "step": 13089 + }, + { + "epoch": 0.8869164577545904, + "grad_norm": 6.317065715789795, + "learning_rate": 8.310767335204327e-05, + "loss": 0.7854, + "step": 13090 + }, + { + "epoch": 0.8869842130225625, + "grad_norm": 6.616084098815918, + "learning_rate": 8.310630433294545e-05, + "loss": 0.5892, + "step": 13091 + }, + { + "epoch": 0.8870519682905346, + "grad_norm": 4.691561222076416, + "learning_rate": 8.310493531384764e-05, + "loss": 0.7925, + "step": 13092 + }, + { + "epoch": 0.8871197235585067, + "grad_norm": 6.057029724121094, + "learning_rate": 8.310356629474982e-05, + "loss": 0.9739, + "step": 13093 + }, + { + "epoch": 0.8871874788264787, + "grad_norm": 6.407534122467041, + "learning_rate": 8.3102197275652e-05, + "loss": 0.8429, + "step": 13094 + }, + { + "epoch": 0.8872552340944508, + "grad_norm": 5.778253078460693, + "learning_rate": 8.310082825655419e-05, + "loss": 0.8443, + "step": 13095 + }, + { + "epoch": 0.8873229893624229, + "grad_norm": 6.153006553649902, + "learning_rate": 8.309945923745637e-05, + "loss": 0.8315, + "step": 13096 + }, + { + "epoch": 0.887390744630395, + "grad_norm": 6.307031631469727, + "learning_rate": 8.309809021835855e-05, + "loss": 1.0075, + "step": 13097 + }, + { + "epoch": 0.8874584998983671, + "grad_norm": 6.513178825378418, + "learning_rate": 8.309672119926073e-05, + "loss": 0.7945, + "step": 13098 + }, + { + "epoch": 0.8875262551663392, + "grad_norm": 7.987000942230225, + "learning_rate": 8.309535218016292e-05, + "loss": 0.7709, + "step": 13099 + }, + { + "epoch": 0.8875940104343113, + "grad_norm": 5.424191951751709, + "learning_rate": 8.30939831610651e-05, + "loss": 0.7174, + "step": 13100 + }, + { + "epoch": 0.8876617657022834, + "grad_norm": 4.814406871795654, + "learning_rate": 8.309261414196729e-05, + "loss": 0.8897, + "step": 13101 + }, + { + "epoch": 0.8877295209702555, + "grad_norm": 5.997096538543701, + "learning_rate": 8.309124512286947e-05, + "loss": 0.6153, + "step": 13102 + }, + { + "epoch": 0.8877972762382276, + "grad_norm": 7.016286373138428, + "learning_rate": 8.308987610377165e-05, + "loss": 0.9323, + "step": 13103 + }, + { + "epoch": 0.8878650315061996, + "grad_norm": 5.908369541168213, + "learning_rate": 8.308850708467384e-05, + "loss": 0.7758, + "step": 13104 + }, + { + "epoch": 0.8879327867741716, + "grad_norm": 6.3806376457214355, + "learning_rate": 8.308713806557602e-05, + "loss": 0.803, + "step": 13105 + }, + { + "epoch": 0.8880005420421437, + "grad_norm": 5.187054634094238, + "learning_rate": 8.30857690464782e-05, + "loss": 0.8509, + "step": 13106 + }, + { + "epoch": 0.8880682973101158, + "grad_norm": 5.6671905517578125, + "learning_rate": 8.308440002738038e-05, + "loss": 0.9532, + "step": 13107 + }, + { + "epoch": 0.8881360525780879, + "grad_norm": 5.64413595199585, + "learning_rate": 8.308303100828257e-05, + "loss": 0.6387, + "step": 13108 + }, + { + "epoch": 0.88820380784606, + "grad_norm": 7.69677209854126, + "learning_rate": 8.308166198918476e-05, + "loss": 0.9477, + "step": 13109 + }, + { + "epoch": 0.8882715631140321, + "grad_norm": 6.669020652770996, + "learning_rate": 8.308029297008694e-05, + "loss": 0.7831, + "step": 13110 + }, + { + "epoch": 0.8883393183820042, + "grad_norm": 8.060406684875488, + "learning_rate": 8.307892395098912e-05, + "loss": 0.7206, + "step": 13111 + }, + { + "epoch": 0.8884070736499763, + "grad_norm": 6.262596607208252, + "learning_rate": 8.30775549318913e-05, + "loss": 0.7359, + "step": 13112 + }, + { + "epoch": 0.8884748289179484, + "grad_norm": 7.678366661071777, + "learning_rate": 8.307618591279349e-05, + "loss": 0.7463, + "step": 13113 + }, + { + "epoch": 0.8885425841859205, + "grad_norm": 4.828142166137695, + "learning_rate": 8.307481689369567e-05, + "loss": 0.6186, + "step": 13114 + }, + { + "epoch": 0.8886103394538926, + "grad_norm": 6.147395133972168, + "learning_rate": 8.307344787459785e-05, + "loss": 0.7742, + "step": 13115 + }, + { + "epoch": 0.8886780947218647, + "grad_norm": 7.005827903747559, + "learning_rate": 8.307207885550003e-05, + "loss": 0.899, + "step": 13116 + }, + { + "epoch": 0.8887458499898367, + "grad_norm": 6.948807239532471, + "learning_rate": 8.307070983640221e-05, + "loss": 0.9572, + "step": 13117 + }, + { + "epoch": 0.8888136052578088, + "grad_norm": 5.939198970794678, + "learning_rate": 8.30693408173044e-05, + "loss": 0.8103, + "step": 13118 + }, + { + "epoch": 0.8888813605257809, + "grad_norm": 5.84893274307251, + "learning_rate": 8.306797179820659e-05, + "loss": 0.7653, + "step": 13119 + }, + { + "epoch": 0.888949115793753, + "grad_norm": 9.817476272583008, + "learning_rate": 8.306660277910877e-05, + "loss": 0.8496, + "step": 13120 + }, + { + "epoch": 0.889016871061725, + "grad_norm": 6.369333267211914, + "learning_rate": 8.306523376001095e-05, + "loss": 0.8215, + "step": 13121 + }, + { + "epoch": 0.8890846263296971, + "grad_norm": 6.099016189575195, + "learning_rate": 8.306386474091314e-05, + "loss": 0.913, + "step": 13122 + }, + { + "epoch": 0.8891523815976692, + "grad_norm": 5.678265571594238, + "learning_rate": 8.306249572181532e-05, + "loss": 0.7645, + "step": 13123 + }, + { + "epoch": 0.8892201368656413, + "grad_norm": 5.685331344604492, + "learning_rate": 8.30611267027175e-05, + "loss": 0.8976, + "step": 13124 + }, + { + "epoch": 0.8892878921336134, + "grad_norm": 6.949743747711182, + "learning_rate": 8.305975768361968e-05, + "loss": 0.708, + "step": 13125 + }, + { + "epoch": 0.8893556474015855, + "grad_norm": 6.98148250579834, + "learning_rate": 8.305838866452188e-05, + "loss": 1.0055, + "step": 13126 + }, + { + "epoch": 0.8894234026695576, + "grad_norm": 6.679849147796631, + "learning_rate": 8.305701964542406e-05, + "loss": 0.7262, + "step": 13127 + }, + { + "epoch": 0.8894911579375296, + "grad_norm": 7.759592533111572, + "learning_rate": 8.305565062632624e-05, + "loss": 0.5805, + "step": 13128 + }, + { + "epoch": 0.8895589132055017, + "grad_norm": 5.812005519866943, + "learning_rate": 8.305428160722843e-05, + "loss": 0.8662, + "step": 13129 + }, + { + "epoch": 0.8896266684734738, + "grad_norm": 4.861279487609863, + "learning_rate": 8.305291258813061e-05, + "loss": 0.5661, + "step": 13130 + }, + { + "epoch": 0.8896944237414459, + "grad_norm": 6.805129051208496, + "learning_rate": 8.305154356903279e-05, + "loss": 0.9156, + "step": 13131 + }, + { + "epoch": 0.889762179009418, + "grad_norm": 4.98897647857666, + "learning_rate": 8.305017454993498e-05, + "loss": 0.698, + "step": 13132 + }, + { + "epoch": 0.8898299342773901, + "grad_norm": 6.2775726318359375, + "learning_rate": 8.304880553083716e-05, + "loss": 0.9092, + "step": 13133 + }, + { + "epoch": 0.8898976895453622, + "grad_norm": 7.023934841156006, + "learning_rate": 8.304743651173934e-05, + "loss": 0.7898, + "step": 13134 + }, + { + "epoch": 0.8899654448133343, + "grad_norm": 6.283311367034912, + "learning_rate": 8.304606749264153e-05, + "loss": 0.7596, + "step": 13135 + }, + { + "epoch": 0.8900332000813064, + "grad_norm": 4.842438220977783, + "learning_rate": 8.304469847354372e-05, + "loss": 0.7008, + "step": 13136 + }, + { + "epoch": 0.8901009553492784, + "grad_norm": 5.7367987632751465, + "learning_rate": 8.30433294544459e-05, + "loss": 0.987, + "step": 13137 + }, + { + "epoch": 0.8901687106172504, + "grad_norm": 6.082010746002197, + "learning_rate": 8.304196043534808e-05, + "loss": 0.6194, + "step": 13138 + }, + { + "epoch": 0.8902364658852225, + "grad_norm": 4.863292217254639, + "learning_rate": 8.304059141625026e-05, + "loss": 0.689, + "step": 13139 + }, + { + "epoch": 0.8903042211531946, + "grad_norm": 7.170241832733154, + "learning_rate": 8.303922239715245e-05, + "loss": 0.8191, + "step": 13140 + }, + { + "epoch": 0.8903719764211667, + "grad_norm": 5.541210174560547, + "learning_rate": 8.303785337805463e-05, + "loss": 0.7614, + "step": 13141 + }, + { + "epoch": 0.8904397316891388, + "grad_norm": 4.93407678604126, + "learning_rate": 8.303648435895681e-05, + "loss": 0.6698, + "step": 13142 + }, + { + "epoch": 0.8905074869571109, + "grad_norm": 5.758559226989746, + "learning_rate": 8.3035115339859e-05, + "loss": 0.8883, + "step": 13143 + }, + { + "epoch": 0.890575242225083, + "grad_norm": 5.253477096557617, + "learning_rate": 8.303374632076118e-05, + "loss": 0.7487, + "step": 13144 + }, + { + "epoch": 0.8906429974930551, + "grad_norm": 7.421339511871338, + "learning_rate": 8.303237730166337e-05, + "loss": 0.7125, + "step": 13145 + }, + { + "epoch": 0.8907107527610272, + "grad_norm": 6.252211093902588, + "learning_rate": 8.303100828256555e-05, + "loss": 0.7768, + "step": 13146 + }, + { + "epoch": 0.8907785080289993, + "grad_norm": 5.624354362487793, + "learning_rate": 8.302963926346773e-05, + "loss": 0.7325, + "step": 13147 + }, + { + "epoch": 0.8908462632969714, + "grad_norm": 7.876077651977539, + "learning_rate": 8.302827024436991e-05, + "loss": 0.9607, + "step": 13148 + }, + { + "epoch": 0.8909140185649435, + "grad_norm": 7.236328601837158, + "learning_rate": 8.302690122527209e-05, + "loss": 0.9411, + "step": 13149 + }, + { + "epoch": 0.8909817738329155, + "grad_norm": 5.283116817474365, + "learning_rate": 8.302553220617428e-05, + "loss": 0.8631, + "step": 13150 + }, + { + "epoch": 0.8910495291008876, + "grad_norm": 4.565162181854248, + "learning_rate": 8.302416318707646e-05, + "loss": 0.5822, + "step": 13151 + }, + { + "epoch": 0.8911172843688597, + "grad_norm": 8.308517456054688, + "learning_rate": 8.302279416797865e-05, + "loss": 0.7722, + "step": 13152 + }, + { + "epoch": 0.8911850396368317, + "grad_norm": 5.399427890777588, + "learning_rate": 8.302142514888083e-05, + "loss": 0.7398, + "step": 13153 + }, + { + "epoch": 0.8912527949048038, + "grad_norm": 6.947094917297363, + "learning_rate": 8.302005612978302e-05, + "loss": 0.8397, + "step": 13154 + }, + { + "epoch": 0.8913205501727759, + "grad_norm": 5.054037094116211, + "learning_rate": 8.30186871106852e-05, + "loss": 0.8779, + "step": 13155 + }, + { + "epoch": 0.891388305440748, + "grad_norm": 5.343729496002197, + "learning_rate": 8.301731809158738e-05, + "loss": 0.6804, + "step": 13156 + }, + { + "epoch": 0.8914560607087201, + "grad_norm": 5.4643425941467285, + "learning_rate": 8.301594907248956e-05, + "loss": 0.764, + "step": 13157 + }, + { + "epoch": 0.8915238159766922, + "grad_norm": 5.799702167510986, + "learning_rate": 8.301458005339174e-05, + "loss": 0.8367, + "step": 13158 + }, + { + "epoch": 0.8915915712446643, + "grad_norm": 5.780505657196045, + "learning_rate": 8.301321103429393e-05, + "loss": 0.6387, + "step": 13159 + }, + { + "epoch": 0.8916593265126364, + "grad_norm": 7.076272964477539, + "learning_rate": 8.301184201519612e-05, + "loss": 0.8751, + "step": 13160 + }, + { + "epoch": 0.8917270817806084, + "grad_norm": 5.820427417755127, + "learning_rate": 8.30104729960983e-05, + "loss": 0.7296, + "step": 13161 + }, + { + "epoch": 0.8917948370485805, + "grad_norm": 6.612248420715332, + "learning_rate": 8.300910397700048e-05, + "loss": 0.9585, + "step": 13162 + }, + { + "epoch": 0.8918625923165526, + "grad_norm": 5.051817893981934, + "learning_rate": 8.300773495790267e-05, + "loss": 0.7389, + "step": 13163 + }, + { + "epoch": 0.8919303475845247, + "grad_norm": 6.8403639793396, + "learning_rate": 8.300636593880485e-05, + "loss": 0.9632, + "step": 13164 + }, + { + "epoch": 0.8919981028524968, + "grad_norm": 5.8375983238220215, + "learning_rate": 8.300499691970703e-05, + "loss": 0.7462, + "step": 13165 + }, + { + "epoch": 0.8920658581204689, + "grad_norm": 4.138559341430664, + "learning_rate": 8.300362790060921e-05, + "loss": 0.5961, + "step": 13166 + }, + { + "epoch": 0.892133613388441, + "grad_norm": 5.837608814239502, + "learning_rate": 8.300225888151139e-05, + "loss": 0.8189, + "step": 13167 + }, + { + "epoch": 0.8922013686564131, + "grad_norm": 5.9026875495910645, + "learning_rate": 8.300088986241358e-05, + "loss": 0.8394, + "step": 13168 + }, + { + "epoch": 0.8922691239243851, + "grad_norm": 5.615271091461182, + "learning_rate": 8.299952084331577e-05, + "loss": 0.7897, + "step": 13169 + }, + { + "epoch": 0.8923368791923572, + "grad_norm": 5.872547149658203, + "learning_rate": 8.299815182421795e-05, + "loss": 0.7276, + "step": 13170 + }, + { + "epoch": 0.8924046344603292, + "grad_norm": 7.943142414093018, + "learning_rate": 8.299678280512013e-05, + "loss": 0.7512, + "step": 13171 + }, + { + "epoch": 0.8924723897283013, + "grad_norm": 5.548654079437256, + "learning_rate": 8.299541378602232e-05, + "loss": 0.8011, + "step": 13172 + }, + { + "epoch": 0.8925401449962734, + "grad_norm": 5.295721054077148, + "learning_rate": 8.29940447669245e-05, + "loss": 0.7481, + "step": 13173 + }, + { + "epoch": 0.8926079002642455, + "grad_norm": 5.564395904541016, + "learning_rate": 8.299267574782668e-05, + "loss": 0.7028, + "step": 13174 + }, + { + "epoch": 0.8926756555322176, + "grad_norm": 6.351992607116699, + "learning_rate": 8.299130672872887e-05, + "loss": 0.6943, + "step": 13175 + }, + { + "epoch": 0.8927434108001897, + "grad_norm": 8.362895965576172, + "learning_rate": 8.298993770963105e-05, + "loss": 0.9287, + "step": 13176 + }, + { + "epoch": 0.8928111660681618, + "grad_norm": 6.428536415100098, + "learning_rate": 8.298856869053324e-05, + "loss": 0.8992, + "step": 13177 + }, + { + "epoch": 0.8928789213361339, + "grad_norm": 6.148324489593506, + "learning_rate": 8.298719967143543e-05, + "loss": 0.6605, + "step": 13178 + }, + { + "epoch": 0.892946676604106, + "grad_norm": 6.530026912689209, + "learning_rate": 8.298583065233761e-05, + "loss": 0.6085, + "step": 13179 + }, + { + "epoch": 0.8930144318720781, + "grad_norm": 5.863303184509277, + "learning_rate": 8.298446163323979e-05, + "loss": 0.7368, + "step": 13180 + }, + { + "epoch": 0.8930821871400502, + "grad_norm": 11.581231117248535, + "learning_rate": 8.298309261414197e-05, + "loss": 0.732, + "step": 13181 + }, + { + "epoch": 0.8931499424080223, + "grad_norm": 7.566738128662109, + "learning_rate": 8.298172359504416e-05, + "loss": 0.9834, + "step": 13182 + }, + { + "epoch": 0.8932176976759943, + "grad_norm": 7.298031330108643, + "learning_rate": 8.298035457594634e-05, + "loss": 0.9663, + "step": 13183 + }, + { + "epoch": 0.8932854529439664, + "grad_norm": 6.619592666625977, + "learning_rate": 8.297898555684852e-05, + "loss": 0.6992, + "step": 13184 + }, + { + "epoch": 0.8933532082119385, + "grad_norm": 6.211091041564941, + "learning_rate": 8.29776165377507e-05, + "loss": 0.6848, + "step": 13185 + }, + { + "epoch": 0.8934209634799105, + "grad_norm": 5.228729248046875, + "learning_rate": 8.29762475186529e-05, + "loss": 0.629, + "step": 13186 + }, + { + "epoch": 0.8934887187478826, + "grad_norm": 5.966601848602295, + "learning_rate": 8.297487849955508e-05, + "loss": 0.7837, + "step": 13187 + }, + { + "epoch": 0.8935564740158547, + "grad_norm": 5.975196838378906, + "learning_rate": 8.297350948045726e-05, + "loss": 0.8608, + "step": 13188 + }, + { + "epoch": 0.8936242292838268, + "grad_norm": 7.837411880493164, + "learning_rate": 8.297214046135944e-05, + "loss": 1.006, + "step": 13189 + }, + { + "epoch": 0.8936919845517989, + "grad_norm": 6.313420295715332, + "learning_rate": 8.297077144226162e-05, + "loss": 0.9157, + "step": 13190 + }, + { + "epoch": 0.893759739819771, + "grad_norm": 4.761128902435303, + "learning_rate": 8.296940242316381e-05, + "loss": 0.7262, + "step": 13191 + }, + { + "epoch": 0.8938274950877431, + "grad_norm": 6.320545673370361, + "learning_rate": 8.2968033404066e-05, + "loss": 0.8437, + "step": 13192 + }, + { + "epoch": 0.8938952503557152, + "grad_norm": 6.234335899353027, + "learning_rate": 8.296666438496817e-05, + "loss": 0.8268, + "step": 13193 + }, + { + "epoch": 0.8939630056236872, + "grad_norm": 6.060486316680908, + "learning_rate": 8.296529536587036e-05, + "loss": 0.6344, + "step": 13194 + }, + { + "epoch": 0.8940307608916593, + "grad_norm": 4.9132771492004395, + "learning_rate": 8.296392634677255e-05, + "loss": 0.6422, + "step": 13195 + }, + { + "epoch": 0.8940985161596314, + "grad_norm": 7.546984672546387, + "learning_rate": 8.296255732767473e-05, + "loss": 0.8527, + "step": 13196 + }, + { + "epoch": 0.8941662714276035, + "grad_norm": 5.650018215179443, + "learning_rate": 8.296118830857691e-05, + "loss": 0.7624, + "step": 13197 + }, + { + "epoch": 0.8942340266955756, + "grad_norm": 6.712080955505371, + "learning_rate": 8.295981928947909e-05, + "loss": 0.9443, + "step": 13198 + }, + { + "epoch": 0.8943017819635477, + "grad_norm": 7.756785869598389, + "learning_rate": 8.295845027038127e-05, + "loss": 0.9292, + "step": 13199 + }, + { + "epoch": 0.8943695372315198, + "grad_norm": 5.824887752532959, + "learning_rate": 8.295708125128346e-05, + "loss": 0.7814, + "step": 13200 + }, + { + "epoch": 0.8944372924994919, + "grad_norm": 6.378854751586914, + "learning_rate": 8.295571223218564e-05, + "loss": 0.7699, + "step": 13201 + }, + { + "epoch": 0.8945050477674639, + "grad_norm": 6.914346218109131, + "learning_rate": 8.295434321308782e-05, + "loss": 0.7734, + "step": 13202 + }, + { + "epoch": 0.894572803035436, + "grad_norm": 5.830096244812012, + "learning_rate": 8.295297419399e-05, + "loss": 0.762, + "step": 13203 + }, + { + "epoch": 0.894640558303408, + "grad_norm": 5.791236877441406, + "learning_rate": 8.295160517489219e-05, + "loss": 0.6882, + "step": 13204 + }, + { + "epoch": 0.8947083135713801, + "grad_norm": 5.9879326820373535, + "learning_rate": 8.295023615579438e-05, + "loss": 0.8134, + "step": 13205 + }, + { + "epoch": 0.8947760688393522, + "grad_norm": 6.294375419616699, + "learning_rate": 8.294886713669656e-05, + "loss": 0.6101, + "step": 13206 + }, + { + "epoch": 0.8948438241073243, + "grad_norm": 5.5092267990112305, + "learning_rate": 8.294749811759874e-05, + "loss": 0.8178, + "step": 13207 + }, + { + "epoch": 0.8949115793752964, + "grad_norm": 5.606123924255371, + "learning_rate": 8.294612909850092e-05, + "loss": 0.7347, + "step": 13208 + }, + { + "epoch": 0.8949793346432685, + "grad_norm": 7.427051544189453, + "learning_rate": 8.294476007940311e-05, + "loss": 0.8645, + "step": 13209 + }, + { + "epoch": 0.8950470899112406, + "grad_norm": 8.88985538482666, + "learning_rate": 8.29433910603053e-05, + "loss": 0.793, + "step": 13210 + }, + { + "epoch": 0.8951148451792127, + "grad_norm": 5.20978307723999, + "learning_rate": 8.294202204120748e-05, + "loss": 0.954, + "step": 13211 + }, + { + "epoch": 0.8951826004471848, + "grad_norm": 8.192554473876953, + "learning_rate": 8.294065302210966e-05, + "loss": 0.812, + "step": 13212 + }, + { + "epoch": 0.8952503557151569, + "grad_norm": 5.569815158843994, + "learning_rate": 8.293928400301184e-05, + "loss": 0.702, + "step": 13213 + }, + { + "epoch": 0.895318110983129, + "grad_norm": 5.993325233459473, + "learning_rate": 8.293791498391403e-05, + "loss": 0.9014, + "step": 13214 + }, + { + "epoch": 0.8953858662511011, + "grad_norm": 5.743150234222412, + "learning_rate": 8.293654596481621e-05, + "loss": 0.5031, + "step": 13215 + }, + { + "epoch": 0.8954536215190732, + "grad_norm": 6.546820163726807, + "learning_rate": 8.293517694571839e-05, + "loss": 0.8145, + "step": 13216 + }, + { + "epoch": 0.8955213767870452, + "grad_norm": 6.425495147705078, + "learning_rate": 8.293380792662057e-05, + "loss": 0.6972, + "step": 13217 + }, + { + "epoch": 0.8955891320550172, + "grad_norm": 5.768934726715088, + "learning_rate": 8.293243890752276e-05, + "loss": 0.7145, + "step": 13218 + }, + { + "epoch": 0.8956568873229893, + "grad_norm": 5.6524224281311035, + "learning_rate": 8.293106988842494e-05, + "loss": 0.8436, + "step": 13219 + }, + { + "epoch": 0.8957246425909614, + "grad_norm": 7.842732906341553, + "learning_rate": 8.292970086932713e-05, + "loss": 0.9397, + "step": 13220 + }, + { + "epoch": 0.8957923978589335, + "grad_norm": 6.612356185913086, + "learning_rate": 8.292833185022932e-05, + "loss": 0.7468, + "step": 13221 + }, + { + "epoch": 0.8958601531269056, + "grad_norm": 6.291922569274902, + "learning_rate": 8.29269628311315e-05, + "loss": 0.7622, + "step": 13222 + }, + { + "epoch": 0.8959279083948777, + "grad_norm": 6.623983860015869, + "learning_rate": 8.292559381203368e-05, + "loss": 0.79, + "step": 13223 + }, + { + "epoch": 0.8959956636628498, + "grad_norm": 5.58621883392334, + "learning_rate": 8.292422479293587e-05, + "loss": 0.7481, + "step": 13224 + }, + { + "epoch": 0.8960634189308219, + "grad_norm": 5.839790344238281, + "learning_rate": 8.292285577383805e-05, + "loss": 0.7201, + "step": 13225 + }, + { + "epoch": 0.896131174198794, + "grad_norm": 7.310943126678467, + "learning_rate": 8.292148675474023e-05, + "loss": 1.062, + "step": 13226 + }, + { + "epoch": 0.896198929466766, + "grad_norm": 7.1324872970581055, + "learning_rate": 8.292011773564241e-05, + "loss": 0.86, + "step": 13227 + }, + { + "epoch": 0.8962666847347381, + "grad_norm": 5.25512170791626, + "learning_rate": 8.291874871654461e-05, + "loss": 0.7487, + "step": 13228 + }, + { + "epoch": 0.8963344400027102, + "grad_norm": 4.826694488525391, + "learning_rate": 8.291737969744679e-05, + "loss": 0.6497, + "step": 13229 + }, + { + "epoch": 0.8964021952706823, + "grad_norm": 6.0009918212890625, + "learning_rate": 8.291601067834897e-05, + "loss": 0.7122, + "step": 13230 + }, + { + "epoch": 0.8964699505386544, + "grad_norm": 6.770015716552734, + "learning_rate": 8.291464165925115e-05, + "loss": 0.9121, + "step": 13231 + }, + { + "epoch": 0.8965377058066265, + "grad_norm": 7.527721405029297, + "learning_rate": 8.291327264015334e-05, + "loss": 0.7042, + "step": 13232 + }, + { + "epoch": 0.8966054610745986, + "grad_norm": 5.726056098937988, + "learning_rate": 8.291190362105552e-05, + "loss": 0.7324, + "step": 13233 + }, + { + "epoch": 0.8966732163425707, + "grad_norm": 5.054379463195801, + "learning_rate": 8.29105346019577e-05, + "loss": 0.8064, + "step": 13234 + }, + { + "epoch": 0.8967409716105427, + "grad_norm": 5.584482669830322, + "learning_rate": 8.290916558285988e-05, + "loss": 0.7225, + "step": 13235 + }, + { + "epoch": 0.8968087268785148, + "grad_norm": 6.002537727355957, + "learning_rate": 8.290779656376206e-05, + "loss": 1.0223, + "step": 13236 + }, + { + "epoch": 0.8968764821464869, + "grad_norm": 6.58087158203125, + "learning_rate": 8.290642754466426e-05, + "loss": 0.847, + "step": 13237 + }, + { + "epoch": 0.896944237414459, + "grad_norm": 7.690030097961426, + "learning_rate": 8.290505852556644e-05, + "loss": 0.6217, + "step": 13238 + }, + { + "epoch": 0.897011992682431, + "grad_norm": 6.937661170959473, + "learning_rate": 8.290368950646862e-05, + "loss": 0.8138, + "step": 13239 + }, + { + "epoch": 0.8970797479504031, + "grad_norm": 5.334490776062012, + "learning_rate": 8.29023204873708e-05, + "loss": 0.8908, + "step": 13240 + }, + { + "epoch": 0.8971475032183752, + "grad_norm": 5.833104133605957, + "learning_rate": 8.2900951468273e-05, + "loss": 0.5743, + "step": 13241 + }, + { + "epoch": 0.8972152584863473, + "grad_norm": 5.803739547729492, + "learning_rate": 8.289958244917517e-05, + "loss": 0.6845, + "step": 13242 + }, + { + "epoch": 0.8972830137543194, + "grad_norm": 5.436889171600342, + "learning_rate": 8.289821343007735e-05, + "loss": 0.8216, + "step": 13243 + }, + { + "epoch": 0.8973507690222915, + "grad_norm": 4.356090068817139, + "learning_rate": 8.289684441097953e-05, + "loss": 0.7757, + "step": 13244 + }, + { + "epoch": 0.8974185242902636, + "grad_norm": 5.816674709320068, + "learning_rate": 8.289547539188172e-05, + "loss": 0.8839, + "step": 13245 + }, + { + "epoch": 0.8974862795582357, + "grad_norm": 7.878244876861572, + "learning_rate": 8.289410637278391e-05, + "loss": 1.0252, + "step": 13246 + }, + { + "epoch": 0.8975540348262078, + "grad_norm": 6.409861087799072, + "learning_rate": 8.289273735368609e-05, + "loss": 0.8281, + "step": 13247 + }, + { + "epoch": 0.8976217900941799, + "grad_norm": 6.617053031921387, + "learning_rate": 8.289136833458827e-05, + "loss": 1.0406, + "step": 13248 + }, + { + "epoch": 0.897689545362152, + "grad_norm": 5.893568992614746, + "learning_rate": 8.288999931549045e-05, + "loss": 0.8713, + "step": 13249 + }, + { + "epoch": 0.897757300630124, + "grad_norm": 5.194582939147949, + "learning_rate": 8.288863029639263e-05, + "loss": 0.7163, + "step": 13250 + }, + { + "epoch": 0.897825055898096, + "grad_norm": 5.706815719604492, + "learning_rate": 8.288726127729482e-05, + "loss": 0.7675, + "step": 13251 + }, + { + "epoch": 0.8978928111660681, + "grad_norm": 5.490322113037109, + "learning_rate": 8.2885892258197e-05, + "loss": 0.7559, + "step": 13252 + }, + { + "epoch": 0.8979605664340402, + "grad_norm": 4.902472019195557, + "learning_rate": 8.288452323909918e-05, + "loss": 0.6095, + "step": 13253 + }, + { + "epoch": 0.8980283217020123, + "grad_norm": 4.228281497955322, + "learning_rate": 8.288315422000137e-05, + "loss": 0.5824, + "step": 13254 + }, + { + "epoch": 0.8980960769699844, + "grad_norm": 5.2611494064331055, + "learning_rate": 8.288178520090356e-05, + "loss": 0.8144, + "step": 13255 + }, + { + "epoch": 0.8981638322379565, + "grad_norm": 5.848245143890381, + "learning_rate": 8.288041618180574e-05, + "loss": 0.7095, + "step": 13256 + }, + { + "epoch": 0.8982315875059286, + "grad_norm": 5.738656044006348, + "learning_rate": 8.287904716270792e-05, + "loss": 0.731, + "step": 13257 + }, + { + "epoch": 0.8982993427739007, + "grad_norm": 6.212946891784668, + "learning_rate": 8.28776781436101e-05, + "loss": 0.6781, + "step": 13258 + }, + { + "epoch": 0.8983670980418728, + "grad_norm": 4.827934741973877, + "learning_rate": 8.287630912451228e-05, + "loss": 0.7693, + "step": 13259 + }, + { + "epoch": 0.8984348533098448, + "grad_norm": 5.135469436645508, + "learning_rate": 8.287494010541447e-05, + "loss": 0.7819, + "step": 13260 + }, + { + "epoch": 0.8985026085778169, + "grad_norm": 8.885536193847656, + "learning_rate": 8.287357108631665e-05, + "loss": 0.7556, + "step": 13261 + }, + { + "epoch": 0.898570363845789, + "grad_norm": 8.197842597961426, + "learning_rate": 8.287220206721884e-05, + "loss": 0.7379, + "step": 13262 + }, + { + "epoch": 0.8986381191137611, + "grad_norm": 5.406810283660889, + "learning_rate": 8.287083304812102e-05, + "loss": 0.6618, + "step": 13263 + }, + { + "epoch": 0.8987058743817332, + "grad_norm": 5.688655376434326, + "learning_rate": 8.286946402902321e-05, + "loss": 0.9949, + "step": 13264 + }, + { + "epoch": 0.8987736296497053, + "grad_norm": 7.776078701019287, + "learning_rate": 8.286809500992539e-05, + "loss": 0.5491, + "step": 13265 + }, + { + "epoch": 0.8988413849176774, + "grad_norm": 7.738804817199707, + "learning_rate": 8.286672599082757e-05, + "loss": 1.06, + "step": 13266 + }, + { + "epoch": 0.8989091401856494, + "grad_norm": 5.860600471496582, + "learning_rate": 8.286535697172976e-05, + "loss": 0.7621, + "step": 13267 + }, + { + "epoch": 0.8989768954536215, + "grad_norm": 5.878091335296631, + "learning_rate": 8.286398795263194e-05, + "loss": 0.9419, + "step": 13268 + }, + { + "epoch": 0.8990446507215936, + "grad_norm": 6.572638511657715, + "learning_rate": 8.286261893353412e-05, + "loss": 1.1296, + "step": 13269 + }, + { + "epoch": 0.8991124059895657, + "grad_norm": 6.110701084136963, + "learning_rate": 8.286124991443632e-05, + "loss": 0.6976, + "step": 13270 + }, + { + "epoch": 0.8991801612575377, + "grad_norm": 5.455756187438965, + "learning_rate": 8.28598808953385e-05, + "loss": 0.7511, + "step": 13271 + }, + { + "epoch": 0.8992479165255098, + "grad_norm": 5.473435878753662, + "learning_rate": 8.285851187624068e-05, + "loss": 0.7217, + "step": 13272 + }, + { + "epoch": 0.8993156717934819, + "grad_norm": 6.81781005859375, + "learning_rate": 8.285714285714287e-05, + "loss": 0.6594, + "step": 13273 + }, + { + "epoch": 0.899383427061454, + "grad_norm": 6.982245445251465, + "learning_rate": 8.285577383804505e-05, + "loss": 0.8544, + "step": 13274 + }, + { + "epoch": 0.8994511823294261, + "grad_norm": 4.480033874511719, + "learning_rate": 8.285440481894723e-05, + "loss": 0.8935, + "step": 13275 + }, + { + "epoch": 0.8995189375973982, + "grad_norm": 5.940613746643066, + "learning_rate": 8.285303579984941e-05, + "loss": 0.8299, + "step": 13276 + }, + { + "epoch": 0.8995866928653703, + "grad_norm": 5.0714335441589355, + "learning_rate": 8.28516667807516e-05, + "loss": 0.6672, + "step": 13277 + }, + { + "epoch": 0.8996544481333424, + "grad_norm": 5.617218971252441, + "learning_rate": 8.285029776165379e-05, + "loss": 0.8913, + "step": 13278 + }, + { + "epoch": 0.8997222034013145, + "grad_norm": 8.896486282348633, + "learning_rate": 8.284892874255597e-05, + "loss": 0.9386, + "step": 13279 + }, + { + "epoch": 0.8997899586692866, + "grad_norm": 6.059647083282471, + "learning_rate": 8.284755972345815e-05, + "loss": 0.8023, + "step": 13280 + }, + { + "epoch": 0.8998577139372587, + "grad_norm": 5.9106950759887695, + "learning_rate": 8.284619070436033e-05, + "loss": 0.8094, + "step": 13281 + }, + { + "epoch": 0.8999254692052308, + "grad_norm": 7.264159679412842, + "learning_rate": 8.284482168526251e-05, + "loss": 0.8013, + "step": 13282 + }, + { + "epoch": 0.8999932244732028, + "grad_norm": 7.696917533874512, + "learning_rate": 8.28434526661647e-05, + "loss": 0.8387, + "step": 13283 + }, + { + "epoch": 0.9000609797411748, + "grad_norm": 6.536654472351074, + "learning_rate": 8.284208364706688e-05, + "loss": 0.8861, + "step": 13284 + }, + { + "epoch": 0.9001287350091469, + "grad_norm": 5.371990203857422, + "learning_rate": 8.284071462796906e-05, + "loss": 0.7841, + "step": 13285 + }, + { + "epoch": 0.900196490277119, + "grad_norm": 7.402823448181152, + "learning_rate": 8.283934560887124e-05, + "loss": 0.9058, + "step": 13286 + }, + { + "epoch": 0.9002642455450911, + "grad_norm": 5.5579328536987305, + "learning_rate": 8.283797658977344e-05, + "loss": 0.698, + "step": 13287 + }, + { + "epoch": 0.9003320008130632, + "grad_norm": 5.994121074676514, + "learning_rate": 8.283660757067562e-05, + "loss": 0.7502, + "step": 13288 + }, + { + "epoch": 0.9003997560810353, + "grad_norm": 6.9693427085876465, + "learning_rate": 8.28352385515778e-05, + "loss": 0.9358, + "step": 13289 + }, + { + "epoch": 0.9004675113490074, + "grad_norm": 6.08130407333374, + "learning_rate": 8.283386953247998e-05, + "loss": 0.7868, + "step": 13290 + }, + { + "epoch": 0.9005352666169795, + "grad_norm": 6.161886692047119, + "learning_rate": 8.283250051338216e-05, + "loss": 0.6941, + "step": 13291 + }, + { + "epoch": 0.9006030218849516, + "grad_norm": 6.822792053222656, + "learning_rate": 8.283113149428435e-05, + "loss": 0.8085, + "step": 13292 + }, + { + "epoch": 0.9006707771529237, + "grad_norm": 5.883656978607178, + "learning_rate": 8.282976247518653e-05, + "loss": 0.7429, + "step": 13293 + }, + { + "epoch": 0.9007385324208957, + "grad_norm": 4.72324800491333, + "learning_rate": 8.282839345608871e-05, + "loss": 0.6083, + "step": 13294 + }, + { + "epoch": 0.9008062876888678, + "grad_norm": 5.912334442138672, + "learning_rate": 8.28270244369909e-05, + "loss": 0.8443, + "step": 13295 + }, + { + "epoch": 0.9008740429568399, + "grad_norm": 4.501663684844971, + "learning_rate": 8.282565541789309e-05, + "loss": 0.7635, + "step": 13296 + }, + { + "epoch": 0.900941798224812, + "grad_norm": 6.771607875823975, + "learning_rate": 8.282428639879527e-05, + "loss": 0.9044, + "step": 13297 + }, + { + "epoch": 0.9010095534927841, + "grad_norm": 6.7166428565979, + "learning_rate": 8.282291737969745e-05, + "loss": 0.8295, + "step": 13298 + }, + { + "epoch": 0.9010773087607562, + "grad_norm": 6.394474029541016, + "learning_rate": 8.282154836059963e-05, + "loss": 0.7899, + "step": 13299 + }, + { + "epoch": 0.9011450640287282, + "grad_norm": 5.574833869934082, + "learning_rate": 8.282017934150181e-05, + "loss": 0.752, + "step": 13300 + }, + { + "epoch": 0.9012128192967003, + "grad_norm": 6.585177898406982, + "learning_rate": 8.2818810322404e-05, + "loss": 0.6169, + "step": 13301 + }, + { + "epoch": 0.9012805745646724, + "grad_norm": 5.940279960632324, + "learning_rate": 8.281744130330618e-05, + "loss": 0.7563, + "step": 13302 + }, + { + "epoch": 0.9013483298326445, + "grad_norm": 6.039457321166992, + "learning_rate": 8.281607228420836e-05, + "loss": 0.7505, + "step": 13303 + }, + { + "epoch": 0.9014160851006165, + "grad_norm": 10.80169677734375, + "learning_rate": 8.281470326511054e-05, + "loss": 0.8651, + "step": 13304 + }, + { + "epoch": 0.9014838403685886, + "grad_norm": 5.8975043296813965, + "learning_rate": 8.281333424601273e-05, + "loss": 0.8325, + "step": 13305 + }, + { + "epoch": 0.9015515956365607, + "grad_norm": 5.902426719665527, + "learning_rate": 8.281196522691492e-05, + "loss": 0.697, + "step": 13306 + }, + { + "epoch": 0.9016193509045328, + "grad_norm": 7.651483058929443, + "learning_rate": 8.28105962078171e-05, + "loss": 0.8322, + "step": 13307 + }, + { + "epoch": 0.9016871061725049, + "grad_norm": 5.933381080627441, + "learning_rate": 8.280922718871928e-05, + "loss": 0.834, + "step": 13308 + }, + { + "epoch": 0.901754861440477, + "grad_norm": 5.620283126831055, + "learning_rate": 8.280785816962146e-05, + "loss": 0.8047, + "step": 13309 + }, + { + "epoch": 0.9018226167084491, + "grad_norm": 4.271581172943115, + "learning_rate": 8.280648915052365e-05, + "loss": 0.7422, + "step": 13310 + }, + { + "epoch": 0.9018903719764212, + "grad_norm": 7.821619033813477, + "learning_rate": 8.280512013142583e-05, + "loss": 0.9909, + "step": 13311 + }, + { + "epoch": 0.9019581272443933, + "grad_norm": 5.665806770324707, + "learning_rate": 8.280375111232801e-05, + "loss": 0.6748, + "step": 13312 + }, + { + "epoch": 0.9020258825123654, + "grad_norm": 7.500571250915527, + "learning_rate": 8.28023820932302e-05, + "loss": 0.9495, + "step": 13313 + }, + { + "epoch": 0.9020936377803375, + "grad_norm": 7.833176612854004, + "learning_rate": 8.280101307413239e-05, + "loss": 0.8774, + "step": 13314 + }, + { + "epoch": 0.9021613930483096, + "grad_norm": 10.599181175231934, + "learning_rate": 8.279964405503457e-05, + "loss": 0.9064, + "step": 13315 + }, + { + "epoch": 0.9022291483162815, + "grad_norm": 5.582043647766113, + "learning_rate": 8.279827503593675e-05, + "loss": 0.5968, + "step": 13316 + }, + { + "epoch": 0.9022969035842536, + "grad_norm": 10.205772399902344, + "learning_rate": 8.279690601683894e-05, + "loss": 0.8455, + "step": 13317 + }, + { + "epoch": 0.9023646588522257, + "grad_norm": 6.769802093505859, + "learning_rate": 8.279553699774112e-05, + "loss": 0.8242, + "step": 13318 + }, + { + "epoch": 0.9024324141201978, + "grad_norm": 8.564204216003418, + "learning_rate": 8.279416797864332e-05, + "loss": 0.9918, + "step": 13319 + }, + { + "epoch": 0.9025001693881699, + "grad_norm": 6.2231645584106445, + "learning_rate": 8.27927989595455e-05, + "loss": 0.8375, + "step": 13320 + }, + { + "epoch": 0.902567924656142, + "grad_norm": 6.065762519836426, + "learning_rate": 8.279142994044768e-05, + "loss": 0.8746, + "step": 13321 + }, + { + "epoch": 0.9026356799241141, + "grad_norm": 7.134408950805664, + "learning_rate": 8.279006092134986e-05, + "loss": 1.1725, + "step": 13322 + }, + { + "epoch": 0.9027034351920862, + "grad_norm": 7.31404972076416, + "learning_rate": 8.278869190225204e-05, + "loss": 0.6658, + "step": 13323 + }, + { + "epoch": 0.9027711904600583, + "grad_norm": 4.640890121459961, + "learning_rate": 8.278732288315423e-05, + "loss": 0.882, + "step": 13324 + }, + { + "epoch": 0.9028389457280304, + "grad_norm": 6.120473861694336, + "learning_rate": 8.278595386405641e-05, + "loss": 0.5502, + "step": 13325 + }, + { + "epoch": 0.9029067009960025, + "grad_norm": 5.745514392852783, + "learning_rate": 8.27845848449586e-05, + "loss": 0.6134, + "step": 13326 + }, + { + "epoch": 0.9029744562639745, + "grad_norm": 8.348088264465332, + "learning_rate": 8.278321582586077e-05, + "loss": 0.8996, + "step": 13327 + }, + { + "epoch": 0.9030422115319466, + "grad_norm": 6.585142612457275, + "learning_rate": 8.278184680676297e-05, + "loss": 0.8028, + "step": 13328 + }, + { + "epoch": 0.9031099667999187, + "grad_norm": 4.813357830047607, + "learning_rate": 8.278047778766515e-05, + "loss": 0.6632, + "step": 13329 + }, + { + "epoch": 0.9031777220678908, + "grad_norm": 6.714639663696289, + "learning_rate": 8.277910876856733e-05, + "loss": 0.7389, + "step": 13330 + }, + { + "epoch": 0.9032454773358629, + "grad_norm": 6.200788974761963, + "learning_rate": 8.277773974946951e-05, + "loss": 0.8529, + "step": 13331 + }, + { + "epoch": 0.903313232603835, + "grad_norm": 6.508268356323242, + "learning_rate": 8.277637073037169e-05, + "loss": 0.9533, + "step": 13332 + }, + { + "epoch": 0.903380987871807, + "grad_norm": 6.651120662689209, + "learning_rate": 8.277500171127388e-05, + "loss": 0.5568, + "step": 13333 + }, + { + "epoch": 0.9034487431397791, + "grad_norm": 6.641012668609619, + "learning_rate": 8.277363269217606e-05, + "loss": 0.7402, + "step": 13334 + }, + { + "epoch": 0.9035164984077512, + "grad_norm": 5.423649787902832, + "learning_rate": 8.277226367307824e-05, + "loss": 0.6719, + "step": 13335 + }, + { + "epoch": 0.9035842536757233, + "grad_norm": 4.8691558837890625, + "learning_rate": 8.277089465398042e-05, + "loss": 0.5897, + "step": 13336 + }, + { + "epoch": 0.9036520089436954, + "grad_norm": 4.2445759773254395, + "learning_rate": 8.27695256348826e-05, + "loss": 0.8443, + "step": 13337 + }, + { + "epoch": 0.9037197642116674, + "grad_norm": 5.379455089569092, + "learning_rate": 8.27681566157848e-05, + "loss": 0.6049, + "step": 13338 + }, + { + "epoch": 0.9037875194796395, + "grad_norm": 5.717207908630371, + "learning_rate": 8.276678759668698e-05, + "loss": 0.8009, + "step": 13339 + }, + { + "epoch": 0.9038552747476116, + "grad_norm": 5.412480354309082, + "learning_rate": 8.276541857758916e-05, + "loss": 0.7753, + "step": 13340 + }, + { + "epoch": 0.9039230300155837, + "grad_norm": 8.551285743713379, + "learning_rate": 8.276404955849134e-05, + "loss": 0.9724, + "step": 13341 + }, + { + "epoch": 0.9039907852835558, + "grad_norm": 6.7664899826049805, + "learning_rate": 8.276268053939353e-05, + "loss": 0.9088, + "step": 13342 + }, + { + "epoch": 0.9040585405515279, + "grad_norm": 6.312598705291748, + "learning_rate": 8.276131152029571e-05, + "loss": 0.6874, + "step": 13343 + }, + { + "epoch": 0.9041262958195, + "grad_norm": 6.274691104888916, + "learning_rate": 8.27599425011979e-05, + "loss": 0.9076, + "step": 13344 + }, + { + "epoch": 0.9041940510874721, + "grad_norm": 5.1724958419799805, + "learning_rate": 8.275857348210007e-05, + "loss": 0.6779, + "step": 13345 + }, + { + "epoch": 0.9042618063554442, + "grad_norm": 6.403627872467041, + "learning_rate": 8.275720446300225e-05, + "loss": 0.8524, + "step": 13346 + }, + { + "epoch": 0.9043295616234163, + "grad_norm": 5.662676811218262, + "learning_rate": 8.275583544390445e-05, + "loss": 0.8278, + "step": 13347 + }, + { + "epoch": 0.9043973168913884, + "grad_norm": 5.776680946350098, + "learning_rate": 8.275446642480663e-05, + "loss": 0.7354, + "step": 13348 + }, + { + "epoch": 0.9044650721593603, + "grad_norm": 5.1496076583862305, + "learning_rate": 8.275309740570881e-05, + "loss": 0.7018, + "step": 13349 + }, + { + "epoch": 0.9045328274273324, + "grad_norm": 6.148200035095215, + "learning_rate": 8.275172838661099e-05, + "loss": 0.7145, + "step": 13350 + }, + { + "epoch": 0.9046005826953045, + "grad_norm": 6.302008152008057, + "learning_rate": 8.275035936751318e-05, + "loss": 0.7148, + "step": 13351 + }, + { + "epoch": 0.9046683379632766, + "grad_norm": 5.558827877044678, + "learning_rate": 8.274899034841536e-05, + "loss": 0.7591, + "step": 13352 + }, + { + "epoch": 0.9047360932312487, + "grad_norm": 4.537057876586914, + "learning_rate": 8.274762132931754e-05, + "loss": 0.5739, + "step": 13353 + }, + { + "epoch": 0.9048038484992208, + "grad_norm": 5.1838555335998535, + "learning_rate": 8.274625231021972e-05, + "loss": 0.7811, + "step": 13354 + }, + { + "epoch": 0.9048716037671929, + "grad_norm": 5.07068395614624, + "learning_rate": 8.27448832911219e-05, + "loss": 0.9203, + "step": 13355 + }, + { + "epoch": 0.904939359035165, + "grad_norm": 6.439149856567383, + "learning_rate": 8.27435142720241e-05, + "loss": 0.7654, + "step": 13356 + }, + { + "epoch": 0.9050071143031371, + "grad_norm": 8.04731559753418, + "learning_rate": 8.274214525292628e-05, + "loss": 1.1907, + "step": 13357 + }, + { + "epoch": 0.9050748695711092, + "grad_norm": 5.4005961418151855, + "learning_rate": 8.274077623382846e-05, + "loss": 0.8081, + "step": 13358 + }, + { + "epoch": 0.9051426248390813, + "grad_norm": 5.338225364685059, + "learning_rate": 8.273940721473064e-05, + "loss": 0.5817, + "step": 13359 + }, + { + "epoch": 0.9052103801070533, + "grad_norm": 7.287635326385498, + "learning_rate": 8.273803819563283e-05, + "loss": 0.9362, + "step": 13360 + }, + { + "epoch": 0.9052781353750254, + "grad_norm": 7.328275203704834, + "learning_rate": 8.273666917653501e-05, + "loss": 0.9302, + "step": 13361 + }, + { + "epoch": 0.9053458906429975, + "grad_norm": 5.7136359214782715, + "learning_rate": 8.27353001574372e-05, + "loss": 0.8013, + "step": 13362 + }, + { + "epoch": 0.9054136459109696, + "grad_norm": 5.051130294799805, + "learning_rate": 8.273393113833939e-05, + "loss": 0.6682, + "step": 13363 + }, + { + "epoch": 0.9054814011789417, + "grad_norm": 6.250859260559082, + "learning_rate": 8.273256211924157e-05, + "loss": 0.8889, + "step": 13364 + }, + { + "epoch": 0.9055491564469137, + "grad_norm": 6.39178991317749, + "learning_rate": 8.273119310014375e-05, + "loss": 0.6838, + "step": 13365 + }, + { + "epoch": 0.9056169117148858, + "grad_norm": 5.243597507476807, + "learning_rate": 8.272982408104594e-05, + "loss": 0.8019, + "step": 13366 + }, + { + "epoch": 0.9056846669828579, + "grad_norm": 6.917558670043945, + "learning_rate": 8.272845506194812e-05, + "loss": 0.7121, + "step": 13367 + }, + { + "epoch": 0.90575242225083, + "grad_norm": 4.401981830596924, + "learning_rate": 8.27270860428503e-05, + "loss": 0.5813, + "step": 13368 + }, + { + "epoch": 0.9058201775188021, + "grad_norm": 6.838183879852295, + "learning_rate": 8.272571702375248e-05, + "loss": 0.8273, + "step": 13369 + }, + { + "epoch": 0.9058879327867742, + "grad_norm": 6.264207363128662, + "learning_rate": 8.272434800465468e-05, + "loss": 0.6854, + "step": 13370 + }, + { + "epoch": 0.9059556880547462, + "grad_norm": 4.945788383483887, + "learning_rate": 8.272297898555686e-05, + "loss": 0.5941, + "step": 13371 + }, + { + "epoch": 0.9060234433227183, + "grad_norm": 6.928656578063965, + "learning_rate": 8.272160996645904e-05, + "loss": 0.754, + "step": 13372 + }, + { + "epoch": 0.9060911985906904, + "grad_norm": 4.5859551429748535, + "learning_rate": 8.272024094736122e-05, + "loss": 0.6271, + "step": 13373 + }, + { + "epoch": 0.9061589538586625, + "grad_norm": 6.357034206390381, + "learning_rate": 8.271887192826341e-05, + "loss": 0.6632, + "step": 13374 + }, + { + "epoch": 0.9062267091266346, + "grad_norm": 6.899624824523926, + "learning_rate": 8.271750290916559e-05, + "loss": 0.8332, + "step": 13375 + }, + { + "epoch": 0.9062944643946067, + "grad_norm": 8.641779899597168, + "learning_rate": 8.271613389006777e-05, + "loss": 0.8412, + "step": 13376 + }, + { + "epoch": 0.9063622196625788, + "grad_norm": 6.443648815155029, + "learning_rate": 8.271476487096995e-05, + "loss": 0.8374, + "step": 13377 + }, + { + "epoch": 0.9064299749305509, + "grad_norm": 8.222790718078613, + "learning_rate": 8.271339585187213e-05, + "loss": 0.9038, + "step": 13378 + }, + { + "epoch": 0.906497730198523, + "grad_norm": 5.121990203857422, + "learning_rate": 8.271202683277433e-05, + "loss": 0.6617, + "step": 13379 + }, + { + "epoch": 0.9065654854664951, + "grad_norm": 6.855564117431641, + "learning_rate": 8.271065781367651e-05, + "loss": 0.7803, + "step": 13380 + }, + { + "epoch": 0.906633240734467, + "grad_norm": 6.987429141998291, + "learning_rate": 8.270928879457869e-05, + "loss": 0.6917, + "step": 13381 + }, + { + "epoch": 0.9067009960024391, + "grad_norm": 6.5699896812438965, + "learning_rate": 8.270791977548087e-05, + "loss": 0.9687, + "step": 13382 + }, + { + "epoch": 0.9067687512704112, + "grad_norm": 6.9197797775268555, + "learning_rate": 8.270655075638306e-05, + "loss": 0.7054, + "step": 13383 + }, + { + "epoch": 0.9068365065383833, + "grad_norm": 6.5933380126953125, + "learning_rate": 8.270518173728524e-05, + "loss": 0.7276, + "step": 13384 + }, + { + "epoch": 0.9069042618063554, + "grad_norm": 5.359493732452393, + "learning_rate": 8.270381271818742e-05, + "loss": 0.6566, + "step": 13385 + }, + { + "epoch": 0.9069720170743275, + "grad_norm": 9.177874565124512, + "learning_rate": 8.27024436990896e-05, + "loss": 0.7313, + "step": 13386 + }, + { + "epoch": 0.9070397723422996, + "grad_norm": 7.813451766967773, + "learning_rate": 8.270107467999178e-05, + "loss": 0.6503, + "step": 13387 + }, + { + "epoch": 0.9071075276102717, + "grad_norm": 5.215133190155029, + "learning_rate": 8.269970566089398e-05, + "loss": 0.8036, + "step": 13388 + }, + { + "epoch": 0.9071752828782438, + "grad_norm": 5.473424434661865, + "learning_rate": 8.269833664179616e-05, + "loss": 0.7178, + "step": 13389 + }, + { + "epoch": 0.9072430381462159, + "grad_norm": 7.770848751068115, + "learning_rate": 8.269696762269834e-05, + "loss": 0.8419, + "step": 13390 + }, + { + "epoch": 0.907310793414188, + "grad_norm": 6.786417007446289, + "learning_rate": 8.269559860360052e-05, + "loss": 0.9242, + "step": 13391 + }, + { + "epoch": 0.9073785486821601, + "grad_norm": 4.775079250335693, + "learning_rate": 8.26942295845027e-05, + "loss": 0.7514, + "step": 13392 + }, + { + "epoch": 0.9074463039501321, + "grad_norm": 5.795116901397705, + "learning_rate": 8.26928605654049e-05, + "loss": 0.8425, + "step": 13393 + }, + { + "epoch": 0.9075140592181042, + "grad_norm": 7.329837322235107, + "learning_rate": 8.269149154630707e-05, + "loss": 0.8125, + "step": 13394 + }, + { + "epoch": 0.9075818144860763, + "grad_norm": 4.981736660003662, + "learning_rate": 8.269012252720925e-05, + "loss": 0.6766, + "step": 13395 + }, + { + "epoch": 0.9076495697540484, + "grad_norm": 7.599316120147705, + "learning_rate": 8.268875350811143e-05, + "loss": 0.6769, + "step": 13396 + }, + { + "epoch": 0.9077173250220205, + "grad_norm": 5.966447830200195, + "learning_rate": 8.268738448901363e-05, + "loss": 0.7356, + "step": 13397 + }, + { + "epoch": 0.9077850802899925, + "grad_norm": 6.569231986999512, + "learning_rate": 8.268601546991581e-05, + "loss": 0.84, + "step": 13398 + }, + { + "epoch": 0.9078528355579646, + "grad_norm": 5.759048938751221, + "learning_rate": 8.268464645081799e-05, + "loss": 0.9159, + "step": 13399 + }, + { + "epoch": 0.9079205908259367, + "grad_norm": 5.395700931549072, + "learning_rate": 8.268327743172017e-05, + "loss": 0.662, + "step": 13400 + }, + { + "epoch": 0.9079883460939088, + "grad_norm": 6.1797099113464355, + "learning_rate": 8.268190841262235e-05, + "loss": 0.6498, + "step": 13401 + }, + { + "epoch": 0.9080561013618809, + "grad_norm": 7.686827659606934, + "learning_rate": 8.268053939352454e-05, + "loss": 0.9687, + "step": 13402 + }, + { + "epoch": 0.908123856629853, + "grad_norm": 6.104824066162109, + "learning_rate": 8.267917037442672e-05, + "loss": 0.7184, + "step": 13403 + }, + { + "epoch": 0.908191611897825, + "grad_norm": 6.225331783294678, + "learning_rate": 8.26778013553289e-05, + "loss": 0.8403, + "step": 13404 + }, + { + "epoch": 0.9082593671657971, + "grad_norm": 5.349873065948486, + "learning_rate": 8.267643233623108e-05, + "loss": 0.8676, + "step": 13405 + }, + { + "epoch": 0.9083271224337692, + "grad_norm": 5.427150249481201, + "learning_rate": 8.267506331713328e-05, + "loss": 0.7199, + "step": 13406 + }, + { + "epoch": 0.9083948777017413, + "grad_norm": 6.000942707061768, + "learning_rate": 8.267369429803546e-05, + "loss": 0.6516, + "step": 13407 + }, + { + "epoch": 0.9084626329697134, + "grad_norm": 7.7418532371521, + "learning_rate": 8.267232527893764e-05, + "loss": 0.8854, + "step": 13408 + }, + { + "epoch": 0.9085303882376855, + "grad_norm": 6.350762844085693, + "learning_rate": 8.267095625983983e-05, + "loss": 1.0689, + "step": 13409 + }, + { + "epoch": 0.9085981435056576, + "grad_norm": 6.675297260284424, + "learning_rate": 8.266958724074201e-05, + "loss": 1.1803, + "step": 13410 + }, + { + "epoch": 0.9086658987736297, + "grad_norm": 5.116997718811035, + "learning_rate": 8.26682182216442e-05, + "loss": 0.6724, + "step": 13411 + }, + { + "epoch": 0.9087336540416018, + "grad_norm": 6.529482364654541, + "learning_rate": 8.266684920254639e-05, + "loss": 0.7508, + "step": 13412 + }, + { + "epoch": 0.9088014093095739, + "grad_norm": 6.559128284454346, + "learning_rate": 8.266548018344857e-05, + "loss": 0.7786, + "step": 13413 + }, + { + "epoch": 0.9088691645775459, + "grad_norm": 6.002798080444336, + "learning_rate": 8.266411116435075e-05, + "loss": 0.613, + "step": 13414 + }, + { + "epoch": 0.9089369198455179, + "grad_norm": 5.554568767547607, + "learning_rate": 8.266274214525293e-05, + "loss": 0.7622, + "step": 13415 + }, + { + "epoch": 0.90900467511349, + "grad_norm": 4.8590521812438965, + "learning_rate": 8.266137312615512e-05, + "loss": 0.6765, + "step": 13416 + }, + { + "epoch": 0.9090724303814621, + "grad_norm": 5.818526744842529, + "learning_rate": 8.26600041070573e-05, + "loss": 0.8034, + "step": 13417 + }, + { + "epoch": 0.9091401856494342, + "grad_norm": 5.614035606384277, + "learning_rate": 8.265863508795948e-05, + "loss": 0.6917, + "step": 13418 + }, + { + "epoch": 0.9092079409174063, + "grad_norm": 4.978684425354004, + "learning_rate": 8.265726606886166e-05, + "loss": 0.6436, + "step": 13419 + }, + { + "epoch": 0.9092756961853784, + "grad_norm": 4.747771263122559, + "learning_rate": 8.265589704976386e-05, + "loss": 0.7033, + "step": 13420 + }, + { + "epoch": 0.9093434514533505, + "grad_norm": 5.64393424987793, + "learning_rate": 8.265452803066604e-05, + "loss": 0.6504, + "step": 13421 + }, + { + "epoch": 0.9094112067213226, + "grad_norm": 4.84307336807251, + "learning_rate": 8.265315901156822e-05, + "loss": 0.6661, + "step": 13422 + }, + { + "epoch": 0.9094789619892947, + "grad_norm": 7.957591533660889, + "learning_rate": 8.26517899924704e-05, + "loss": 0.797, + "step": 13423 + }, + { + "epoch": 0.9095467172572668, + "grad_norm": 7.481930255889893, + "learning_rate": 8.265042097337258e-05, + "loss": 0.9757, + "step": 13424 + }, + { + "epoch": 0.9096144725252389, + "grad_norm": 9.150352478027344, + "learning_rate": 8.264905195427477e-05, + "loss": 0.6074, + "step": 13425 + }, + { + "epoch": 0.909682227793211, + "grad_norm": 7.335263252258301, + "learning_rate": 8.264768293517695e-05, + "loss": 0.7082, + "step": 13426 + }, + { + "epoch": 0.909749983061183, + "grad_norm": 6.145523548126221, + "learning_rate": 8.264631391607913e-05, + "loss": 0.7136, + "step": 13427 + }, + { + "epoch": 0.9098177383291551, + "grad_norm": 8.037848472595215, + "learning_rate": 8.264494489698131e-05, + "loss": 0.7071, + "step": 13428 + }, + { + "epoch": 0.9098854935971272, + "grad_norm": 7.532377243041992, + "learning_rate": 8.264357587788351e-05, + "loss": 0.7352, + "step": 13429 + }, + { + "epoch": 0.9099532488650992, + "grad_norm": 5.716782093048096, + "learning_rate": 8.264220685878569e-05, + "loss": 0.6108, + "step": 13430 + }, + { + "epoch": 0.9100210041330713, + "grad_norm": 5.553226947784424, + "learning_rate": 8.264083783968787e-05, + "loss": 0.8414, + "step": 13431 + }, + { + "epoch": 0.9100887594010434, + "grad_norm": 5.714873313903809, + "learning_rate": 8.263946882059005e-05, + "loss": 0.7346, + "step": 13432 + }, + { + "epoch": 0.9101565146690155, + "grad_norm": 8.493986129760742, + "learning_rate": 8.263809980149223e-05, + "loss": 0.7027, + "step": 13433 + }, + { + "epoch": 0.9102242699369876, + "grad_norm": 4.617614269256592, + "learning_rate": 8.263673078239442e-05, + "loss": 0.7141, + "step": 13434 + }, + { + "epoch": 0.9102920252049597, + "grad_norm": 5.8947649002075195, + "learning_rate": 8.26353617632966e-05, + "loss": 0.8719, + "step": 13435 + }, + { + "epoch": 0.9103597804729318, + "grad_norm": 7.168681621551514, + "learning_rate": 8.263399274419878e-05, + "loss": 1.0277, + "step": 13436 + }, + { + "epoch": 0.9104275357409038, + "grad_norm": 7.014023303985596, + "learning_rate": 8.263262372510096e-05, + "loss": 0.9736, + "step": 13437 + }, + { + "epoch": 0.9104952910088759, + "grad_norm": 6.311854362487793, + "learning_rate": 8.263125470600314e-05, + "loss": 0.7287, + "step": 13438 + }, + { + "epoch": 0.910563046276848, + "grad_norm": 6.3631486892700195, + "learning_rate": 8.262988568690534e-05, + "loss": 0.8432, + "step": 13439 + }, + { + "epoch": 0.9106308015448201, + "grad_norm": 7.433747291564941, + "learning_rate": 8.262851666780752e-05, + "loss": 0.8998, + "step": 13440 + }, + { + "epoch": 0.9106985568127922, + "grad_norm": 6.583968639373779, + "learning_rate": 8.26271476487097e-05, + "loss": 0.7926, + "step": 13441 + }, + { + "epoch": 0.9107663120807643, + "grad_norm": 5.414675235748291, + "learning_rate": 8.262577862961188e-05, + "loss": 0.6653, + "step": 13442 + }, + { + "epoch": 0.9108340673487364, + "grad_norm": 6.068636417388916, + "learning_rate": 8.262440961051407e-05, + "loss": 0.7299, + "step": 13443 + }, + { + "epoch": 0.9109018226167085, + "grad_norm": 6.4102277755737305, + "learning_rate": 8.262304059141625e-05, + "loss": 0.6988, + "step": 13444 + }, + { + "epoch": 0.9109695778846806, + "grad_norm": 7.261764049530029, + "learning_rate": 8.262167157231843e-05, + "loss": 0.7473, + "step": 13445 + }, + { + "epoch": 0.9110373331526527, + "grad_norm": 6.129340171813965, + "learning_rate": 8.262030255322061e-05, + "loss": 0.7437, + "step": 13446 + }, + { + "epoch": 0.9111050884206247, + "grad_norm": 7.05224609375, + "learning_rate": 8.26189335341228e-05, + "loss": 0.8106, + "step": 13447 + }, + { + "epoch": 0.9111728436885967, + "grad_norm": 6.788548469543457, + "learning_rate": 8.261756451502499e-05, + "loss": 0.8289, + "step": 13448 + }, + { + "epoch": 0.9112405989565688, + "grad_norm": 6.397700786590576, + "learning_rate": 8.261619549592717e-05, + "loss": 0.7633, + "step": 13449 + }, + { + "epoch": 0.9113083542245409, + "grad_norm": 6.438706874847412, + "learning_rate": 8.261482647682935e-05, + "loss": 1.0421, + "step": 13450 + }, + { + "epoch": 0.911376109492513, + "grad_norm": 5.280536651611328, + "learning_rate": 8.261345745773153e-05, + "loss": 0.7404, + "step": 13451 + }, + { + "epoch": 0.9114438647604851, + "grad_norm": 5.514378070831299, + "learning_rate": 8.261208843863372e-05, + "loss": 0.6536, + "step": 13452 + }, + { + "epoch": 0.9115116200284572, + "grad_norm": 6.182071685791016, + "learning_rate": 8.26107194195359e-05, + "loss": 1.0355, + "step": 13453 + }, + { + "epoch": 0.9115793752964293, + "grad_norm": 5.280673503875732, + "learning_rate": 8.260935040043808e-05, + "loss": 0.6193, + "step": 13454 + }, + { + "epoch": 0.9116471305644014, + "grad_norm": 6.420233726501465, + "learning_rate": 8.260798138134028e-05, + "loss": 0.7545, + "step": 13455 + }, + { + "epoch": 0.9117148858323735, + "grad_norm": 4.983266353607178, + "learning_rate": 8.260661236224246e-05, + "loss": 0.7821, + "step": 13456 + }, + { + "epoch": 0.9117826411003456, + "grad_norm": 7.941768646240234, + "learning_rate": 8.260524334314464e-05, + "loss": 0.851, + "step": 13457 + }, + { + "epoch": 0.9118503963683177, + "grad_norm": 6.576548099517822, + "learning_rate": 8.260387432404683e-05, + "loss": 1.0445, + "step": 13458 + }, + { + "epoch": 0.9119181516362898, + "grad_norm": 4.9454121589660645, + "learning_rate": 8.260250530494901e-05, + "loss": 0.7082, + "step": 13459 + }, + { + "epoch": 0.9119859069042618, + "grad_norm": 5.048241138458252, + "learning_rate": 8.260113628585119e-05, + "loss": 0.6747, + "step": 13460 + }, + { + "epoch": 0.9120536621722339, + "grad_norm": 6.229146957397461, + "learning_rate": 8.259976726675339e-05, + "loss": 0.9753, + "step": 13461 + }, + { + "epoch": 0.912121417440206, + "grad_norm": 7.635250568389893, + "learning_rate": 8.259839824765557e-05, + "loss": 0.7616, + "step": 13462 + }, + { + "epoch": 0.912189172708178, + "grad_norm": 5.580092430114746, + "learning_rate": 8.259702922855775e-05, + "loss": 0.5658, + "step": 13463 + }, + { + "epoch": 0.9122569279761501, + "grad_norm": 5.2181715965271, + "learning_rate": 8.259566020945993e-05, + "loss": 0.7053, + "step": 13464 + }, + { + "epoch": 0.9123246832441222, + "grad_norm": 6.143484592437744, + "learning_rate": 8.259429119036211e-05, + "loss": 0.6313, + "step": 13465 + }, + { + "epoch": 0.9123924385120943, + "grad_norm": 5.8051323890686035, + "learning_rate": 8.25929221712643e-05, + "loss": 0.7655, + "step": 13466 + }, + { + "epoch": 0.9124601937800664, + "grad_norm": 5.798500061035156, + "learning_rate": 8.259155315216648e-05, + "loss": 0.7226, + "step": 13467 + }, + { + "epoch": 0.9125279490480385, + "grad_norm": 5.753519535064697, + "learning_rate": 8.259018413306866e-05, + "loss": 0.7487, + "step": 13468 + }, + { + "epoch": 0.9125957043160106, + "grad_norm": 5.924856662750244, + "learning_rate": 8.258881511397084e-05, + "loss": 0.821, + "step": 13469 + }, + { + "epoch": 0.9126634595839827, + "grad_norm": 5.954171180725098, + "learning_rate": 8.258744609487302e-05, + "loss": 0.8115, + "step": 13470 + }, + { + "epoch": 0.9127312148519547, + "grad_norm": 6.430693626403809, + "learning_rate": 8.258607707577522e-05, + "loss": 0.7691, + "step": 13471 + }, + { + "epoch": 0.9127989701199268, + "grad_norm": 4.200928688049316, + "learning_rate": 8.25847080566774e-05, + "loss": 0.6181, + "step": 13472 + }, + { + "epoch": 0.9128667253878989, + "grad_norm": 6.997094631195068, + "learning_rate": 8.258333903757958e-05, + "loss": 1.0318, + "step": 13473 + }, + { + "epoch": 0.912934480655871, + "grad_norm": 6.03936243057251, + "learning_rate": 8.258197001848176e-05, + "loss": 0.8103, + "step": 13474 + }, + { + "epoch": 0.9130022359238431, + "grad_norm": 8.341856002807617, + "learning_rate": 8.258060099938395e-05, + "loss": 0.8473, + "step": 13475 + }, + { + "epoch": 0.9130699911918152, + "grad_norm": 7.553666591644287, + "learning_rate": 8.257923198028613e-05, + "loss": 0.8751, + "step": 13476 + }, + { + "epoch": 0.9131377464597873, + "grad_norm": 4.391775608062744, + "learning_rate": 8.257786296118831e-05, + "loss": 0.6136, + "step": 13477 + }, + { + "epoch": 0.9132055017277594, + "grad_norm": 5.554635047912598, + "learning_rate": 8.25764939420905e-05, + "loss": 0.584, + "step": 13478 + }, + { + "epoch": 0.9132732569957314, + "grad_norm": 4.761404037475586, + "learning_rate": 8.257512492299267e-05, + "loss": 0.7827, + "step": 13479 + }, + { + "epoch": 0.9133410122637035, + "grad_norm": 5.858761310577393, + "learning_rate": 8.257375590389487e-05, + "loss": 0.7184, + "step": 13480 + }, + { + "epoch": 0.9134087675316755, + "grad_norm": 5.618086338043213, + "learning_rate": 8.257238688479705e-05, + "loss": 0.6523, + "step": 13481 + }, + { + "epoch": 0.9134765227996476, + "grad_norm": 5.3456854820251465, + "learning_rate": 8.257101786569923e-05, + "loss": 0.7383, + "step": 13482 + }, + { + "epoch": 0.9135442780676197, + "grad_norm": 7.581971645355225, + "learning_rate": 8.256964884660141e-05, + "loss": 0.8992, + "step": 13483 + }, + { + "epoch": 0.9136120333355918, + "grad_norm": 7.308854579925537, + "learning_rate": 8.25682798275036e-05, + "loss": 0.8322, + "step": 13484 + }, + { + "epoch": 0.9136797886035639, + "grad_norm": 6.603922367095947, + "learning_rate": 8.256691080840578e-05, + "loss": 0.7841, + "step": 13485 + }, + { + "epoch": 0.913747543871536, + "grad_norm": 5.805631637573242, + "learning_rate": 8.256554178930796e-05, + "loss": 0.6594, + "step": 13486 + }, + { + "epoch": 0.9138152991395081, + "grad_norm": 5.206735134124756, + "learning_rate": 8.256417277021014e-05, + "loss": 0.6513, + "step": 13487 + }, + { + "epoch": 0.9138830544074802, + "grad_norm": 7.223725318908691, + "learning_rate": 8.256280375111232e-05, + "loss": 0.768, + "step": 13488 + }, + { + "epoch": 0.9139508096754523, + "grad_norm": 5.860501766204834, + "learning_rate": 8.256143473201452e-05, + "loss": 0.8824, + "step": 13489 + }, + { + "epoch": 0.9140185649434244, + "grad_norm": 7.430714130401611, + "learning_rate": 8.25600657129167e-05, + "loss": 0.8971, + "step": 13490 + }, + { + "epoch": 0.9140863202113965, + "grad_norm": 6.006752014160156, + "learning_rate": 8.255869669381888e-05, + "loss": 0.7208, + "step": 13491 + }, + { + "epoch": 0.9141540754793686, + "grad_norm": 6.061511993408203, + "learning_rate": 8.255732767472106e-05, + "loss": 0.7222, + "step": 13492 + }, + { + "epoch": 0.9142218307473406, + "grad_norm": 5.890130043029785, + "learning_rate": 8.255595865562324e-05, + "loss": 0.7764, + "step": 13493 + }, + { + "epoch": 0.9142895860153127, + "grad_norm": 6.152371883392334, + "learning_rate": 8.255458963652543e-05, + "loss": 0.896, + "step": 13494 + }, + { + "epoch": 0.9143573412832848, + "grad_norm": 5.317947864532471, + "learning_rate": 8.255322061742761e-05, + "loss": 0.7058, + "step": 13495 + }, + { + "epoch": 0.9144250965512568, + "grad_norm": 5.516234397888184, + "learning_rate": 8.25518515983298e-05, + "loss": 0.7669, + "step": 13496 + }, + { + "epoch": 0.9144928518192289, + "grad_norm": 5.327831268310547, + "learning_rate": 8.255048257923197e-05, + "loss": 0.6878, + "step": 13497 + }, + { + "epoch": 0.914560607087201, + "grad_norm": 6.158792495727539, + "learning_rate": 8.254911356013417e-05, + "loss": 0.7393, + "step": 13498 + }, + { + "epoch": 0.9146283623551731, + "grad_norm": 5.450939655303955, + "learning_rate": 8.254774454103635e-05, + "loss": 0.7131, + "step": 13499 + }, + { + "epoch": 0.9146961176231452, + "grad_norm": 5.03593635559082, + "learning_rate": 8.254637552193853e-05, + "loss": 0.5819, + "step": 13500 + }, + { + "epoch": 0.9147638728911173, + "grad_norm": 7.179828643798828, + "learning_rate": 8.254500650284072e-05, + "loss": 0.8142, + "step": 13501 + }, + { + "epoch": 0.9148316281590894, + "grad_norm": 7.470744609832764, + "learning_rate": 8.25436374837429e-05, + "loss": 0.7455, + "step": 13502 + }, + { + "epoch": 0.9148993834270615, + "grad_norm": 6.138060569763184, + "learning_rate": 8.254226846464508e-05, + "loss": 0.7474, + "step": 13503 + }, + { + "epoch": 0.9149671386950335, + "grad_norm": 6.90313720703125, + "learning_rate": 8.254089944554728e-05, + "loss": 0.6724, + "step": 13504 + }, + { + "epoch": 0.9150348939630056, + "grad_norm": 6.103466033935547, + "learning_rate": 8.253953042644946e-05, + "loss": 0.7163, + "step": 13505 + }, + { + "epoch": 0.9151026492309777, + "grad_norm": 7.782615661621094, + "learning_rate": 8.253816140735164e-05, + "loss": 0.9758, + "step": 13506 + }, + { + "epoch": 0.9151704044989498, + "grad_norm": 6.611595630645752, + "learning_rate": 8.253679238825383e-05, + "loss": 0.7287, + "step": 13507 + }, + { + "epoch": 0.9152381597669219, + "grad_norm": 7.578495502471924, + "learning_rate": 8.253542336915601e-05, + "loss": 0.8566, + "step": 13508 + }, + { + "epoch": 0.915305915034894, + "grad_norm": 5.957526206970215, + "learning_rate": 8.253405435005819e-05, + "loss": 0.712, + "step": 13509 + }, + { + "epoch": 0.9153736703028661, + "grad_norm": 7.773584365844727, + "learning_rate": 8.253268533096037e-05, + "loss": 0.8836, + "step": 13510 + }, + { + "epoch": 0.9154414255708382, + "grad_norm": 6.391456604003906, + "learning_rate": 8.253131631186255e-05, + "loss": 0.6992, + "step": 13511 + }, + { + "epoch": 0.9155091808388102, + "grad_norm": 6.0647687911987305, + "learning_rate": 8.252994729276475e-05, + "loss": 0.6445, + "step": 13512 + }, + { + "epoch": 0.9155769361067823, + "grad_norm": 5.610140800476074, + "learning_rate": 8.252857827366693e-05, + "loss": 0.7257, + "step": 13513 + }, + { + "epoch": 0.9156446913747543, + "grad_norm": 4.631004810333252, + "learning_rate": 8.252720925456911e-05, + "loss": 0.7086, + "step": 13514 + }, + { + "epoch": 0.9157124466427264, + "grad_norm": 5.651133060455322, + "learning_rate": 8.252584023547129e-05, + "loss": 1.0037, + "step": 13515 + }, + { + "epoch": 0.9157802019106985, + "grad_norm": 6.308481693267822, + "learning_rate": 8.252447121637348e-05, + "loss": 0.8149, + "step": 13516 + }, + { + "epoch": 0.9158479571786706, + "grad_norm": 7.645565509796143, + "learning_rate": 8.252310219727566e-05, + "loss": 0.698, + "step": 13517 + }, + { + "epoch": 0.9159157124466427, + "grad_norm": 6.530562400817871, + "learning_rate": 8.252173317817784e-05, + "loss": 0.6629, + "step": 13518 + }, + { + "epoch": 0.9159834677146148, + "grad_norm": 6.290469169616699, + "learning_rate": 8.252036415908002e-05, + "loss": 0.9803, + "step": 13519 + }, + { + "epoch": 0.9160512229825869, + "grad_norm": 6.369983673095703, + "learning_rate": 8.25189951399822e-05, + "loss": 0.8061, + "step": 13520 + }, + { + "epoch": 0.916118978250559, + "grad_norm": 5.648921489715576, + "learning_rate": 8.25176261208844e-05, + "loss": 0.8378, + "step": 13521 + }, + { + "epoch": 0.9161867335185311, + "grad_norm": 5.623139381408691, + "learning_rate": 8.251625710178658e-05, + "loss": 0.9232, + "step": 13522 + }, + { + "epoch": 0.9162544887865032, + "grad_norm": 7.031765460968018, + "learning_rate": 8.251488808268876e-05, + "loss": 0.9497, + "step": 13523 + }, + { + "epoch": 0.9163222440544753, + "grad_norm": 5.612135410308838, + "learning_rate": 8.251351906359094e-05, + "loss": 0.931, + "step": 13524 + }, + { + "epoch": 0.9163899993224474, + "grad_norm": 4.216965675354004, + "learning_rate": 8.251215004449312e-05, + "loss": 0.716, + "step": 13525 + }, + { + "epoch": 0.9164577545904194, + "grad_norm": 5.5640645027160645, + "learning_rate": 8.251078102539531e-05, + "loss": 0.6835, + "step": 13526 + }, + { + "epoch": 0.9165255098583915, + "grad_norm": 4.81538200378418, + "learning_rate": 8.250941200629749e-05, + "loss": 0.7627, + "step": 13527 + }, + { + "epoch": 0.9165932651263635, + "grad_norm": 5.384982109069824, + "learning_rate": 8.250804298719967e-05, + "loss": 0.5343, + "step": 13528 + }, + { + "epoch": 0.9166610203943356, + "grad_norm": 6.007334232330322, + "learning_rate": 8.250667396810185e-05, + "loss": 0.7204, + "step": 13529 + }, + { + "epoch": 0.9167287756623077, + "grad_norm": 6.315242290496826, + "learning_rate": 8.250530494900405e-05, + "loss": 0.7633, + "step": 13530 + }, + { + "epoch": 0.9167965309302798, + "grad_norm": 5.613879203796387, + "learning_rate": 8.250393592990623e-05, + "loss": 0.6349, + "step": 13531 + }, + { + "epoch": 0.9168642861982519, + "grad_norm": 6.26859712600708, + "learning_rate": 8.250256691080841e-05, + "loss": 0.7504, + "step": 13532 + }, + { + "epoch": 0.916932041466224, + "grad_norm": 7.103095054626465, + "learning_rate": 8.250119789171059e-05, + "loss": 1.1573, + "step": 13533 + }, + { + "epoch": 0.9169997967341961, + "grad_norm": 5.007758617401123, + "learning_rate": 8.249982887261277e-05, + "loss": 0.7824, + "step": 13534 + }, + { + "epoch": 0.9170675520021682, + "grad_norm": 7.608954429626465, + "learning_rate": 8.249845985351496e-05, + "loss": 0.9533, + "step": 13535 + }, + { + "epoch": 0.9171353072701403, + "grad_norm": 6.751156806945801, + "learning_rate": 8.249709083441714e-05, + "loss": 0.6626, + "step": 13536 + }, + { + "epoch": 0.9172030625381123, + "grad_norm": 6.883885860443115, + "learning_rate": 8.249572181531932e-05, + "loss": 0.7154, + "step": 13537 + }, + { + "epoch": 0.9172708178060844, + "grad_norm": 5.1738481521606445, + "learning_rate": 8.24943527962215e-05, + "loss": 0.6065, + "step": 13538 + }, + { + "epoch": 0.9173385730740565, + "grad_norm": 5.373385906219482, + "learning_rate": 8.24929837771237e-05, + "loss": 0.7061, + "step": 13539 + }, + { + "epoch": 0.9174063283420286, + "grad_norm": 6.883655548095703, + "learning_rate": 8.249161475802588e-05, + "loss": 1.0318, + "step": 13540 + }, + { + "epoch": 0.9174740836100007, + "grad_norm": 6.240233898162842, + "learning_rate": 8.249024573892806e-05, + "loss": 0.7813, + "step": 13541 + }, + { + "epoch": 0.9175418388779728, + "grad_norm": 3.9278335571289062, + "learning_rate": 8.248887671983024e-05, + "loss": 0.5349, + "step": 13542 + }, + { + "epoch": 0.9176095941459449, + "grad_norm": 4.724002838134766, + "learning_rate": 8.248750770073242e-05, + "loss": 0.6608, + "step": 13543 + }, + { + "epoch": 0.917677349413917, + "grad_norm": 6.562617778778076, + "learning_rate": 8.248613868163461e-05, + "loss": 0.7865, + "step": 13544 + }, + { + "epoch": 0.917745104681889, + "grad_norm": 5.862000942230225, + "learning_rate": 8.248476966253679e-05, + "loss": 0.7887, + "step": 13545 + }, + { + "epoch": 0.9178128599498611, + "grad_norm": 5.171735763549805, + "learning_rate": 8.248340064343897e-05, + "loss": 0.5521, + "step": 13546 + }, + { + "epoch": 0.9178806152178332, + "grad_norm": 6.028750419616699, + "learning_rate": 8.248203162434117e-05, + "loss": 0.7401, + "step": 13547 + }, + { + "epoch": 0.9179483704858052, + "grad_norm": 6.340005397796631, + "learning_rate": 8.248066260524335e-05, + "loss": 0.8042, + "step": 13548 + }, + { + "epoch": 0.9180161257537773, + "grad_norm": 6.127305030822754, + "learning_rate": 8.247929358614553e-05, + "loss": 0.5721, + "step": 13549 + }, + { + "epoch": 0.9180838810217494, + "grad_norm": 8.631043434143066, + "learning_rate": 8.247792456704772e-05, + "loss": 0.9929, + "step": 13550 + }, + { + "epoch": 0.9181516362897215, + "grad_norm": 7.284387111663818, + "learning_rate": 8.24765555479499e-05, + "loss": 0.9029, + "step": 13551 + }, + { + "epoch": 0.9182193915576936, + "grad_norm": 4.782154560089111, + "learning_rate": 8.247518652885208e-05, + "loss": 0.6233, + "step": 13552 + }, + { + "epoch": 0.9182871468256657, + "grad_norm": 7.067320346832275, + "learning_rate": 8.247381750975428e-05, + "loss": 0.8092, + "step": 13553 + }, + { + "epoch": 0.9183549020936378, + "grad_norm": 6.034327983856201, + "learning_rate": 8.247244849065646e-05, + "loss": 0.8534, + "step": 13554 + }, + { + "epoch": 0.9184226573616099, + "grad_norm": 7.13966178894043, + "learning_rate": 8.247107947155864e-05, + "loss": 0.8442, + "step": 13555 + }, + { + "epoch": 0.918490412629582, + "grad_norm": 6.099913120269775, + "learning_rate": 8.246971045246082e-05, + "loss": 0.862, + "step": 13556 + }, + { + "epoch": 0.9185581678975541, + "grad_norm": 5.149213790893555, + "learning_rate": 8.2468341433363e-05, + "loss": 0.6381, + "step": 13557 + }, + { + "epoch": 0.9186259231655262, + "grad_norm": 5.378545761108398, + "learning_rate": 8.246697241426519e-05, + "loss": 0.8651, + "step": 13558 + }, + { + "epoch": 0.9186936784334983, + "grad_norm": 5.342377185821533, + "learning_rate": 8.246560339516737e-05, + "loss": 0.7287, + "step": 13559 + }, + { + "epoch": 0.9187614337014703, + "grad_norm": 6.145232200622559, + "learning_rate": 8.246423437606955e-05, + "loss": 0.9639, + "step": 13560 + }, + { + "epoch": 0.9188291889694423, + "grad_norm": 5.195524215698242, + "learning_rate": 8.246286535697173e-05, + "loss": 0.5293, + "step": 13561 + }, + { + "epoch": 0.9188969442374144, + "grad_norm": 5.601246356964111, + "learning_rate": 8.246149633787393e-05, + "loss": 0.808, + "step": 13562 + }, + { + "epoch": 0.9189646995053865, + "grad_norm": 7.764710426330566, + "learning_rate": 8.246012731877611e-05, + "loss": 0.6616, + "step": 13563 + }, + { + "epoch": 0.9190324547733586, + "grad_norm": 5.822594165802002, + "learning_rate": 8.245875829967829e-05, + "loss": 0.6981, + "step": 13564 + }, + { + "epoch": 0.9191002100413307, + "grad_norm": 6.025305271148682, + "learning_rate": 8.245738928058047e-05, + "loss": 0.8169, + "step": 13565 + }, + { + "epoch": 0.9191679653093028, + "grad_norm": 5.495863437652588, + "learning_rate": 8.245602026148265e-05, + "loss": 0.9271, + "step": 13566 + }, + { + "epoch": 0.9192357205772749, + "grad_norm": 6.753162384033203, + "learning_rate": 8.245465124238484e-05, + "loss": 0.5892, + "step": 13567 + }, + { + "epoch": 0.919303475845247, + "grad_norm": 6.381758213043213, + "learning_rate": 8.245328222328702e-05, + "loss": 0.9818, + "step": 13568 + }, + { + "epoch": 0.9193712311132191, + "grad_norm": 6.070631980895996, + "learning_rate": 8.24519132041892e-05, + "loss": 0.7322, + "step": 13569 + }, + { + "epoch": 0.9194389863811911, + "grad_norm": 4.506857872009277, + "learning_rate": 8.245054418509138e-05, + "loss": 0.6529, + "step": 13570 + }, + { + "epoch": 0.9195067416491632, + "grad_norm": 6.378490924835205, + "learning_rate": 8.244917516599356e-05, + "loss": 0.8146, + "step": 13571 + }, + { + "epoch": 0.9195744969171353, + "grad_norm": 4.585328102111816, + "learning_rate": 8.244780614689576e-05, + "loss": 0.6215, + "step": 13572 + }, + { + "epoch": 0.9196422521851074, + "grad_norm": 6.161722660064697, + "learning_rate": 8.244643712779794e-05, + "loss": 0.7576, + "step": 13573 + }, + { + "epoch": 0.9197100074530795, + "grad_norm": 5.033822536468506, + "learning_rate": 8.244506810870012e-05, + "loss": 0.625, + "step": 13574 + }, + { + "epoch": 0.9197777627210516, + "grad_norm": 4.940533638000488, + "learning_rate": 8.24436990896023e-05, + "loss": 0.5779, + "step": 13575 + }, + { + "epoch": 0.9198455179890237, + "grad_norm": 5.566405296325684, + "learning_rate": 8.244233007050449e-05, + "loss": 0.7108, + "step": 13576 + }, + { + "epoch": 0.9199132732569957, + "grad_norm": 5.4210896492004395, + "learning_rate": 8.244096105140667e-05, + "loss": 0.6497, + "step": 13577 + }, + { + "epoch": 0.9199810285249678, + "grad_norm": 8.807047843933105, + "learning_rate": 8.243959203230885e-05, + "loss": 0.8228, + "step": 13578 + }, + { + "epoch": 0.9200487837929399, + "grad_norm": 6.093658447265625, + "learning_rate": 8.243822301321103e-05, + "loss": 0.99, + "step": 13579 + }, + { + "epoch": 0.920116539060912, + "grad_norm": 7.17921781539917, + "learning_rate": 8.243685399411321e-05, + "loss": 0.9162, + "step": 13580 + }, + { + "epoch": 0.920184294328884, + "grad_norm": 5.841926574707031, + "learning_rate": 8.243548497501541e-05, + "loss": 0.8073, + "step": 13581 + }, + { + "epoch": 0.9202520495968561, + "grad_norm": 6.103463649749756, + "learning_rate": 8.243411595591759e-05, + "loss": 0.7466, + "step": 13582 + }, + { + "epoch": 0.9203198048648282, + "grad_norm": 7.478387355804443, + "learning_rate": 8.243274693681977e-05, + "loss": 0.5682, + "step": 13583 + }, + { + "epoch": 0.9203875601328003, + "grad_norm": 7.954921245574951, + "learning_rate": 8.243137791772195e-05, + "loss": 0.7956, + "step": 13584 + }, + { + "epoch": 0.9204553154007724, + "grad_norm": 5.1390509605407715, + "learning_rate": 8.243000889862414e-05, + "loss": 0.7722, + "step": 13585 + }, + { + "epoch": 0.9205230706687445, + "grad_norm": 5.33015251159668, + "learning_rate": 8.242863987952632e-05, + "loss": 0.8534, + "step": 13586 + }, + { + "epoch": 0.9205908259367166, + "grad_norm": 6.494523525238037, + "learning_rate": 8.24272708604285e-05, + "loss": 0.7041, + "step": 13587 + }, + { + "epoch": 0.9206585812046887, + "grad_norm": 5.0413055419921875, + "learning_rate": 8.242590184133068e-05, + "loss": 0.8746, + "step": 13588 + }, + { + "epoch": 0.9207263364726608, + "grad_norm": 5.492376327514648, + "learning_rate": 8.242453282223286e-05, + "loss": 0.7865, + "step": 13589 + }, + { + "epoch": 0.9207940917406329, + "grad_norm": 4.8724870681762695, + "learning_rate": 8.242316380313506e-05, + "loss": 0.7237, + "step": 13590 + }, + { + "epoch": 0.920861847008605, + "grad_norm": 7.61802339553833, + "learning_rate": 8.242179478403724e-05, + "loss": 1.0021, + "step": 13591 + }, + { + "epoch": 0.920929602276577, + "grad_norm": 7.968650817871094, + "learning_rate": 8.242042576493942e-05, + "loss": 0.8771, + "step": 13592 + }, + { + "epoch": 0.920997357544549, + "grad_norm": 4.494787216186523, + "learning_rate": 8.24190567458416e-05, + "loss": 0.6646, + "step": 13593 + }, + { + "epoch": 0.9210651128125211, + "grad_norm": 7.044222831726074, + "learning_rate": 8.241768772674379e-05, + "loss": 0.9055, + "step": 13594 + }, + { + "epoch": 0.9211328680804932, + "grad_norm": 6.417403697967529, + "learning_rate": 8.241631870764597e-05, + "loss": 0.6975, + "step": 13595 + }, + { + "epoch": 0.9212006233484653, + "grad_norm": 7.259861469268799, + "learning_rate": 8.241494968854815e-05, + "loss": 0.736, + "step": 13596 + }, + { + "epoch": 0.9212683786164374, + "grad_norm": 7.667874813079834, + "learning_rate": 8.241358066945035e-05, + "loss": 0.741, + "step": 13597 + }, + { + "epoch": 0.9213361338844095, + "grad_norm": 5.993992805480957, + "learning_rate": 8.241221165035253e-05, + "loss": 0.74, + "step": 13598 + }, + { + "epoch": 0.9214038891523816, + "grad_norm": 6.172394275665283, + "learning_rate": 8.241084263125471e-05, + "loss": 0.6285, + "step": 13599 + }, + { + "epoch": 0.9214716444203537, + "grad_norm": 5.315210342407227, + "learning_rate": 8.24094736121569e-05, + "loss": 0.8465, + "step": 13600 + }, + { + "epoch": 0.9215393996883258, + "grad_norm": 7.3281683921813965, + "learning_rate": 8.240810459305908e-05, + "loss": 0.8761, + "step": 13601 + }, + { + "epoch": 0.9216071549562979, + "grad_norm": 4.546838760375977, + "learning_rate": 8.240673557396126e-05, + "loss": 0.5582, + "step": 13602 + }, + { + "epoch": 0.92167491022427, + "grad_norm": 10.452659606933594, + "learning_rate": 8.240536655486344e-05, + "loss": 0.7345, + "step": 13603 + }, + { + "epoch": 0.921742665492242, + "grad_norm": 5.254201412200928, + "learning_rate": 8.240399753576564e-05, + "loss": 0.6984, + "step": 13604 + }, + { + "epoch": 0.9218104207602141, + "grad_norm": 7.967467784881592, + "learning_rate": 8.240262851666782e-05, + "loss": 0.775, + "step": 13605 + }, + { + "epoch": 0.9218781760281862, + "grad_norm": 8.285964012145996, + "learning_rate": 8.240125949757e-05, + "loss": 0.7648, + "step": 13606 + }, + { + "epoch": 0.9219459312961583, + "grad_norm": 5.65543794631958, + "learning_rate": 8.239989047847218e-05, + "loss": 0.6109, + "step": 13607 + }, + { + "epoch": 0.9220136865641304, + "grad_norm": 5.535354137420654, + "learning_rate": 8.239852145937437e-05, + "loss": 0.5679, + "step": 13608 + }, + { + "epoch": 0.9220814418321025, + "grad_norm": 4.902824878692627, + "learning_rate": 8.239715244027655e-05, + "loss": 0.574, + "step": 13609 + }, + { + "epoch": 0.9221491971000745, + "grad_norm": 5.251827716827393, + "learning_rate": 8.239578342117873e-05, + "loss": 0.5183, + "step": 13610 + }, + { + "epoch": 0.9222169523680466, + "grad_norm": 4.334627151489258, + "learning_rate": 8.239441440208091e-05, + "loss": 0.6654, + "step": 13611 + }, + { + "epoch": 0.9222847076360187, + "grad_norm": 5.845222473144531, + "learning_rate": 8.239304538298309e-05, + "loss": 0.7711, + "step": 13612 + }, + { + "epoch": 0.9223524629039908, + "grad_norm": 4.581849575042725, + "learning_rate": 8.239167636388529e-05, + "loss": 0.6276, + "step": 13613 + }, + { + "epoch": 0.9224202181719628, + "grad_norm": 6.175132751464844, + "learning_rate": 8.239030734478747e-05, + "loss": 0.6509, + "step": 13614 + }, + { + "epoch": 0.9224879734399349, + "grad_norm": 5.571422100067139, + "learning_rate": 8.238893832568965e-05, + "loss": 0.874, + "step": 13615 + }, + { + "epoch": 0.922555728707907, + "grad_norm": 6.991871356964111, + "learning_rate": 8.238756930659183e-05, + "loss": 0.7676, + "step": 13616 + }, + { + "epoch": 0.9226234839758791, + "grad_norm": 7.442737102508545, + "learning_rate": 8.238620028749402e-05, + "loss": 0.9373, + "step": 13617 + }, + { + "epoch": 0.9226912392438512, + "grad_norm": 6.320655822753906, + "learning_rate": 8.23848312683962e-05, + "loss": 0.6271, + "step": 13618 + }, + { + "epoch": 0.9227589945118233, + "grad_norm": 4.894016265869141, + "learning_rate": 8.238346224929838e-05, + "loss": 0.7149, + "step": 13619 + }, + { + "epoch": 0.9228267497797954, + "grad_norm": 6.294023036956787, + "learning_rate": 8.238209323020056e-05, + "loss": 0.6531, + "step": 13620 + }, + { + "epoch": 0.9228945050477675, + "grad_norm": 8.631871223449707, + "learning_rate": 8.238072421110274e-05, + "loss": 0.6823, + "step": 13621 + }, + { + "epoch": 0.9229622603157396, + "grad_norm": 9.09997844696045, + "learning_rate": 8.237935519200494e-05, + "loss": 0.8513, + "step": 13622 + }, + { + "epoch": 0.9230300155837117, + "grad_norm": 5.959611892700195, + "learning_rate": 8.237798617290712e-05, + "loss": 0.8519, + "step": 13623 + }, + { + "epoch": 0.9230977708516838, + "grad_norm": 7.821328163146973, + "learning_rate": 8.23766171538093e-05, + "loss": 0.7974, + "step": 13624 + }, + { + "epoch": 0.9231655261196559, + "grad_norm": 5.780052185058594, + "learning_rate": 8.237524813471148e-05, + "loss": 0.8274, + "step": 13625 + }, + { + "epoch": 0.9232332813876278, + "grad_norm": 5.173895359039307, + "learning_rate": 8.237387911561366e-05, + "loss": 0.6296, + "step": 13626 + }, + { + "epoch": 0.9233010366555999, + "grad_norm": 6.263997554779053, + "learning_rate": 8.237251009651585e-05, + "loss": 0.8015, + "step": 13627 + }, + { + "epoch": 0.923368791923572, + "grad_norm": 5.063817501068115, + "learning_rate": 8.237114107741803e-05, + "loss": 0.8485, + "step": 13628 + }, + { + "epoch": 0.9234365471915441, + "grad_norm": 5.420182704925537, + "learning_rate": 8.236977205832021e-05, + "loss": 0.8133, + "step": 13629 + }, + { + "epoch": 0.9235043024595162, + "grad_norm": 6.21091890335083, + "learning_rate": 8.236840303922239e-05, + "loss": 0.9033, + "step": 13630 + }, + { + "epoch": 0.9235720577274883, + "grad_norm": 5.321242332458496, + "learning_rate": 8.236703402012459e-05, + "loss": 0.7809, + "step": 13631 + }, + { + "epoch": 0.9236398129954604, + "grad_norm": 4.922853469848633, + "learning_rate": 8.236566500102677e-05, + "loss": 0.6684, + "step": 13632 + }, + { + "epoch": 0.9237075682634325, + "grad_norm": 7.8503193855285645, + "learning_rate": 8.236429598192895e-05, + "loss": 0.9825, + "step": 13633 + }, + { + "epoch": 0.9237753235314046, + "grad_norm": 6.110293388366699, + "learning_rate": 8.236292696283113e-05, + "loss": 0.7034, + "step": 13634 + }, + { + "epoch": 0.9238430787993767, + "grad_norm": 7.469491481781006, + "learning_rate": 8.236155794373331e-05, + "loss": 0.9736, + "step": 13635 + }, + { + "epoch": 0.9239108340673488, + "grad_norm": 6.1995086669921875, + "learning_rate": 8.23601889246355e-05, + "loss": 0.8417, + "step": 13636 + }, + { + "epoch": 0.9239785893353208, + "grad_norm": 4.243618011474609, + "learning_rate": 8.235881990553768e-05, + "loss": 0.5993, + "step": 13637 + }, + { + "epoch": 0.9240463446032929, + "grad_norm": 5.490956783294678, + "learning_rate": 8.235745088643986e-05, + "loss": 0.7956, + "step": 13638 + }, + { + "epoch": 0.924114099871265, + "grad_norm": 6.069158554077148, + "learning_rate": 8.235608186734204e-05, + "loss": 0.6775, + "step": 13639 + }, + { + "epoch": 0.9241818551392371, + "grad_norm": 6.96622896194458, + "learning_rate": 8.235471284824424e-05, + "loss": 0.8268, + "step": 13640 + }, + { + "epoch": 0.9242496104072092, + "grad_norm": 8.088959693908691, + "learning_rate": 8.235334382914642e-05, + "loss": 0.9065, + "step": 13641 + }, + { + "epoch": 0.9243173656751812, + "grad_norm": 5.17006778717041, + "learning_rate": 8.23519748100486e-05, + "loss": 0.5922, + "step": 13642 + }, + { + "epoch": 0.9243851209431533, + "grad_norm": 5.012120246887207, + "learning_rate": 8.235060579095079e-05, + "loss": 0.6376, + "step": 13643 + }, + { + "epoch": 0.9244528762111254, + "grad_norm": 5.864466667175293, + "learning_rate": 8.234923677185297e-05, + "loss": 0.6989, + "step": 13644 + }, + { + "epoch": 0.9245206314790975, + "grad_norm": 5.791763782501221, + "learning_rate": 8.234786775275515e-05, + "loss": 0.818, + "step": 13645 + }, + { + "epoch": 0.9245883867470696, + "grad_norm": 5.13279390335083, + "learning_rate": 8.234649873365735e-05, + "loss": 0.6238, + "step": 13646 + }, + { + "epoch": 0.9246561420150416, + "grad_norm": 6.566293716430664, + "learning_rate": 8.234512971455953e-05, + "loss": 0.7528, + "step": 13647 + }, + { + "epoch": 0.9247238972830137, + "grad_norm": 4.981358528137207, + "learning_rate": 8.234376069546171e-05, + "loss": 0.713, + "step": 13648 + }, + { + "epoch": 0.9247916525509858, + "grad_norm": 5.388360500335693, + "learning_rate": 8.23423916763639e-05, + "loss": 0.8413, + "step": 13649 + }, + { + "epoch": 0.9248594078189579, + "grad_norm": 5.867083549499512, + "learning_rate": 8.234102265726608e-05, + "loss": 0.7376, + "step": 13650 + }, + { + "epoch": 0.92492716308693, + "grad_norm": 6.004108428955078, + "learning_rate": 8.233965363816826e-05, + "loss": 0.7179, + "step": 13651 + }, + { + "epoch": 0.9249949183549021, + "grad_norm": 5.956449031829834, + "learning_rate": 8.233828461907044e-05, + "loss": 0.856, + "step": 13652 + }, + { + "epoch": 0.9250626736228742, + "grad_norm": 4.9217209815979, + "learning_rate": 8.233691559997262e-05, + "loss": 0.7254, + "step": 13653 + }, + { + "epoch": 0.9251304288908463, + "grad_norm": 5.971928596496582, + "learning_rate": 8.233554658087482e-05, + "loss": 0.8707, + "step": 13654 + }, + { + "epoch": 0.9251981841588184, + "grad_norm": 5.525825500488281, + "learning_rate": 8.2334177561777e-05, + "loss": 0.6256, + "step": 13655 + }, + { + "epoch": 0.9252659394267905, + "grad_norm": 6.858259201049805, + "learning_rate": 8.233280854267918e-05, + "loss": 0.771, + "step": 13656 + }, + { + "epoch": 0.9253336946947626, + "grad_norm": 5.588975429534912, + "learning_rate": 8.233143952358136e-05, + "loss": 0.7937, + "step": 13657 + }, + { + "epoch": 0.9254014499627347, + "grad_norm": 5.949060916900635, + "learning_rate": 8.233007050448354e-05, + "loss": 0.7975, + "step": 13658 + }, + { + "epoch": 0.9254692052307066, + "grad_norm": 5.091200828552246, + "learning_rate": 8.232870148538573e-05, + "loss": 0.5729, + "step": 13659 + }, + { + "epoch": 0.9255369604986787, + "grad_norm": 7.370169639587402, + "learning_rate": 8.232733246628791e-05, + "loss": 0.9528, + "step": 13660 + }, + { + "epoch": 0.9256047157666508, + "grad_norm": 5.088458061218262, + "learning_rate": 8.232596344719009e-05, + "loss": 0.7491, + "step": 13661 + }, + { + "epoch": 0.9256724710346229, + "grad_norm": 6.5189433097839355, + "learning_rate": 8.232459442809227e-05, + "loss": 0.7068, + "step": 13662 + }, + { + "epoch": 0.925740226302595, + "grad_norm": 6.852166175842285, + "learning_rate": 8.232322540899447e-05, + "loss": 1.0309, + "step": 13663 + }, + { + "epoch": 0.9258079815705671, + "grad_norm": 6.470550060272217, + "learning_rate": 8.232185638989665e-05, + "loss": 0.8087, + "step": 13664 + }, + { + "epoch": 0.9258757368385392, + "grad_norm": 6.046988010406494, + "learning_rate": 8.232048737079883e-05, + "loss": 0.7183, + "step": 13665 + }, + { + "epoch": 0.9259434921065113, + "grad_norm": 5.661501884460449, + "learning_rate": 8.231911835170101e-05, + "loss": 0.806, + "step": 13666 + }, + { + "epoch": 0.9260112473744834, + "grad_norm": 7.144009113311768, + "learning_rate": 8.231774933260319e-05, + "loss": 0.8562, + "step": 13667 + }, + { + "epoch": 0.9260790026424555, + "grad_norm": 6.237162113189697, + "learning_rate": 8.231638031350538e-05, + "loss": 0.6964, + "step": 13668 + }, + { + "epoch": 0.9261467579104276, + "grad_norm": 10.49614429473877, + "learning_rate": 8.231501129440756e-05, + "loss": 0.8433, + "step": 13669 + }, + { + "epoch": 0.9262145131783996, + "grad_norm": 8.371081352233887, + "learning_rate": 8.231364227530974e-05, + "loss": 0.8938, + "step": 13670 + }, + { + "epoch": 0.9262822684463717, + "grad_norm": 5.304482936859131, + "learning_rate": 8.231227325621192e-05, + "loss": 0.8161, + "step": 13671 + }, + { + "epoch": 0.9263500237143438, + "grad_norm": 7.333506107330322, + "learning_rate": 8.231090423711412e-05, + "loss": 1.0648, + "step": 13672 + }, + { + "epoch": 0.9264177789823159, + "grad_norm": 6.7197065353393555, + "learning_rate": 8.23095352180163e-05, + "loss": 0.5901, + "step": 13673 + }, + { + "epoch": 0.926485534250288, + "grad_norm": 6.151739120483398, + "learning_rate": 8.230816619891848e-05, + "loss": 0.832, + "step": 13674 + }, + { + "epoch": 0.92655328951826, + "grad_norm": 5.662747859954834, + "learning_rate": 8.230679717982066e-05, + "loss": 0.8096, + "step": 13675 + }, + { + "epoch": 0.9266210447862321, + "grad_norm": 6.029799938201904, + "learning_rate": 8.230542816072284e-05, + "loss": 0.7201, + "step": 13676 + }, + { + "epoch": 0.9266888000542042, + "grad_norm": 4.9484686851501465, + "learning_rate": 8.230405914162503e-05, + "loss": 0.6618, + "step": 13677 + }, + { + "epoch": 0.9267565553221763, + "grad_norm": 6.27154016494751, + "learning_rate": 8.230269012252721e-05, + "loss": 0.9026, + "step": 13678 + }, + { + "epoch": 0.9268243105901484, + "grad_norm": 7.863352298736572, + "learning_rate": 8.230132110342939e-05, + "loss": 0.6036, + "step": 13679 + }, + { + "epoch": 0.9268920658581205, + "grad_norm": 6.766676425933838, + "learning_rate": 8.229995208433157e-05, + "loss": 0.9587, + "step": 13680 + }, + { + "epoch": 0.9269598211260925, + "grad_norm": 7.4474711418151855, + "learning_rate": 8.229858306523375e-05, + "loss": 0.8504, + "step": 13681 + }, + { + "epoch": 0.9270275763940646, + "grad_norm": 5.815866470336914, + "learning_rate": 8.229721404613595e-05, + "loss": 0.6245, + "step": 13682 + }, + { + "epoch": 0.9270953316620367, + "grad_norm": 7.574525833129883, + "learning_rate": 8.229584502703813e-05, + "loss": 0.6652, + "step": 13683 + }, + { + "epoch": 0.9271630869300088, + "grad_norm": 8.183697700500488, + "learning_rate": 8.229447600794031e-05, + "loss": 0.7956, + "step": 13684 + }, + { + "epoch": 0.9272308421979809, + "grad_norm": 5.091358184814453, + "learning_rate": 8.229310698884249e-05, + "loss": 0.6387, + "step": 13685 + }, + { + "epoch": 0.927298597465953, + "grad_norm": 5.971275806427002, + "learning_rate": 8.229173796974468e-05, + "loss": 1.0076, + "step": 13686 + }, + { + "epoch": 0.9273663527339251, + "grad_norm": 5.206945896148682, + "learning_rate": 8.229036895064686e-05, + "loss": 0.615, + "step": 13687 + }, + { + "epoch": 0.9274341080018972, + "grad_norm": 6.2954864501953125, + "learning_rate": 8.228899993154904e-05, + "loss": 0.7028, + "step": 13688 + }, + { + "epoch": 0.9275018632698693, + "grad_norm": 7.77675724029541, + "learning_rate": 8.228763091245124e-05, + "loss": 0.8924, + "step": 13689 + }, + { + "epoch": 0.9275696185378414, + "grad_norm": 7.986929893493652, + "learning_rate": 8.228626189335342e-05, + "loss": 0.9092, + "step": 13690 + }, + { + "epoch": 0.9276373738058133, + "grad_norm": 6.880285739898682, + "learning_rate": 8.22848928742556e-05, + "loss": 0.8328, + "step": 13691 + }, + { + "epoch": 0.9277051290737854, + "grad_norm": 5.309557914733887, + "learning_rate": 8.228352385515779e-05, + "loss": 0.6386, + "step": 13692 + }, + { + "epoch": 0.9277728843417575, + "grad_norm": 5.744555473327637, + "learning_rate": 8.228215483605997e-05, + "loss": 0.8106, + "step": 13693 + }, + { + "epoch": 0.9278406396097296, + "grad_norm": 10.664202690124512, + "learning_rate": 8.228078581696215e-05, + "loss": 0.6584, + "step": 13694 + }, + { + "epoch": 0.9279083948777017, + "grad_norm": 5.7767791748046875, + "learning_rate": 8.227941679786435e-05, + "loss": 0.7672, + "step": 13695 + }, + { + "epoch": 0.9279761501456738, + "grad_norm": 4.621129035949707, + "learning_rate": 8.227804777876653e-05, + "loss": 0.8555, + "step": 13696 + }, + { + "epoch": 0.9280439054136459, + "grad_norm": 5.1587018966674805, + "learning_rate": 8.22766787596687e-05, + "loss": 0.526, + "step": 13697 + }, + { + "epoch": 0.928111660681618, + "grad_norm": 6.041534900665283, + "learning_rate": 8.227530974057089e-05, + "loss": 0.6588, + "step": 13698 + }, + { + "epoch": 0.9281794159495901, + "grad_norm": 5.863411903381348, + "learning_rate": 8.227394072147307e-05, + "loss": 0.821, + "step": 13699 + }, + { + "epoch": 0.9282471712175622, + "grad_norm": 5.905420780181885, + "learning_rate": 8.227257170237526e-05, + "loss": 0.9116, + "step": 13700 + }, + { + "epoch": 0.9283149264855343, + "grad_norm": 8.96728229522705, + "learning_rate": 8.227120268327744e-05, + "loss": 0.7997, + "step": 13701 + }, + { + "epoch": 0.9283826817535064, + "grad_norm": 7.430652141571045, + "learning_rate": 8.226983366417962e-05, + "loss": 0.8505, + "step": 13702 + }, + { + "epoch": 0.9284504370214784, + "grad_norm": 6.5419921875, + "learning_rate": 8.22684646450818e-05, + "loss": 1.1061, + "step": 13703 + }, + { + "epoch": 0.9285181922894505, + "grad_norm": 5.459079265594482, + "learning_rate": 8.226709562598398e-05, + "loss": 0.7481, + "step": 13704 + }, + { + "epoch": 0.9285859475574226, + "grad_norm": 6.421117782592773, + "learning_rate": 8.226572660688618e-05, + "loss": 0.7031, + "step": 13705 + }, + { + "epoch": 0.9286537028253947, + "grad_norm": 5.486372470855713, + "learning_rate": 8.226435758778836e-05, + "loss": 0.7731, + "step": 13706 + }, + { + "epoch": 0.9287214580933668, + "grad_norm": 8.439654350280762, + "learning_rate": 8.226298856869054e-05, + "loss": 0.8965, + "step": 13707 + }, + { + "epoch": 0.9287892133613388, + "grad_norm": 5.904208183288574, + "learning_rate": 8.226161954959272e-05, + "loss": 0.8596, + "step": 13708 + }, + { + "epoch": 0.9288569686293109, + "grad_norm": 7.829963684082031, + "learning_rate": 8.226025053049491e-05, + "loss": 0.8663, + "step": 13709 + }, + { + "epoch": 0.928924723897283, + "grad_norm": 6.749328136444092, + "learning_rate": 8.225888151139709e-05, + "loss": 0.6586, + "step": 13710 + }, + { + "epoch": 0.9289924791652551, + "grad_norm": 6.042569637298584, + "learning_rate": 8.225751249229927e-05, + "loss": 0.6156, + "step": 13711 + }, + { + "epoch": 0.9290602344332272, + "grad_norm": 4.903025150299072, + "learning_rate": 8.225614347320145e-05, + "loss": 0.7658, + "step": 13712 + }, + { + "epoch": 0.9291279897011993, + "grad_norm": 5.892563343048096, + "learning_rate": 8.225477445410363e-05, + "loss": 0.8609, + "step": 13713 + }, + { + "epoch": 0.9291957449691713, + "grad_norm": 5.452030658721924, + "learning_rate": 8.225340543500583e-05, + "loss": 1.0457, + "step": 13714 + }, + { + "epoch": 0.9292635002371434, + "grad_norm": 6.059377193450928, + "learning_rate": 8.2252036415908e-05, + "loss": 0.8657, + "step": 13715 + }, + { + "epoch": 0.9293312555051155, + "grad_norm": 5.9446210861206055, + "learning_rate": 8.225066739681019e-05, + "loss": 1.0325, + "step": 13716 + }, + { + "epoch": 0.9293990107730876, + "grad_norm": 5.6474690437316895, + "learning_rate": 8.224929837771237e-05, + "loss": 0.6533, + "step": 13717 + }, + { + "epoch": 0.9294667660410597, + "grad_norm": 6.072984218597412, + "learning_rate": 8.224792935861456e-05, + "loss": 0.8738, + "step": 13718 + }, + { + "epoch": 0.9295345213090318, + "grad_norm": 7.003471851348877, + "learning_rate": 8.224656033951674e-05, + "loss": 0.7797, + "step": 13719 + }, + { + "epoch": 0.9296022765770039, + "grad_norm": 6.7634148597717285, + "learning_rate": 8.224519132041892e-05, + "loss": 0.9885, + "step": 13720 + }, + { + "epoch": 0.929670031844976, + "grad_norm": 4.705183506011963, + "learning_rate": 8.22438223013211e-05, + "loss": 0.8115, + "step": 13721 + }, + { + "epoch": 0.9297377871129481, + "grad_norm": 5.442021369934082, + "learning_rate": 8.224245328222328e-05, + "loss": 0.9005, + "step": 13722 + }, + { + "epoch": 0.9298055423809202, + "grad_norm": 5.542171001434326, + "learning_rate": 8.224108426312548e-05, + "loss": 0.6873, + "step": 13723 + }, + { + "epoch": 0.9298732976488921, + "grad_norm": 4.973824977874756, + "learning_rate": 8.223971524402766e-05, + "loss": 0.8902, + "step": 13724 + }, + { + "epoch": 0.9299410529168642, + "grad_norm": 6.488674640655518, + "learning_rate": 8.223834622492984e-05, + "loss": 0.6895, + "step": 13725 + }, + { + "epoch": 0.9300088081848363, + "grad_norm": 6.3187150955200195, + "learning_rate": 8.223697720583202e-05, + "loss": 0.591, + "step": 13726 + }, + { + "epoch": 0.9300765634528084, + "grad_norm": 4.76509952545166, + "learning_rate": 8.223560818673421e-05, + "loss": 0.5401, + "step": 13727 + }, + { + "epoch": 0.9301443187207805, + "grad_norm": 6.474403381347656, + "learning_rate": 8.223423916763639e-05, + "loss": 0.788, + "step": 13728 + }, + { + "epoch": 0.9302120739887526, + "grad_norm": 5.295207977294922, + "learning_rate": 8.223287014853857e-05, + "loss": 0.8156, + "step": 13729 + }, + { + "epoch": 0.9302798292567247, + "grad_norm": 4.847303867340088, + "learning_rate": 8.223150112944075e-05, + "loss": 0.7142, + "step": 13730 + }, + { + "epoch": 0.9303475845246968, + "grad_norm": 7.824878692626953, + "learning_rate": 8.223013211034293e-05, + "loss": 0.6943, + "step": 13731 + }, + { + "epoch": 0.9304153397926689, + "grad_norm": 5.352818965911865, + "learning_rate": 8.222876309124513e-05, + "loss": 0.6813, + "step": 13732 + }, + { + "epoch": 0.930483095060641, + "grad_norm": 5.922751426696777, + "learning_rate": 8.222739407214731e-05, + "loss": 0.6284, + "step": 13733 + }, + { + "epoch": 0.9305508503286131, + "grad_norm": 6.715836048126221, + "learning_rate": 8.222602505304949e-05, + "loss": 0.7602, + "step": 13734 + }, + { + "epoch": 0.9306186055965852, + "grad_norm": 5.062824726104736, + "learning_rate": 8.222465603395168e-05, + "loss": 0.6008, + "step": 13735 + }, + { + "epoch": 0.9306863608645572, + "grad_norm": 5.607885837554932, + "learning_rate": 8.222328701485386e-05, + "loss": 0.7161, + "step": 13736 + }, + { + "epoch": 0.9307541161325293, + "grad_norm": 6.048231601715088, + "learning_rate": 8.222191799575604e-05, + "loss": 0.6162, + "step": 13737 + }, + { + "epoch": 0.9308218714005014, + "grad_norm": 6.022711277008057, + "learning_rate": 8.222054897665824e-05, + "loss": 0.7804, + "step": 13738 + }, + { + "epoch": 0.9308896266684735, + "grad_norm": 8.368260383605957, + "learning_rate": 8.221917995756042e-05, + "loss": 0.8318, + "step": 13739 + }, + { + "epoch": 0.9309573819364455, + "grad_norm": 4.99493932723999, + "learning_rate": 8.22178109384626e-05, + "loss": 0.6902, + "step": 13740 + }, + { + "epoch": 0.9310251372044176, + "grad_norm": 5.73579740524292, + "learning_rate": 8.221644191936479e-05, + "loss": 0.6253, + "step": 13741 + }, + { + "epoch": 0.9310928924723897, + "grad_norm": 6.078760623931885, + "learning_rate": 8.221507290026697e-05, + "loss": 0.9188, + "step": 13742 + }, + { + "epoch": 0.9311606477403618, + "grad_norm": 5.923210620880127, + "learning_rate": 8.221370388116915e-05, + "loss": 0.8844, + "step": 13743 + }, + { + "epoch": 0.9312284030083339, + "grad_norm": 4.4179229736328125, + "learning_rate": 8.221233486207133e-05, + "loss": 0.6355, + "step": 13744 + }, + { + "epoch": 0.931296158276306, + "grad_norm": 5.3702311515808105, + "learning_rate": 8.221096584297351e-05, + "loss": 0.8017, + "step": 13745 + }, + { + "epoch": 0.931363913544278, + "grad_norm": 7.519911766052246, + "learning_rate": 8.22095968238757e-05, + "loss": 0.8302, + "step": 13746 + }, + { + "epoch": 0.9314316688122501, + "grad_norm": 9.204391479492188, + "learning_rate": 8.220822780477789e-05, + "loss": 0.8657, + "step": 13747 + }, + { + "epoch": 0.9314994240802222, + "grad_norm": 6.563374042510986, + "learning_rate": 8.220685878568007e-05, + "loss": 0.7257, + "step": 13748 + }, + { + "epoch": 0.9315671793481943, + "grad_norm": 6.106207370758057, + "learning_rate": 8.220548976658225e-05, + "loss": 0.836, + "step": 13749 + }, + { + "epoch": 0.9316349346161664, + "grad_norm": 8.019493103027344, + "learning_rate": 8.220412074748444e-05, + "loss": 0.7405, + "step": 13750 + }, + { + "epoch": 0.9317026898841385, + "grad_norm": 5.366190433502197, + "learning_rate": 8.220275172838662e-05, + "loss": 0.7408, + "step": 13751 + }, + { + "epoch": 0.9317704451521106, + "grad_norm": 3.966377019882202, + "learning_rate": 8.22013827092888e-05, + "loss": 0.5997, + "step": 13752 + }, + { + "epoch": 0.9318382004200827, + "grad_norm": 8.112096786499023, + "learning_rate": 8.220001369019098e-05, + "loss": 1.1498, + "step": 13753 + }, + { + "epoch": 0.9319059556880548, + "grad_norm": 7.886656284332275, + "learning_rate": 8.219864467109316e-05, + "loss": 0.8555, + "step": 13754 + }, + { + "epoch": 0.9319737109560269, + "grad_norm": 5.4407219886779785, + "learning_rate": 8.219727565199536e-05, + "loss": 0.8506, + "step": 13755 + }, + { + "epoch": 0.932041466223999, + "grad_norm": 6.584644794464111, + "learning_rate": 8.219590663289754e-05, + "loss": 0.8408, + "step": 13756 + }, + { + "epoch": 0.932109221491971, + "grad_norm": 5.497494697570801, + "learning_rate": 8.219453761379972e-05, + "loss": 0.6424, + "step": 13757 + }, + { + "epoch": 0.932176976759943, + "grad_norm": 5.6908698081970215, + "learning_rate": 8.21931685947019e-05, + "loss": 0.6629, + "step": 13758 + }, + { + "epoch": 0.9322447320279151, + "grad_norm": 5.446488380432129, + "learning_rate": 8.219179957560408e-05, + "loss": 0.6875, + "step": 13759 + }, + { + "epoch": 0.9323124872958872, + "grad_norm": 5.45862340927124, + "learning_rate": 8.219043055650627e-05, + "loss": 0.6987, + "step": 13760 + }, + { + "epoch": 0.9323802425638593, + "grad_norm": 5.105297565460205, + "learning_rate": 8.218906153740845e-05, + "loss": 0.5886, + "step": 13761 + }, + { + "epoch": 0.9324479978318314, + "grad_norm": 5.2332892417907715, + "learning_rate": 8.218769251831063e-05, + "loss": 0.7667, + "step": 13762 + }, + { + "epoch": 0.9325157530998035, + "grad_norm": 5.849374294281006, + "learning_rate": 8.218632349921281e-05, + "loss": 0.7964, + "step": 13763 + }, + { + "epoch": 0.9325835083677756, + "grad_norm": 5.5074872970581055, + "learning_rate": 8.2184954480115e-05, + "loss": 0.8375, + "step": 13764 + }, + { + "epoch": 0.9326512636357477, + "grad_norm": 6.26788854598999, + "learning_rate": 8.218358546101719e-05, + "loss": 0.6793, + "step": 13765 + }, + { + "epoch": 0.9327190189037198, + "grad_norm": 7.429625988006592, + "learning_rate": 8.218221644191937e-05, + "loss": 0.8389, + "step": 13766 + }, + { + "epoch": 0.9327867741716919, + "grad_norm": 9.86705207824707, + "learning_rate": 8.218084742282155e-05, + "loss": 0.9663, + "step": 13767 + }, + { + "epoch": 0.932854529439664, + "grad_norm": 7.741724491119385, + "learning_rate": 8.217947840372373e-05, + "loss": 0.8483, + "step": 13768 + }, + { + "epoch": 0.932922284707636, + "grad_norm": 5.413618087768555, + "learning_rate": 8.217810938462592e-05, + "loss": 0.6838, + "step": 13769 + }, + { + "epoch": 0.9329900399756081, + "grad_norm": 5.695152282714844, + "learning_rate": 8.21767403655281e-05, + "loss": 0.8978, + "step": 13770 + }, + { + "epoch": 0.9330577952435802, + "grad_norm": 6.387299537658691, + "learning_rate": 8.217537134643028e-05, + "loss": 0.5846, + "step": 13771 + }, + { + "epoch": 0.9331255505115523, + "grad_norm": 4.797947883605957, + "learning_rate": 8.217400232733246e-05, + "loss": 0.617, + "step": 13772 + }, + { + "epoch": 0.9331933057795243, + "grad_norm": 5.718557357788086, + "learning_rate": 8.217263330823466e-05, + "loss": 0.7501, + "step": 13773 + }, + { + "epoch": 0.9332610610474964, + "grad_norm": 6.03369665145874, + "learning_rate": 8.217126428913684e-05, + "loss": 0.5945, + "step": 13774 + }, + { + "epoch": 0.9333288163154685, + "grad_norm": 4.953873634338379, + "learning_rate": 8.216989527003902e-05, + "loss": 0.6359, + "step": 13775 + }, + { + "epoch": 0.9333965715834406, + "grad_norm": 6.724967956542969, + "learning_rate": 8.21685262509412e-05, + "loss": 0.788, + "step": 13776 + }, + { + "epoch": 0.9334643268514127, + "grad_norm": 5.307444095611572, + "learning_rate": 8.216715723184338e-05, + "loss": 0.8166, + "step": 13777 + }, + { + "epoch": 0.9335320821193848, + "grad_norm": 7.63820219039917, + "learning_rate": 8.216578821274557e-05, + "loss": 1.0157, + "step": 13778 + }, + { + "epoch": 0.9335998373873569, + "grad_norm": 5.278408527374268, + "learning_rate": 8.216441919364775e-05, + "loss": 0.6433, + "step": 13779 + }, + { + "epoch": 0.933667592655329, + "grad_norm": 8.680825233459473, + "learning_rate": 8.216305017454993e-05, + "loss": 0.84, + "step": 13780 + }, + { + "epoch": 0.933735347923301, + "grad_norm": 5.572779655456543, + "learning_rate": 8.216168115545213e-05, + "loss": 0.783, + "step": 13781 + }, + { + "epoch": 0.9338031031912731, + "grad_norm": 9.58791732788086, + "learning_rate": 8.21603121363543e-05, + "loss": 1.0121, + "step": 13782 + }, + { + "epoch": 0.9338708584592452, + "grad_norm": 6.080748081207275, + "learning_rate": 8.215894311725649e-05, + "loss": 0.6494, + "step": 13783 + }, + { + "epoch": 0.9339386137272173, + "grad_norm": 6.609582901000977, + "learning_rate": 8.215757409815868e-05, + "loss": 0.7456, + "step": 13784 + }, + { + "epoch": 0.9340063689951894, + "grad_norm": 7.191807746887207, + "learning_rate": 8.215620507906086e-05, + "loss": 0.7427, + "step": 13785 + }, + { + "epoch": 0.9340741242631615, + "grad_norm": 6.619680404663086, + "learning_rate": 8.215483605996304e-05, + "loss": 0.8251, + "step": 13786 + }, + { + "epoch": 0.9341418795311336, + "grad_norm": 6.401364803314209, + "learning_rate": 8.215346704086524e-05, + "loss": 0.6732, + "step": 13787 + }, + { + "epoch": 0.9342096347991057, + "grad_norm": 4.641686916351318, + "learning_rate": 8.215209802176742e-05, + "loss": 0.5844, + "step": 13788 + }, + { + "epoch": 0.9342773900670777, + "grad_norm": 5.998747825622559, + "learning_rate": 8.21507290026696e-05, + "loss": 0.9057, + "step": 13789 + }, + { + "epoch": 0.9343451453350498, + "grad_norm": 6.920633316040039, + "learning_rate": 8.214935998357178e-05, + "loss": 1.0017, + "step": 13790 + }, + { + "epoch": 0.9344129006030218, + "grad_norm": 5.0339460372924805, + "learning_rate": 8.214799096447396e-05, + "loss": 0.7182, + "step": 13791 + }, + { + "epoch": 0.9344806558709939, + "grad_norm": 6.2433762550354, + "learning_rate": 8.214662194537615e-05, + "loss": 0.5608, + "step": 13792 + }, + { + "epoch": 0.934548411138966, + "grad_norm": 4.9427056312561035, + "learning_rate": 8.214525292627833e-05, + "loss": 0.6635, + "step": 13793 + }, + { + "epoch": 0.9346161664069381, + "grad_norm": 5.293502330780029, + "learning_rate": 8.214388390718051e-05, + "loss": 0.6524, + "step": 13794 + }, + { + "epoch": 0.9346839216749102, + "grad_norm": 5.291229724884033, + "learning_rate": 8.214251488808269e-05, + "loss": 0.8088, + "step": 13795 + }, + { + "epoch": 0.9347516769428823, + "grad_norm": 6.6330389976501465, + "learning_rate": 8.214114586898489e-05, + "loss": 0.83, + "step": 13796 + }, + { + "epoch": 0.9348194322108544, + "grad_norm": 4.883660316467285, + "learning_rate": 8.213977684988707e-05, + "loss": 0.849, + "step": 13797 + }, + { + "epoch": 0.9348871874788265, + "grad_norm": 9.394811630249023, + "learning_rate": 8.213840783078925e-05, + "loss": 1.1274, + "step": 13798 + }, + { + "epoch": 0.9349549427467986, + "grad_norm": 7.191971778869629, + "learning_rate": 8.213703881169143e-05, + "loss": 1.022, + "step": 13799 + }, + { + "epoch": 0.9350226980147707, + "grad_norm": 5.284100532531738, + "learning_rate": 8.21356697925936e-05, + "loss": 0.7243, + "step": 13800 + }, + { + "epoch": 0.9350904532827428, + "grad_norm": 4.5223469734191895, + "learning_rate": 8.21343007734958e-05, + "loss": 0.7437, + "step": 13801 + }, + { + "epoch": 0.9351582085507149, + "grad_norm": 9.137798309326172, + "learning_rate": 8.213293175439798e-05, + "loss": 0.8741, + "step": 13802 + }, + { + "epoch": 0.935225963818687, + "grad_norm": 5.241482257843018, + "learning_rate": 8.213156273530016e-05, + "loss": 0.7517, + "step": 13803 + }, + { + "epoch": 0.935293719086659, + "grad_norm": 6.769075870513916, + "learning_rate": 8.213019371620234e-05, + "loss": 0.9897, + "step": 13804 + }, + { + "epoch": 0.935361474354631, + "grad_norm": 5.048720836639404, + "learning_rate": 8.212882469710454e-05, + "loss": 0.7311, + "step": 13805 + }, + { + "epoch": 0.9354292296226031, + "grad_norm": 5.8939385414123535, + "learning_rate": 8.212745567800672e-05, + "loss": 0.7927, + "step": 13806 + }, + { + "epoch": 0.9354969848905752, + "grad_norm": 6.683951377868652, + "learning_rate": 8.21260866589089e-05, + "loss": 0.7332, + "step": 13807 + }, + { + "epoch": 0.9355647401585473, + "grad_norm": 6.631941318511963, + "learning_rate": 8.212471763981108e-05, + "loss": 0.8927, + "step": 13808 + }, + { + "epoch": 0.9356324954265194, + "grad_norm": 6.161096096038818, + "learning_rate": 8.212334862071326e-05, + "loss": 0.9347, + "step": 13809 + }, + { + "epoch": 0.9357002506944915, + "grad_norm": 9.04511547088623, + "learning_rate": 8.212197960161545e-05, + "loss": 0.7569, + "step": 13810 + }, + { + "epoch": 0.9357680059624636, + "grad_norm": 6.299793720245361, + "learning_rate": 8.212061058251763e-05, + "loss": 0.8722, + "step": 13811 + }, + { + "epoch": 0.9358357612304357, + "grad_norm": 6.218029975891113, + "learning_rate": 8.211924156341981e-05, + "loss": 0.4517, + "step": 13812 + }, + { + "epoch": 0.9359035164984078, + "grad_norm": 6.690893650054932, + "learning_rate": 8.211787254432199e-05, + "loss": 0.7149, + "step": 13813 + }, + { + "epoch": 0.9359712717663798, + "grad_norm": 5.8090009689331055, + "learning_rate": 8.211650352522417e-05, + "loss": 0.7553, + "step": 13814 + }, + { + "epoch": 0.9360390270343519, + "grad_norm": 6.034711837768555, + "learning_rate": 8.211513450612637e-05, + "loss": 0.7217, + "step": 13815 + }, + { + "epoch": 0.936106782302324, + "grad_norm": 5.093395233154297, + "learning_rate": 8.211376548702855e-05, + "loss": 0.9626, + "step": 13816 + }, + { + "epoch": 0.9361745375702961, + "grad_norm": 7.061664581298828, + "learning_rate": 8.211239646793073e-05, + "loss": 1.0739, + "step": 13817 + }, + { + "epoch": 0.9362422928382682, + "grad_norm": 5.97583532333374, + "learning_rate": 8.211102744883291e-05, + "loss": 1.0293, + "step": 13818 + }, + { + "epoch": 0.9363100481062403, + "grad_norm": 6.2469282150268555, + "learning_rate": 8.21096584297351e-05, + "loss": 0.8048, + "step": 13819 + }, + { + "epoch": 0.9363778033742124, + "grad_norm": 6.801420211791992, + "learning_rate": 8.210828941063728e-05, + "loss": 1.1723, + "step": 13820 + }, + { + "epoch": 0.9364455586421845, + "grad_norm": 5.765828609466553, + "learning_rate": 8.210692039153946e-05, + "loss": 0.8612, + "step": 13821 + }, + { + "epoch": 0.9365133139101565, + "grad_norm": 5.4772233963012695, + "learning_rate": 8.210555137244164e-05, + "loss": 0.7415, + "step": 13822 + }, + { + "epoch": 0.9365810691781286, + "grad_norm": 5.3950042724609375, + "learning_rate": 8.210418235334382e-05, + "loss": 0.623, + "step": 13823 + }, + { + "epoch": 0.9366488244461006, + "grad_norm": 6.950892925262451, + "learning_rate": 8.210281333424602e-05, + "loss": 0.9234, + "step": 13824 + }, + { + "epoch": 0.9367165797140727, + "grad_norm": 6.8263044357299805, + "learning_rate": 8.21014443151482e-05, + "loss": 0.6791, + "step": 13825 + }, + { + "epoch": 0.9367843349820448, + "grad_norm": 6.001094818115234, + "learning_rate": 8.210007529605038e-05, + "loss": 0.8851, + "step": 13826 + }, + { + "epoch": 0.9368520902500169, + "grad_norm": 5.480862617492676, + "learning_rate": 8.209870627695256e-05, + "loss": 0.6752, + "step": 13827 + }, + { + "epoch": 0.936919845517989, + "grad_norm": 8.880070686340332, + "learning_rate": 8.209733725785475e-05, + "loss": 0.6895, + "step": 13828 + }, + { + "epoch": 0.9369876007859611, + "grad_norm": 5.244697570800781, + "learning_rate": 8.209596823875693e-05, + "loss": 0.7239, + "step": 13829 + }, + { + "epoch": 0.9370553560539332, + "grad_norm": 5.342809677124023, + "learning_rate": 8.209459921965911e-05, + "loss": 0.9466, + "step": 13830 + }, + { + "epoch": 0.9371231113219053, + "grad_norm": 5.547102928161621, + "learning_rate": 8.20932302005613e-05, + "loss": 0.668, + "step": 13831 + }, + { + "epoch": 0.9371908665898774, + "grad_norm": 5.530054092407227, + "learning_rate": 8.209186118146349e-05, + "loss": 0.7622, + "step": 13832 + }, + { + "epoch": 0.9372586218578495, + "grad_norm": 6.689416885375977, + "learning_rate": 8.209049216236567e-05, + "loss": 0.7291, + "step": 13833 + }, + { + "epoch": 0.9373263771258216, + "grad_norm": 5.052443981170654, + "learning_rate": 8.208912314326786e-05, + "loss": 0.7491, + "step": 13834 + }, + { + "epoch": 0.9373941323937937, + "grad_norm": 8.584953308105469, + "learning_rate": 8.208775412417004e-05, + "loss": 0.867, + "step": 13835 + }, + { + "epoch": 0.9374618876617657, + "grad_norm": 6.425045967102051, + "learning_rate": 8.208638510507222e-05, + "loss": 0.6844, + "step": 13836 + }, + { + "epoch": 0.9375296429297378, + "grad_norm": 7.346054553985596, + "learning_rate": 8.20850160859744e-05, + "loss": 0.7747, + "step": 13837 + }, + { + "epoch": 0.9375973981977098, + "grad_norm": 5.191468715667725, + "learning_rate": 8.20836470668766e-05, + "loss": 0.8087, + "step": 13838 + }, + { + "epoch": 0.9376651534656819, + "grad_norm": 5.554154396057129, + "learning_rate": 8.208227804777878e-05, + "loss": 0.6445, + "step": 13839 + }, + { + "epoch": 0.937732908733654, + "grad_norm": 6.577019214630127, + "learning_rate": 8.208090902868096e-05, + "loss": 0.9777, + "step": 13840 + }, + { + "epoch": 0.9378006640016261, + "grad_norm": 5.5466742515563965, + "learning_rate": 8.207954000958314e-05, + "loss": 0.7303, + "step": 13841 + }, + { + "epoch": 0.9378684192695982, + "grad_norm": 6.088878154754639, + "learning_rate": 8.207817099048533e-05, + "loss": 0.8181, + "step": 13842 + }, + { + "epoch": 0.9379361745375703, + "grad_norm": 6.197640895843506, + "learning_rate": 8.207680197138751e-05, + "loss": 0.7192, + "step": 13843 + }, + { + "epoch": 0.9380039298055424, + "grad_norm": 4.92954158782959, + "learning_rate": 8.207543295228969e-05, + "loss": 0.6276, + "step": 13844 + }, + { + "epoch": 0.9380716850735145, + "grad_norm": 5.180205345153809, + "learning_rate": 8.207406393319187e-05, + "loss": 0.783, + "step": 13845 + }, + { + "epoch": 0.9381394403414866, + "grad_norm": 5.946224212646484, + "learning_rate": 8.207269491409405e-05, + "loss": 0.775, + "step": 13846 + }, + { + "epoch": 0.9382071956094586, + "grad_norm": 5.4234490394592285, + "learning_rate": 8.207132589499625e-05, + "loss": 0.6428, + "step": 13847 + }, + { + "epoch": 0.9382749508774307, + "grad_norm": 4.664503574371338, + "learning_rate": 8.206995687589843e-05, + "loss": 0.7846, + "step": 13848 + }, + { + "epoch": 0.9383427061454028, + "grad_norm": 5.197528839111328, + "learning_rate": 8.20685878568006e-05, + "loss": 0.7588, + "step": 13849 + }, + { + "epoch": 0.9384104614133749, + "grad_norm": 6.116962909698486, + "learning_rate": 8.206721883770279e-05, + "loss": 0.9564, + "step": 13850 + }, + { + "epoch": 0.938478216681347, + "grad_norm": 5.113432884216309, + "learning_rate": 8.206584981860498e-05, + "loss": 0.7534, + "step": 13851 + }, + { + "epoch": 0.9385459719493191, + "grad_norm": 6.907203674316406, + "learning_rate": 8.206448079950716e-05, + "loss": 0.9213, + "step": 13852 + }, + { + "epoch": 0.9386137272172912, + "grad_norm": 7.037908554077148, + "learning_rate": 8.206311178040934e-05, + "loss": 0.7943, + "step": 13853 + }, + { + "epoch": 0.9386814824852632, + "grad_norm": 6.385385036468506, + "learning_rate": 8.206174276131152e-05, + "loss": 0.7192, + "step": 13854 + }, + { + "epoch": 0.9387492377532353, + "grad_norm": 6.877847194671631, + "learning_rate": 8.20603737422137e-05, + "loss": 1.0433, + "step": 13855 + }, + { + "epoch": 0.9388169930212074, + "grad_norm": 6.244350910186768, + "learning_rate": 8.20590047231159e-05, + "loss": 0.7018, + "step": 13856 + }, + { + "epoch": 0.9388847482891794, + "grad_norm": 5.529423236846924, + "learning_rate": 8.205763570401808e-05, + "loss": 0.825, + "step": 13857 + }, + { + "epoch": 0.9389525035571515, + "grad_norm": 5.644784450531006, + "learning_rate": 8.205626668492026e-05, + "loss": 0.6996, + "step": 13858 + }, + { + "epoch": 0.9390202588251236, + "grad_norm": 6.088039875030518, + "learning_rate": 8.205489766582244e-05, + "loss": 0.7494, + "step": 13859 + }, + { + "epoch": 0.9390880140930957, + "grad_norm": 4.0728936195373535, + "learning_rate": 8.205352864672463e-05, + "loss": 0.5493, + "step": 13860 + }, + { + "epoch": 0.9391557693610678, + "grad_norm": 6.4296698570251465, + "learning_rate": 8.205215962762681e-05, + "loss": 0.8059, + "step": 13861 + }, + { + "epoch": 0.9392235246290399, + "grad_norm": 4.51001501083374, + "learning_rate": 8.205079060852899e-05, + "loss": 0.6298, + "step": 13862 + }, + { + "epoch": 0.939291279897012, + "grad_norm": 5.673875331878662, + "learning_rate": 8.204942158943117e-05, + "loss": 0.8035, + "step": 13863 + }, + { + "epoch": 0.9393590351649841, + "grad_norm": 6.233775615692139, + "learning_rate": 8.204805257033335e-05, + "loss": 0.7389, + "step": 13864 + }, + { + "epoch": 0.9394267904329562, + "grad_norm": 6.151493549346924, + "learning_rate": 8.204668355123555e-05, + "loss": 0.6998, + "step": 13865 + }, + { + "epoch": 0.9394945457009283, + "grad_norm": 4.745614051818848, + "learning_rate": 8.204531453213773e-05, + "loss": 0.5749, + "step": 13866 + }, + { + "epoch": 0.9395623009689004, + "grad_norm": 6.141815185546875, + "learning_rate": 8.20439455130399e-05, + "loss": 0.5891, + "step": 13867 + }, + { + "epoch": 0.9396300562368725, + "grad_norm": 6.6597490310668945, + "learning_rate": 8.204257649394209e-05, + "loss": 0.7727, + "step": 13868 + }, + { + "epoch": 0.9396978115048445, + "grad_norm": 6.619930267333984, + "learning_rate": 8.204120747484427e-05, + "loss": 0.6702, + "step": 13869 + }, + { + "epoch": 0.9397655667728166, + "grad_norm": 5.507278919219971, + "learning_rate": 8.203983845574646e-05, + "loss": 0.7741, + "step": 13870 + }, + { + "epoch": 0.9398333220407886, + "grad_norm": 5.685328960418701, + "learning_rate": 8.203846943664864e-05, + "loss": 0.9224, + "step": 13871 + }, + { + "epoch": 0.9399010773087607, + "grad_norm": 5.3461456298828125, + "learning_rate": 8.203710041755082e-05, + "loss": 0.6182, + "step": 13872 + }, + { + "epoch": 0.9399688325767328, + "grad_norm": 8.528802871704102, + "learning_rate": 8.2035731398453e-05, + "loss": 0.7898, + "step": 13873 + }, + { + "epoch": 0.9400365878447049, + "grad_norm": 6.036088943481445, + "learning_rate": 8.20343623793552e-05, + "loss": 1.0215, + "step": 13874 + }, + { + "epoch": 0.940104343112677, + "grad_norm": 7.7004289627075195, + "learning_rate": 8.203299336025738e-05, + "loss": 0.7898, + "step": 13875 + }, + { + "epoch": 0.9401720983806491, + "grad_norm": 6.902210712432861, + "learning_rate": 8.203162434115956e-05, + "loss": 0.7609, + "step": 13876 + }, + { + "epoch": 0.9402398536486212, + "grad_norm": 9.464262962341309, + "learning_rate": 8.203025532206175e-05, + "loss": 1.1929, + "step": 13877 + }, + { + "epoch": 0.9403076089165933, + "grad_norm": 6.184045791625977, + "learning_rate": 8.202888630296393e-05, + "loss": 0.7753, + "step": 13878 + }, + { + "epoch": 0.9403753641845654, + "grad_norm": 7.176342964172363, + "learning_rate": 8.202751728386611e-05, + "loss": 0.6658, + "step": 13879 + }, + { + "epoch": 0.9404431194525374, + "grad_norm": 5.406729221343994, + "learning_rate": 8.20261482647683e-05, + "loss": 0.6304, + "step": 13880 + }, + { + "epoch": 0.9405108747205095, + "grad_norm": 6.102396488189697, + "learning_rate": 8.202477924567049e-05, + "loss": 0.8271, + "step": 13881 + }, + { + "epoch": 0.9405786299884816, + "grad_norm": 5.444383144378662, + "learning_rate": 8.202341022657267e-05, + "loss": 0.6307, + "step": 13882 + }, + { + "epoch": 0.9406463852564537, + "grad_norm": 4.637354850769043, + "learning_rate": 8.202204120747486e-05, + "loss": 0.7097, + "step": 13883 + }, + { + "epoch": 0.9407141405244258, + "grad_norm": 9.361139297485352, + "learning_rate": 8.202067218837704e-05, + "loss": 0.7545, + "step": 13884 + }, + { + "epoch": 0.9407818957923979, + "grad_norm": 5.362797260284424, + "learning_rate": 8.201930316927922e-05, + "loss": 0.8605, + "step": 13885 + }, + { + "epoch": 0.94084965106037, + "grad_norm": 5.900387287139893, + "learning_rate": 8.20179341501814e-05, + "loss": 0.6518, + "step": 13886 + }, + { + "epoch": 0.940917406328342, + "grad_norm": 5.875095844268799, + "learning_rate": 8.201656513108358e-05, + "loss": 0.7603, + "step": 13887 + }, + { + "epoch": 0.9409851615963141, + "grad_norm": 7.324997425079346, + "learning_rate": 8.201519611198577e-05, + "loss": 0.7821, + "step": 13888 + }, + { + "epoch": 0.9410529168642862, + "grad_norm": 6.422165870666504, + "learning_rate": 8.201382709288796e-05, + "loss": 0.7973, + "step": 13889 + }, + { + "epoch": 0.9411206721322583, + "grad_norm": 5.074305534362793, + "learning_rate": 8.201245807379014e-05, + "loss": 0.6436, + "step": 13890 + }, + { + "epoch": 0.9411884274002303, + "grad_norm": 6.061481952667236, + "learning_rate": 8.201108905469232e-05, + "loss": 0.7467, + "step": 13891 + }, + { + "epoch": 0.9412561826682024, + "grad_norm": 5.551920413970947, + "learning_rate": 8.20097200355945e-05, + "loss": 0.6732, + "step": 13892 + }, + { + "epoch": 0.9413239379361745, + "grad_norm": 6.896834373474121, + "learning_rate": 8.200835101649669e-05, + "loss": 0.7616, + "step": 13893 + }, + { + "epoch": 0.9413916932041466, + "grad_norm": 6.073055744171143, + "learning_rate": 8.200698199739887e-05, + "loss": 0.6834, + "step": 13894 + }, + { + "epoch": 0.9414594484721187, + "grad_norm": 7.429131984710693, + "learning_rate": 8.200561297830105e-05, + "loss": 0.6438, + "step": 13895 + }, + { + "epoch": 0.9415272037400908, + "grad_norm": 8.393160820007324, + "learning_rate": 8.200424395920323e-05, + "loss": 0.8012, + "step": 13896 + }, + { + "epoch": 0.9415949590080629, + "grad_norm": 6.223710060119629, + "learning_rate": 8.200287494010542e-05, + "loss": 0.8734, + "step": 13897 + }, + { + "epoch": 0.941662714276035, + "grad_norm": 4.849613666534424, + "learning_rate": 8.20015059210076e-05, + "loss": 0.8162, + "step": 13898 + }, + { + "epoch": 0.9417304695440071, + "grad_norm": 5.789686679840088, + "learning_rate": 8.200013690190979e-05, + "loss": 0.6433, + "step": 13899 + }, + { + "epoch": 0.9417982248119792, + "grad_norm": 5.670434951782227, + "learning_rate": 8.199876788281197e-05, + "loss": 0.7761, + "step": 13900 + }, + { + "epoch": 0.9418659800799513, + "grad_norm": 6.698735237121582, + "learning_rate": 8.199739886371415e-05, + "loss": 0.6752, + "step": 13901 + }, + { + "epoch": 0.9419337353479234, + "grad_norm": 6.325132369995117, + "learning_rate": 8.199602984461634e-05, + "loss": 0.7789, + "step": 13902 + }, + { + "epoch": 0.9420014906158953, + "grad_norm": 5.625211715698242, + "learning_rate": 8.199466082551852e-05, + "loss": 0.883, + "step": 13903 + }, + { + "epoch": 0.9420692458838674, + "grad_norm": 6.5696516036987305, + "learning_rate": 8.19932918064207e-05, + "loss": 0.7999, + "step": 13904 + }, + { + "epoch": 0.9421370011518395, + "grad_norm": 4.903794288635254, + "learning_rate": 8.199192278732288e-05, + "loss": 0.8234, + "step": 13905 + }, + { + "epoch": 0.9422047564198116, + "grad_norm": 4.687190532684326, + "learning_rate": 8.199055376822508e-05, + "loss": 0.7286, + "step": 13906 + }, + { + "epoch": 0.9422725116877837, + "grad_norm": 6.075998783111572, + "learning_rate": 8.198918474912726e-05, + "loss": 0.8778, + "step": 13907 + }, + { + "epoch": 0.9423402669557558, + "grad_norm": 6.295614719390869, + "learning_rate": 8.198781573002944e-05, + "loss": 0.8878, + "step": 13908 + }, + { + "epoch": 0.9424080222237279, + "grad_norm": 8.557680130004883, + "learning_rate": 8.198644671093162e-05, + "loss": 0.776, + "step": 13909 + }, + { + "epoch": 0.9424757774917, + "grad_norm": 6.864640712738037, + "learning_rate": 8.19850776918338e-05, + "loss": 1.0199, + "step": 13910 + }, + { + "epoch": 0.9425435327596721, + "grad_norm": 5.549873352050781, + "learning_rate": 8.198370867273599e-05, + "loss": 0.8881, + "step": 13911 + }, + { + "epoch": 0.9426112880276442, + "grad_norm": 6.9793548583984375, + "learning_rate": 8.198233965363817e-05, + "loss": 0.79, + "step": 13912 + }, + { + "epoch": 0.9426790432956162, + "grad_norm": 7.035325527191162, + "learning_rate": 8.198097063454035e-05, + "loss": 0.7801, + "step": 13913 + }, + { + "epoch": 0.9427467985635883, + "grad_norm": 6.695631980895996, + "learning_rate": 8.197960161544253e-05, + "loss": 0.7903, + "step": 13914 + }, + { + "epoch": 0.9428145538315604, + "grad_norm": 7.544031143188477, + "learning_rate": 8.197823259634473e-05, + "loss": 0.5813, + "step": 13915 + }, + { + "epoch": 0.9428823090995325, + "grad_norm": 5.196893215179443, + "learning_rate": 8.19768635772469e-05, + "loss": 0.7587, + "step": 13916 + }, + { + "epoch": 0.9429500643675046, + "grad_norm": 6.548089027404785, + "learning_rate": 8.197549455814909e-05, + "loss": 0.9122, + "step": 13917 + }, + { + "epoch": 0.9430178196354767, + "grad_norm": 4.924033164978027, + "learning_rate": 8.197412553905127e-05, + "loss": 0.6318, + "step": 13918 + }, + { + "epoch": 0.9430855749034488, + "grad_norm": 5.648553371429443, + "learning_rate": 8.197275651995345e-05, + "loss": 0.7063, + "step": 13919 + }, + { + "epoch": 0.9431533301714208, + "grad_norm": 7.385311126708984, + "learning_rate": 8.197138750085564e-05, + "loss": 0.782, + "step": 13920 + }, + { + "epoch": 0.9432210854393929, + "grad_norm": 5.233833312988281, + "learning_rate": 8.197001848175782e-05, + "loss": 0.6857, + "step": 13921 + }, + { + "epoch": 0.943288840707365, + "grad_norm": 7.0770487785339355, + "learning_rate": 8.196864946266e-05, + "loss": 0.6966, + "step": 13922 + }, + { + "epoch": 0.943356595975337, + "grad_norm": 6.8348307609558105, + "learning_rate": 8.19672804435622e-05, + "loss": 0.8207, + "step": 13923 + }, + { + "epoch": 0.9434243512433091, + "grad_norm": 4.237405300140381, + "learning_rate": 8.196591142446438e-05, + "loss": 0.8074, + "step": 13924 + }, + { + "epoch": 0.9434921065112812, + "grad_norm": 6.041856288909912, + "learning_rate": 8.196454240536656e-05, + "loss": 0.7794, + "step": 13925 + }, + { + "epoch": 0.9435598617792533, + "grad_norm": 8.183391571044922, + "learning_rate": 8.196317338626875e-05, + "loss": 0.7523, + "step": 13926 + }, + { + "epoch": 0.9436276170472254, + "grad_norm": 5.387772083282471, + "learning_rate": 8.196180436717093e-05, + "loss": 0.7759, + "step": 13927 + }, + { + "epoch": 0.9436953723151975, + "grad_norm": 5.296437740325928, + "learning_rate": 8.196043534807311e-05, + "loss": 0.7972, + "step": 13928 + }, + { + "epoch": 0.9437631275831696, + "grad_norm": 5.269390106201172, + "learning_rate": 8.19590663289753e-05, + "loss": 0.7055, + "step": 13929 + }, + { + "epoch": 0.9438308828511417, + "grad_norm": 7.258601665496826, + "learning_rate": 8.195769730987748e-05, + "loss": 0.8937, + "step": 13930 + }, + { + "epoch": 0.9438986381191138, + "grad_norm": 6.784587383270264, + "learning_rate": 8.195632829077966e-05, + "loss": 1.0161, + "step": 13931 + }, + { + "epoch": 0.9439663933870859, + "grad_norm": 5.093071937561035, + "learning_rate": 8.195495927168185e-05, + "loss": 0.7299, + "step": 13932 + }, + { + "epoch": 0.944034148655058, + "grad_norm": 5.3693013191223145, + "learning_rate": 8.195359025258403e-05, + "loss": 0.7403, + "step": 13933 + }, + { + "epoch": 0.9441019039230301, + "grad_norm": 5.527505874633789, + "learning_rate": 8.195222123348622e-05, + "loss": 0.6796, + "step": 13934 + }, + { + "epoch": 0.9441696591910022, + "grad_norm": 5.87165641784668, + "learning_rate": 8.19508522143884e-05, + "loss": 0.7385, + "step": 13935 + }, + { + "epoch": 0.9442374144589741, + "grad_norm": 6.087599754333496, + "learning_rate": 8.194948319529058e-05, + "loss": 0.545, + "step": 13936 + }, + { + "epoch": 0.9443051697269462, + "grad_norm": 5.642385005950928, + "learning_rate": 8.194811417619276e-05, + "loss": 0.7826, + "step": 13937 + }, + { + "epoch": 0.9443729249949183, + "grad_norm": 4.871824741363525, + "learning_rate": 8.194674515709495e-05, + "loss": 0.6956, + "step": 13938 + }, + { + "epoch": 0.9444406802628904, + "grad_norm": 4.260469913482666, + "learning_rate": 8.194537613799713e-05, + "loss": 0.561, + "step": 13939 + }, + { + "epoch": 0.9445084355308625, + "grad_norm": 5.726165294647217, + "learning_rate": 8.194400711889932e-05, + "loss": 0.9119, + "step": 13940 + }, + { + "epoch": 0.9445761907988346, + "grad_norm": 8.435538291931152, + "learning_rate": 8.19426380998015e-05, + "loss": 1.0353, + "step": 13941 + }, + { + "epoch": 0.9446439460668067, + "grad_norm": 5.619915962219238, + "learning_rate": 8.194126908070368e-05, + "loss": 0.7213, + "step": 13942 + }, + { + "epoch": 0.9447117013347788, + "grad_norm": 6.574455261230469, + "learning_rate": 8.193990006160587e-05, + "loss": 0.9633, + "step": 13943 + }, + { + "epoch": 0.9447794566027509, + "grad_norm": 6.439619064331055, + "learning_rate": 8.193853104250805e-05, + "loss": 0.7964, + "step": 13944 + }, + { + "epoch": 0.944847211870723, + "grad_norm": 6.305572509765625, + "learning_rate": 8.193716202341023e-05, + "loss": 0.72, + "step": 13945 + }, + { + "epoch": 0.944914967138695, + "grad_norm": 5.285477638244629, + "learning_rate": 8.193579300431241e-05, + "loss": 0.9051, + "step": 13946 + }, + { + "epoch": 0.9449827224066671, + "grad_norm": 5.163026332855225, + "learning_rate": 8.193442398521459e-05, + "loss": 0.57, + "step": 13947 + }, + { + "epoch": 0.9450504776746392, + "grad_norm": 5.176440715789795, + "learning_rate": 8.193305496611678e-05, + "loss": 0.7803, + "step": 13948 + }, + { + "epoch": 0.9451182329426113, + "grad_norm": 6.866820335388184, + "learning_rate": 8.193168594701897e-05, + "loss": 0.7486, + "step": 13949 + }, + { + "epoch": 0.9451859882105834, + "grad_norm": 6.827968597412109, + "learning_rate": 8.193031692792115e-05, + "loss": 0.7738, + "step": 13950 + }, + { + "epoch": 0.9452537434785555, + "grad_norm": 6.111838340759277, + "learning_rate": 8.192894790882333e-05, + "loss": 0.8302, + "step": 13951 + }, + { + "epoch": 0.9453214987465275, + "grad_norm": 6.111614227294922, + "learning_rate": 8.192757888972552e-05, + "loss": 0.8413, + "step": 13952 + }, + { + "epoch": 0.9453892540144996, + "grad_norm": 5.463453769683838, + "learning_rate": 8.19262098706277e-05, + "loss": 0.7054, + "step": 13953 + }, + { + "epoch": 0.9454570092824717, + "grad_norm": 6.099150657653809, + "learning_rate": 8.192484085152988e-05, + "loss": 0.596, + "step": 13954 + }, + { + "epoch": 0.9455247645504438, + "grad_norm": 6.040791988372803, + "learning_rate": 8.192347183243206e-05, + "loss": 0.6796, + "step": 13955 + }, + { + "epoch": 0.9455925198184159, + "grad_norm": 6.111145973205566, + "learning_rate": 8.192210281333424e-05, + "loss": 0.8444, + "step": 13956 + }, + { + "epoch": 0.945660275086388, + "grad_norm": 6.970156192779541, + "learning_rate": 8.192073379423644e-05, + "loss": 0.5742, + "step": 13957 + }, + { + "epoch": 0.94572803035436, + "grad_norm": 6.0487165451049805, + "learning_rate": 8.191936477513862e-05, + "loss": 0.8061, + "step": 13958 + }, + { + "epoch": 0.9457957856223321, + "grad_norm": 4.807767868041992, + "learning_rate": 8.19179957560408e-05, + "loss": 0.6691, + "step": 13959 + }, + { + "epoch": 0.9458635408903042, + "grad_norm": 5.062884330749512, + "learning_rate": 8.191662673694298e-05, + "loss": 0.9048, + "step": 13960 + }, + { + "epoch": 0.9459312961582763, + "grad_norm": 6.347843647003174, + "learning_rate": 8.191525771784517e-05, + "loss": 0.8603, + "step": 13961 + }, + { + "epoch": 0.9459990514262484, + "grad_norm": 5.537858009338379, + "learning_rate": 8.191388869874735e-05, + "loss": 0.8675, + "step": 13962 + }, + { + "epoch": 0.9460668066942205, + "grad_norm": 5.778237819671631, + "learning_rate": 8.191251967964953e-05, + "loss": 0.5481, + "step": 13963 + }, + { + "epoch": 0.9461345619621926, + "grad_norm": 5.969203472137451, + "learning_rate": 8.191115066055171e-05, + "loss": 0.7099, + "step": 13964 + }, + { + "epoch": 0.9462023172301647, + "grad_norm": 7.126950263977051, + "learning_rate": 8.190978164145389e-05, + "loss": 0.8117, + "step": 13965 + }, + { + "epoch": 0.9462700724981368, + "grad_norm": 5.3434624671936035, + "learning_rate": 8.190841262235609e-05, + "loss": 0.8133, + "step": 13966 + }, + { + "epoch": 0.9463378277661089, + "grad_norm": 5.5889105796813965, + "learning_rate": 8.190704360325827e-05, + "loss": 0.8104, + "step": 13967 + }, + { + "epoch": 0.946405583034081, + "grad_norm": 6.536361217498779, + "learning_rate": 8.190567458416045e-05, + "loss": 0.7897, + "step": 13968 + }, + { + "epoch": 0.9464733383020529, + "grad_norm": 6.82296085357666, + "learning_rate": 8.190430556506264e-05, + "loss": 1.0064, + "step": 13969 + }, + { + "epoch": 0.946541093570025, + "grad_norm": 6.4476470947265625, + "learning_rate": 8.190293654596482e-05, + "loss": 0.9211, + "step": 13970 + }, + { + "epoch": 0.9466088488379971, + "grad_norm": 6.636000156402588, + "learning_rate": 8.1901567526867e-05, + "loss": 0.6633, + "step": 13971 + }, + { + "epoch": 0.9466766041059692, + "grad_norm": 6.36086368560791, + "learning_rate": 8.19001985077692e-05, + "loss": 0.6275, + "step": 13972 + }, + { + "epoch": 0.9467443593739413, + "grad_norm": 7.286365032196045, + "learning_rate": 8.189882948867137e-05, + "loss": 0.7431, + "step": 13973 + }, + { + "epoch": 0.9468121146419134, + "grad_norm": 7.394494533538818, + "learning_rate": 8.189746046957356e-05, + "loss": 0.8701, + "step": 13974 + }, + { + "epoch": 0.9468798699098855, + "grad_norm": 5.378866195678711, + "learning_rate": 8.189609145047575e-05, + "loss": 0.6846, + "step": 13975 + }, + { + "epoch": 0.9469476251778576, + "grad_norm": 9.122882843017578, + "learning_rate": 8.189472243137793e-05, + "loss": 0.8023, + "step": 13976 + }, + { + "epoch": 0.9470153804458297, + "grad_norm": 6.454582214355469, + "learning_rate": 8.189335341228011e-05, + "loss": 0.7631, + "step": 13977 + }, + { + "epoch": 0.9470831357138018, + "grad_norm": 5.218212604522705, + "learning_rate": 8.189198439318229e-05, + "loss": 0.7752, + "step": 13978 + }, + { + "epoch": 0.9471508909817739, + "grad_norm": 6.383564472198486, + "learning_rate": 8.189061537408447e-05, + "loss": 0.7355, + "step": 13979 + }, + { + "epoch": 0.9472186462497459, + "grad_norm": 4.193008899688721, + "learning_rate": 8.188924635498666e-05, + "loss": 0.5955, + "step": 13980 + }, + { + "epoch": 0.947286401517718, + "grad_norm": 6.7840423583984375, + "learning_rate": 8.188787733588884e-05, + "loss": 0.8695, + "step": 13981 + }, + { + "epoch": 0.9473541567856901, + "grad_norm": 5.606986999511719, + "learning_rate": 8.188650831679102e-05, + "loss": 0.8062, + "step": 13982 + }, + { + "epoch": 0.9474219120536622, + "grad_norm": 4.632526397705078, + "learning_rate": 8.18851392976932e-05, + "loss": 0.9447, + "step": 13983 + }, + { + "epoch": 0.9474896673216343, + "grad_norm": 7.079931259155273, + "learning_rate": 8.18837702785954e-05, + "loss": 0.7606, + "step": 13984 + }, + { + "epoch": 0.9475574225896063, + "grad_norm": 6.6071858406066895, + "learning_rate": 8.188240125949758e-05, + "loss": 1.0442, + "step": 13985 + }, + { + "epoch": 0.9476251778575784, + "grad_norm": 6.782355308532715, + "learning_rate": 8.188103224039976e-05, + "loss": 0.6173, + "step": 13986 + }, + { + "epoch": 0.9476929331255505, + "grad_norm": 6.116751194000244, + "learning_rate": 8.187966322130194e-05, + "loss": 0.7438, + "step": 13987 + }, + { + "epoch": 0.9477606883935226, + "grad_norm": 6.912460803985596, + "learning_rate": 8.187829420220412e-05, + "loss": 0.7509, + "step": 13988 + }, + { + "epoch": 0.9478284436614947, + "grad_norm": 6.737586498260498, + "learning_rate": 8.187692518310631e-05, + "loss": 0.8741, + "step": 13989 + }, + { + "epoch": 0.9478961989294667, + "grad_norm": 4.6144938468933105, + "learning_rate": 8.18755561640085e-05, + "loss": 0.8468, + "step": 13990 + }, + { + "epoch": 0.9479639541974388, + "grad_norm": 6.440502166748047, + "learning_rate": 8.187418714491068e-05, + "loss": 0.7353, + "step": 13991 + }, + { + "epoch": 0.9480317094654109, + "grad_norm": 7.09966516494751, + "learning_rate": 8.187281812581286e-05, + "loss": 0.8539, + "step": 13992 + }, + { + "epoch": 0.948099464733383, + "grad_norm": 7.213123321533203, + "learning_rate": 8.187144910671505e-05, + "loss": 0.5578, + "step": 13993 + }, + { + "epoch": 0.9481672200013551, + "grad_norm": 5.926023006439209, + "learning_rate": 8.187008008761723e-05, + "loss": 0.7481, + "step": 13994 + }, + { + "epoch": 0.9482349752693272, + "grad_norm": 6.6714768409729, + "learning_rate": 8.186871106851941e-05, + "loss": 0.8587, + "step": 13995 + }, + { + "epoch": 0.9483027305372993, + "grad_norm": 6.985418319702148, + "learning_rate": 8.186734204942159e-05, + "loss": 0.9756, + "step": 13996 + }, + { + "epoch": 0.9483704858052714, + "grad_norm": 4.339714050292969, + "learning_rate": 8.186597303032377e-05, + "loss": 0.7206, + "step": 13997 + }, + { + "epoch": 0.9484382410732435, + "grad_norm": 5.789028644561768, + "learning_rate": 8.186460401122596e-05, + "loss": 0.7851, + "step": 13998 + }, + { + "epoch": 0.9485059963412156, + "grad_norm": 4.326128005981445, + "learning_rate": 8.186323499212814e-05, + "loss": 0.6611, + "step": 13999 + }, + { + "epoch": 0.9485737516091877, + "grad_norm": 4.751250267028809, + "learning_rate": 8.186186597303033e-05, + "loss": 0.791, + "step": 14000 + }, + { + "epoch": 0.9486415068771596, + "grad_norm": 6.147752285003662, + "learning_rate": 8.18604969539325e-05, + "loss": 0.9781, + "step": 14001 + }, + { + "epoch": 0.9487092621451317, + "grad_norm": 5.021920680999756, + "learning_rate": 8.185912793483469e-05, + "loss": 0.634, + "step": 14002 + }, + { + "epoch": 0.9487770174131038, + "grad_norm": 5.617037773132324, + "learning_rate": 8.185775891573688e-05, + "loss": 0.6997, + "step": 14003 + }, + { + "epoch": 0.9488447726810759, + "grad_norm": 5.281715393066406, + "learning_rate": 8.185638989663906e-05, + "loss": 0.6422, + "step": 14004 + }, + { + "epoch": 0.948912527949048, + "grad_norm": 5.466105937957764, + "learning_rate": 8.185502087754124e-05, + "loss": 0.7793, + "step": 14005 + }, + { + "epoch": 0.9489802832170201, + "grad_norm": 8.395241737365723, + "learning_rate": 8.185365185844342e-05, + "loss": 1.0168, + "step": 14006 + }, + { + "epoch": 0.9490480384849922, + "grad_norm": 5.434301376342773, + "learning_rate": 8.185228283934561e-05, + "loss": 0.6043, + "step": 14007 + }, + { + "epoch": 0.9491157937529643, + "grad_norm": 6.209643363952637, + "learning_rate": 8.18509138202478e-05, + "loss": 0.9309, + "step": 14008 + }, + { + "epoch": 0.9491835490209364, + "grad_norm": 5.438107013702393, + "learning_rate": 8.184954480114998e-05, + "loss": 0.8226, + "step": 14009 + }, + { + "epoch": 0.9492513042889085, + "grad_norm": 5.4521050453186035, + "learning_rate": 8.184817578205216e-05, + "loss": 0.9139, + "step": 14010 + }, + { + "epoch": 0.9493190595568806, + "grad_norm": 6.864973068237305, + "learning_rate": 8.184680676295434e-05, + "loss": 0.9011, + "step": 14011 + }, + { + "epoch": 0.9493868148248527, + "grad_norm": 3.955416202545166, + "learning_rate": 8.184543774385653e-05, + "loss": 0.6179, + "step": 14012 + }, + { + "epoch": 0.9494545700928247, + "grad_norm": 4.496962547302246, + "learning_rate": 8.184406872475871e-05, + "loss": 0.7532, + "step": 14013 + }, + { + "epoch": 0.9495223253607968, + "grad_norm": 7.178885459899902, + "learning_rate": 8.184269970566089e-05, + "loss": 0.8709, + "step": 14014 + }, + { + "epoch": 0.9495900806287689, + "grad_norm": 8.725399017333984, + "learning_rate": 8.184133068656308e-05, + "loss": 0.6709, + "step": 14015 + }, + { + "epoch": 0.949657835896741, + "grad_norm": 4.831404685974121, + "learning_rate": 8.183996166746526e-05, + "loss": 0.6221, + "step": 14016 + }, + { + "epoch": 0.949725591164713, + "grad_norm": 5.273083209991455, + "learning_rate": 8.183859264836745e-05, + "loss": 0.6876, + "step": 14017 + }, + { + "epoch": 0.9497933464326851, + "grad_norm": 5.9395833015441895, + "learning_rate": 8.183722362926964e-05, + "loss": 0.7032, + "step": 14018 + }, + { + "epoch": 0.9498611017006572, + "grad_norm": 5.861425876617432, + "learning_rate": 8.183585461017182e-05, + "loss": 0.7697, + "step": 14019 + }, + { + "epoch": 0.9499288569686293, + "grad_norm": 6.970915794372559, + "learning_rate": 8.1834485591074e-05, + "loss": 0.693, + "step": 14020 + }, + { + "epoch": 0.9499966122366014, + "grad_norm": 5.096658229827881, + "learning_rate": 8.18331165719762e-05, + "loss": 0.9212, + "step": 14021 + }, + { + "epoch": 0.9500643675045735, + "grad_norm": 6.328000068664551, + "learning_rate": 8.183174755287837e-05, + "loss": 0.7588, + "step": 14022 + }, + { + "epoch": 0.9501321227725456, + "grad_norm": 5.18841552734375, + "learning_rate": 8.183037853378055e-05, + "loss": 0.57, + "step": 14023 + }, + { + "epoch": 0.9501998780405176, + "grad_norm": 7.505560398101807, + "learning_rate": 8.182900951468273e-05, + "loss": 0.7265, + "step": 14024 + }, + { + "epoch": 0.9502676333084897, + "grad_norm": 6.134244918823242, + "learning_rate": 8.182764049558492e-05, + "loss": 0.6735, + "step": 14025 + }, + { + "epoch": 0.9503353885764618, + "grad_norm": 6.682973384857178, + "learning_rate": 8.182627147648711e-05, + "loss": 0.7814, + "step": 14026 + }, + { + "epoch": 0.9504031438444339, + "grad_norm": 6.492305755615234, + "learning_rate": 8.182490245738929e-05, + "loss": 0.7248, + "step": 14027 + }, + { + "epoch": 0.950470899112406, + "grad_norm": 4.836461067199707, + "learning_rate": 8.182353343829147e-05, + "loss": 0.5105, + "step": 14028 + }, + { + "epoch": 0.9505386543803781, + "grad_norm": 6.350803375244141, + "learning_rate": 8.182216441919365e-05, + "loss": 0.9946, + "step": 14029 + }, + { + "epoch": 0.9506064096483502, + "grad_norm": 6.154294490814209, + "learning_rate": 8.182079540009584e-05, + "loss": 0.6859, + "step": 14030 + }, + { + "epoch": 0.9506741649163223, + "grad_norm": 5.958618640899658, + "learning_rate": 8.181942638099802e-05, + "loss": 0.8397, + "step": 14031 + }, + { + "epoch": 0.9507419201842944, + "grad_norm": 5.353884696960449, + "learning_rate": 8.18180573619002e-05, + "loss": 0.8651, + "step": 14032 + }, + { + "epoch": 0.9508096754522665, + "grad_norm": 4.889853000640869, + "learning_rate": 8.181668834280238e-05, + "loss": 0.7572, + "step": 14033 + }, + { + "epoch": 0.9508774307202384, + "grad_norm": 6.175332069396973, + "learning_rate": 8.181531932370457e-05, + "loss": 0.9212, + "step": 14034 + }, + { + "epoch": 0.9509451859882105, + "grad_norm": 6.263129711151123, + "learning_rate": 8.181395030460676e-05, + "loss": 0.6871, + "step": 14035 + }, + { + "epoch": 0.9510129412561826, + "grad_norm": 5.999005317687988, + "learning_rate": 8.181258128550894e-05, + "loss": 0.8786, + "step": 14036 + }, + { + "epoch": 0.9510806965241547, + "grad_norm": 5.486205577850342, + "learning_rate": 8.181121226641112e-05, + "loss": 0.7679, + "step": 14037 + }, + { + "epoch": 0.9511484517921268, + "grad_norm": 4.255964756011963, + "learning_rate": 8.18098432473133e-05, + "loss": 0.5138, + "step": 14038 + }, + { + "epoch": 0.9512162070600989, + "grad_norm": 7.037053108215332, + "learning_rate": 8.18084742282155e-05, + "loss": 0.8799, + "step": 14039 + }, + { + "epoch": 0.951283962328071, + "grad_norm": 5.0967302322387695, + "learning_rate": 8.180710520911767e-05, + "loss": 0.6126, + "step": 14040 + }, + { + "epoch": 0.9513517175960431, + "grad_norm": 4.5067458152771, + "learning_rate": 8.180573619001985e-05, + "loss": 0.5718, + "step": 14041 + }, + { + "epoch": 0.9514194728640152, + "grad_norm": 4.694755554199219, + "learning_rate": 8.180436717092204e-05, + "loss": 0.712, + "step": 14042 + }, + { + "epoch": 0.9514872281319873, + "grad_norm": 6.1123738288879395, + "learning_rate": 8.180299815182422e-05, + "loss": 0.7131, + "step": 14043 + }, + { + "epoch": 0.9515549833999594, + "grad_norm": 7.102774143218994, + "learning_rate": 8.180162913272641e-05, + "loss": 0.6774, + "step": 14044 + }, + { + "epoch": 0.9516227386679315, + "grad_norm": 5.857600212097168, + "learning_rate": 8.180026011362859e-05, + "loss": 0.7464, + "step": 14045 + }, + { + "epoch": 0.9516904939359035, + "grad_norm": 5.622432708740234, + "learning_rate": 8.179889109453077e-05, + "loss": 0.6566, + "step": 14046 + }, + { + "epoch": 0.9517582492038756, + "grad_norm": 8.488066673278809, + "learning_rate": 8.179752207543295e-05, + "loss": 0.9199, + "step": 14047 + }, + { + "epoch": 0.9518260044718477, + "grad_norm": 7.23390531539917, + "learning_rate": 8.179615305633514e-05, + "loss": 0.884, + "step": 14048 + }, + { + "epoch": 0.9518937597398198, + "grad_norm": 6.6131391525268555, + "learning_rate": 8.179478403723732e-05, + "loss": 1.0312, + "step": 14049 + }, + { + "epoch": 0.9519615150077918, + "grad_norm": 5.167440891265869, + "learning_rate": 8.17934150181395e-05, + "loss": 0.6041, + "step": 14050 + }, + { + "epoch": 0.9520292702757639, + "grad_norm": 6.33022403717041, + "learning_rate": 8.179204599904169e-05, + "loss": 0.8595, + "step": 14051 + }, + { + "epoch": 0.952097025543736, + "grad_norm": 6.872222900390625, + "learning_rate": 8.179067697994387e-05, + "loss": 0.7336, + "step": 14052 + }, + { + "epoch": 0.9521647808117081, + "grad_norm": 4.217645168304443, + "learning_rate": 8.178930796084606e-05, + "loss": 0.4555, + "step": 14053 + }, + { + "epoch": 0.9522325360796802, + "grad_norm": 4.8638529777526855, + "learning_rate": 8.178793894174824e-05, + "loss": 0.6661, + "step": 14054 + }, + { + "epoch": 0.9523002913476523, + "grad_norm": 7.580918788909912, + "learning_rate": 8.178656992265042e-05, + "loss": 0.7857, + "step": 14055 + }, + { + "epoch": 0.9523680466156244, + "grad_norm": 6.182562351226807, + "learning_rate": 8.17852009035526e-05, + "loss": 0.7932, + "step": 14056 + }, + { + "epoch": 0.9524358018835964, + "grad_norm": 7.604414939880371, + "learning_rate": 8.178383188445478e-05, + "loss": 0.5992, + "step": 14057 + }, + { + "epoch": 0.9525035571515685, + "grad_norm": 6.542990684509277, + "learning_rate": 8.178246286535697e-05, + "loss": 0.888, + "step": 14058 + }, + { + "epoch": 0.9525713124195406, + "grad_norm": 5.210031986236572, + "learning_rate": 8.178109384625916e-05, + "loss": 0.7931, + "step": 14059 + }, + { + "epoch": 0.9526390676875127, + "grad_norm": 8.13759994506836, + "learning_rate": 8.177972482716134e-05, + "loss": 0.6912, + "step": 14060 + }, + { + "epoch": 0.9527068229554848, + "grad_norm": 5.919729709625244, + "learning_rate": 8.177835580806353e-05, + "loss": 0.7952, + "step": 14061 + }, + { + "epoch": 0.9527745782234569, + "grad_norm": 5.694857597351074, + "learning_rate": 8.177698678896571e-05, + "loss": 0.7651, + "step": 14062 + }, + { + "epoch": 0.952842333491429, + "grad_norm": 6.005049705505371, + "learning_rate": 8.177561776986789e-05, + "loss": 0.6854, + "step": 14063 + }, + { + "epoch": 0.9529100887594011, + "grad_norm": 7.230431079864502, + "learning_rate": 8.177424875077008e-05, + "loss": 0.815, + "step": 14064 + }, + { + "epoch": 0.9529778440273732, + "grad_norm": 6.85237455368042, + "learning_rate": 8.177287973167226e-05, + "loss": 0.6984, + "step": 14065 + }, + { + "epoch": 0.9530455992953452, + "grad_norm": 5.569474697113037, + "learning_rate": 8.177151071257444e-05, + "loss": 0.8195, + "step": 14066 + }, + { + "epoch": 0.9531133545633172, + "grad_norm": 8.014307022094727, + "learning_rate": 8.177014169347664e-05, + "loss": 0.8865, + "step": 14067 + }, + { + "epoch": 0.9531811098312893, + "grad_norm": 5.345339775085449, + "learning_rate": 8.176877267437882e-05, + "loss": 0.8422, + "step": 14068 + }, + { + "epoch": 0.9532488650992614, + "grad_norm": 7.717896461486816, + "learning_rate": 8.1767403655281e-05, + "loss": 0.8969, + "step": 14069 + }, + { + "epoch": 0.9533166203672335, + "grad_norm": 5.41009521484375, + "learning_rate": 8.176603463618318e-05, + "loss": 0.9792, + "step": 14070 + }, + { + "epoch": 0.9533843756352056, + "grad_norm": 4.825246334075928, + "learning_rate": 8.176466561708537e-05, + "loss": 0.5923, + "step": 14071 + }, + { + "epoch": 0.9534521309031777, + "grad_norm": 4.686081409454346, + "learning_rate": 8.176329659798755e-05, + "loss": 0.8116, + "step": 14072 + }, + { + "epoch": 0.9535198861711498, + "grad_norm": 4.988349914550781, + "learning_rate": 8.176192757888973e-05, + "loss": 0.8101, + "step": 14073 + }, + { + "epoch": 0.9535876414391219, + "grad_norm": 6.294574737548828, + "learning_rate": 8.176055855979191e-05, + "loss": 0.7203, + "step": 14074 + }, + { + "epoch": 0.953655396707094, + "grad_norm": 4.857511520385742, + "learning_rate": 8.17591895406941e-05, + "loss": 0.8133, + "step": 14075 + }, + { + "epoch": 0.9537231519750661, + "grad_norm": 6.600233554840088, + "learning_rate": 8.175782052159629e-05, + "loss": 0.9022, + "step": 14076 + }, + { + "epoch": 0.9537909072430382, + "grad_norm": 6.04002046585083, + "learning_rate": 8.175645150249847e-05, + "loss": 0.7713, + "step": 14077 + }, + { + "epoch": 0.9538586625110103, + "grad_norm": 4.784701824188232, + "learning_rate": 8.175508248340065e-05, + "loss": 0.5853, + "step": 14078 + }, + { + "epoch": 0.9539264177789823, + "grad_norm": 5.057199954986572, + "learning_rate": 8.175371346430283e-05, + "loss": 0.7118, + "step": 14079 + }, + { + "epoch": 0.9539941730469544, + "grad_norm": 6.518017768859863, + "learning_rate": 8.175234444520501e-05, + "loss": 0.8091, + "step": 14080 + }, + { + "epoch": 0.9540619283149265, + "grad_norm": 6.6895575523376465, + "learning_rate": 8.17509754261072e-05, + "loss": 0.8893, + "step": 14081 + }, + { + "epoch": 0.9541296835828986, + "grad_norm": 5.571439743041992, + "learning_rate": 8.174960640700938e-05, + "loss": 0.6693, + "step": 14082 + }, + { + "epoch": 0.9541974388508706, + "grad_norm": 5.339274883270264, + "learning_rate": 8.174823738791156e-05, + "loss": 0.5533, + "step": 14083 + }, + { + "epoch": 0.9542651941188427, + "grad_norm": 6.4230523109436035, + "learning_rate": 8.174686836881374e-05, + "loss": 0.6819, + "step": 14084 + }, + { + "epoch": 0.9543329493868148, + "grad_norm": 5.337852954864502, + "learning_rate": 8.174549934971594e-05, + "loss": 0.5664, + "step": 14085 + }, + { + "epoch": 0.9544007046547869, + "grad_norm": 5.271894454956055, + "learning_rate": 8.174413033061812e-05, + "loss": 0.678, + "step": 14086 + }, + { + "epoch": 0.954468459922759, + "grad_norm": 5.684970855712891, + "learning_rate": 8.17427613115203e-05, + "loss": 0.8186, + "step": 14087 + }, + { + "epoch": 0.9545362151907311, + "grad_norm": 8.407366752624512, + "learning_rate": 8.174139229242248e-05, + "loss": 0.7385, + "step": 14088 + }, + { + "epoch": 0.9546039704587032, + "grad_norm": 5.270580768585205, + "learning_rate": 8.174002327332466e-05, + "loss": 0.7859, + "step": 14089 + }, + { + "epoch": 0.9546717257266752, + "grad_norm": 6.393465518951416, + "learning_rate": 8.173865425422685e-05, + "loss": 0.7205, + "step": 14090 + }, + { + "epoch": 0.9547394809946473, + "grad_norm": 6.487541675567627, + "learning_rate": 8.173728523512903e-05, + "loss": 0.9124, + "step": 14091 + }, + { + "epoch": 0.9548072362626194, + "grad_norm": 5.790227890014648, + "learning_rate": 8.173591621603121e-05, + "loss": 0.8357, + "step": 14092 + }, + { + "epoch": 0.9548749915305915, + "grad_norm": 6.707381725311279, + "learning_rate": 8.17345471969334e-05, + "loss": 0.8281, + "step": 14093 + }, + { + "epoch": 0.9549427467985636, + "grad_norm": 5.875377655029297, + "learning_rate": 8.173317817783559e-05, + "loss": 0.917, + "step": 14094 + }, + { + "epoch": 0.9550105020665357, + "grad_norm": 9.384751319885254, + "learning_rate": 8.173180915873777e-05, + "loss": 0.8046, + "step": 14095 + }, + { + "epoch": 0.9550782573345078, + "grad_norm": 4.465388298034668, + "learning_rate": 8.173044013963995e-05, + "loss": 0.4959, + "step": 14096 + }, + { + "epoch": 0.9551460126024799, + "grad_norm": 5.929595947265625, + "learning_rate": 8.172907112054213e-05, + "loss": 0.655, + "step": 14097 + }, + { + "epoch": 0.955213767870452, + "grad_norm": 6.126537322998047, + "learning_rate": 8.172770210144431e-05, + "loss": 0.8669, + "step": 14098 + }, + { + "epoch": 0.955281523138424, + "grad_norm": 4.894435405731201, + "learning_rate": 8.17263330823465e-05, + "loss": 0.8827, + "step": 14099 + }, + { + "epoch": 0.955349278406396, + "grad_norm": 5.8009490966796875, + "learning_rate": 8.172496406324868e-05, + "loss": 0.6743, + "step": 14100 + }, + { + "epoch": 0.9554170336743681, + "grad_norm": 6.6259965896606445, + "learning_rate": 8.172359504415086e-05, + "loss": 0.7902, + "step": 14101 + }, + { + "epoch": 0.9554847889423402, + "grad_norm": 8.393582344055176, + "learning_rate": 8.172222602505305e-05, + "loss": 0.7179, + "step": 14102 + }, + { + "epoch": 0.9555525442103123, + "grad_norm": 5.586965560913086, + "learning_rate": 8.172085700595524e-05, + "loss": 0.8491, + "step": 14103 + }, + { + "epoch": 0.9556202994782844, + "grad_norm": 6.042308807373047, + "learning_rate": 8.171948798685742e-05, + "loss": 0.5834, + "step": 14104 + }, + { + "epoch": 0.9556880547462565, + "grad_norm": 10.39201831817627, + "learning_rate": 8.17181189677596e-05, + "loss": 0.8719, + "step": 14105 + }, + { + "epoch": 0.9557558100142286, + "grad_norm": 6.800583839416504, + "learning_rate": 8.171674994866178e-05, + "loss": 0.6832, + "step": 14106 + }, + { + "epoch": 0.9558235652822007, + "grad_norm": 4.868492603302002, + "learning_rate": 8.171538092956396e-05, + "loss": 0.8444, + "step": 14107 + }, + { + "epoch": 0.9558913205501728, + "grad_norm": 6.521236419677734, + "learning_rate": 8.171401191046615e-05, + "loss": 0.6928, + "step": 14108 + }, + { + "epoch": 0.9559590758181449, + "grad_norm": 6.01854944229126, + "learning_rate": 8.171264289136833e-05, + "loss": 0.9047, + "step": 14109 + }, + { + "epoch": 0.956026831086117, + "grad_norm": 5.620432376861572, + "learning_rate": 8.171127387227052e-05, + "loss": 0.625, + "step": 14110 + }, + { + "epoch": 0.9560945863540891, + "grad_norm": 6.074321269989014, + "learning_rate": 8.170990485317271e-05, + "loss": 0.8075, + "step": 14111 + }, + { + "epoch": 0.9561623416220612, + "grad_norm": 5.468603134155273, + "learning_rate": 8.170853583407489e-05, + "loss": 0.9023, + "step": 14112 + }, + { + "epoch": 0.9562300968900332, + "grad_norm": 7.003314971923828, + "learning_rate": 8.170716681497707e-05, + "loss": 1.0302, + "step": 14113 + }, + { + "epoch": 0.9562978521580053, + "grad_norm": 5.144251346588135, + "learning_rate": 8.170579779587926e-05, + "loss": 0.7184, + "step": 14114 + }, + { + "epoch": 0.9563656074259773, + "grad_norm": 5.8166823387146, + "learning_rate": 8.170442877678144e-05, + "loss": 0.8568, + "step": 14115 + }, + { + "epoch": 0.9564333626939494, + "grad_norm": 5.200114727020264, + "learning_rate": 8.170305975768362e-05, + "loss": 0.8245, + "step": 14116 + }, + { + "epoch": 0.9565011179619215, + "grad_norm": 6.467376708984375, + "learning_rate": 8.170169073858582e-05, + "loss": 0.8682, + "step": 14117 + }, + { + "epoch": 0.9565688732298936, + "grad_norm": 5.500349998474121, + "learning_rate": 8.1700321719488e-05, + "loss": 0.6647, + "step": 14118 + }, + { + "epoch": 0.9566366284978657, + "grad_norm": 5.1452765464782715, + "learning_rate": 8.169895270039018e-05, + "loss": 0.7039, + "step": 14119 + }, + { + "epoch": 0.9567043837658378, + "grad_norm": 7.137358665466309, + "learning_rate": 8.169758368129236e-05, + "loss": 0.7246, + "step": 14120 + }, + { + "epoch": 0.9567721390338099, + "grad_norm": 5.405989170074463, + "learning_rate": 8.169621466219454e-05, + "loss": 0.8214, + "step": 14121 + }, + { + "epoch": 0.956839894301782, + "grad_norm": 6.709090232849121, + "learning_rate": 8.169484564309673e-05, + "loss": 0.8312, + "step": 14122 + }, + { + "epoch": 0.956907649569754, + "grad_norm": 5.676616668701172, + "learning_rate": 8.169347662399891e-05, + "loss": 0.9024, + "step": 14123 + }, + { + "epoch": 0.9569754048377261, + "grad_norm": 4.852606296539307, + "learning_rate": 8.16921076049011e-05, + "loss": 0.7261, + "step": 14124 + }, + { + "epoch": 0.9570431601056982, + "grad_norm": 6.199010372161865, + "learning_rate": 8.169073858580327e-05, + "loss": 0.7481, + "step": 14125 + }, + { + "epoch": 0.9571109153736703, + "grad_norm": 5.474722385406494, + "learning_rate": 8.168936956670547e-05, + "loss": 0.7711, + "step": 14126 + }, + { + "epoch": 0.9571786706416424, + "grad_norm": 6.915562152862549, + "learning_rate": 8.168800054760765e-05, + "loss": 0.9033, + "step": 14127 + }, + { + "epoch": 0.9572464259096145, + "grad_norm": 6.63683557510376, + "learning_rate": 8.168663152850983e-05, + "loss": 0.9166, + "step": 14128 + }, + { + "epoch": 0.9573141811775866, + "grad_norm": 5.392688751220703, + "learning_rate": 8.168526250941201e-05, + "loss": 0.7428, + "step": 14129 + }, + { + "epoch": 0.9573819364455587, + "grad_norm": 7.527129173278809, + "learning_rate": 8.168389349031419e-05, + "loss": 0.7533, + "step": 14130 + }, + { + "epoch": 0.9574496917135308, + "grad_norm": 7.458190441131592, + "learning_rate": 8.168252447121638e-05, + "loss": 0.9663, + "step": 14131 + }, + { + "epoch": 0.9575174469815028, + "grad_norm": 5.647728443145752, + "learning_rate": 8.168115545211856e-05, + "loss": 0.8357, + "step": 14132 + }, + { + "epoch": 0.9575852022494749, + "grad_norm": 6.73082971572876, + "learning_rate": 8.167978643302074e-05, + "loss": 0.9228, + "step": 14133 + }, + { + "epoch": 0.957652957517447, + "grad_norm": 5.708244800567627, + "learning_rate": 8.167841741392292e-05, + "loss": 0.742, + "step": 14134 + }, + { + "epoch": 0.957720712785419, + "grad_norm": 5.192925453186035, + "learning_rate": 8.16770483948251e-05, + "loss": 0.5755, + "step": 14135 + }, + { + "epoch": 0.9577884680533911, + "grad_norm": 5.057267665863037, + "learning_rate": 8.16756793757273e-05, + "loss": 0.7179, + "step": 14136 + }, + { + "epoch": 0.9578562233213632, + "grad_norm": 5.001532554626465, + "learning_rate": 8.167431035662948e-05, + "loss": 0.7356, + "step": 14137 + }, + { + "epoch": 0.9579239785893353, + "grad_norm": 5.9870781898498535, + "learning_rate": 8.167294133753166e-05, + "loss": 0.6898, + "step": 14138 + }, + { + "epoch": 0.9579917338573074, + "grad_norm": 6.193863868713379, + "learning_rate": 8.167157231843384e-05, + "loss": 0.7337, + "step": 14139 + }, + { + "epoch": 0.9580594891252795, + "grad_norm": 5.425492286682129, + "learning_rate": 8.167020329933603e-05, + "loss": 0.5645, + "step": 14140 + }, + { + "epoch": 0.9581272443932516, + "grad_norm": 6.2710700035095215, + "learning_rate": 8.166883428023821e-05, + "loss": 0.6244, + "step": 14141 + }, + { + "epoch": 0.9581949996612237, + "grad_norm": 6.745750904083252, + "learning_rate": 8.16674652611404e-05, + "loss": 0.7168, + "step": 14142 + }, + { + "epoch": 0.9582627549291958, + "grad_norm": 4.833362579345703, + "learning_rate": 8.166609624204257e-05, + "loss": 0.6343, + "step": 14143 + }, + { + "epoch": 0.9583305101971679, + "grad_norm": 7.704063415527344, + "learning_rate": 8.166472722294476e-05, + "loss": 0.8166, + "step": 14144 + }, + { + "epoch": 0.95839826546514, + "grad_norm": 4.506795406341553, + "learning_rate": 8.166335820384695e-05, + "loss": 0.7254, + "step": 14145 + }, + { + "epoch": 0.958466020733112, + "grad_norm": 4.970558166503906, + "learning_rate": 8.166198918474913e-05, + "loss": 0.6922, + "step": 14146 + }, + { + "epoch": 0.9585337760010841, + "grad_norm": 6.441205024719238, + "learning_rate": 8.166062016565131e-05, + "loss": 0.8758, + "step": 14147 + }, + { + "epoch": 0.9586015312690561, + "grad_norm": 5.769437789916992, + "learning_rate": 8.165925114655349e-05, + "loss": 0.6237, + "step": 14148 + }, + { + "epoch": 0.9586692865370282, + "grad_norm": 5.401442527770996, + "learning_rate": 8.165788212745568e-05, + "loss": 1.0237, + "step": 14149 + }, + { + "epoch": 0.9587370418050003, + "grad_norm": 6.560751438140869, + "learning_rate": 8.165651310835786e-05, + "loss": 1.0011, + "step": 14150 + }, + { + "epoch": 0.9588047970729724, + "grad_norm": 5.372631072998047, + "learning_rate": 8.165514408926004e-05, + "loss": 0.787, + "step": 14151 + }, + { + "epoch": 0.9588725523409445, + "grad_norm": 4.6542558670043945, + "learning_rate": 8.165377507016222e-05, + "loss": 0.6897, + "step": 14152 + }, + { + "epoch": 0.9589403076089166, + "grad_norm": 6.8552141189575195, + "learning_rate": 8.16524060510644e-05, + "loss": 0.9957, + "step": 14153 + }, + { + "epoch": 0.9590080628768887, + "grad_norm": 6.167290687561035, + "learning_rate": 8.16510370319666e-05, + "loss": 0.7579, + "step": 14154 + }, + { + "epoch": 0.9590758181448608, + "grad_norm": 4.993210315704346, + "learning_rate": 8.164966801286878e-05, + "loss": 0.8123, + "step": 14155 + }, + { + "epoch": 0.9591435734128329, + "grad_norm": 6.3289713859558105, + "learning_rate": 8.164829899377096e-05, + "loss": 0.7427, + "step": 14156 + }, + { + "epoch": 0.9592113286808049, + "grad_norm": 4.5227837562561035, + "learning_rate": 8.164692997467315e-05, + "loss": 0.7607, + "step": 14157 + }, + { + "epoch": 0.959279083948777, + "grad_norm": 5.881022930145264, + "learning_rate": 8.164556095557533e-05, + "loss": 0.8952, + "step": 14158 + }, + { + "epoch": 0.9593468392167491, + "grad_norm": 5.290414810180664, + "learning_rate": 8.164419193647751e-05, + "loss": 0.7087, + "step": 14159 + }, + { + "epoch": 0.9594145944847212, + "grad_norm": 6.688441276550293, + "learning_rate": 8.164282291737971e-05, + "loss": 0.635, + "step": 14160 + }, + { + "epoch": 0.9594823497526933, + "grad_norm": 5.5742058753967285, + "learning_rate": 8.164145389828189e-05, + "loss": 0.5194, + "step": 14161 + }, + { + "epoch": 0.9595501050206654, + "grad_norm": 7.248497486114502, + "learning_rate": 8.164008487918407e-05, + "loss": 0.603, + "step": 14162 + }, + { + "epoch": 0.9596178602886375, + "grad_norm": 5.80116081237793, + "learning_rate": 8.163871586008626e-05, + "loss": 0.7965, + "step": 14163 + }, + { + "epoch": 0.9596856155566095, + "grad_norm": 7.90059232711792, + "learning_rate": 8.163734684098844e-05, + "loss": 1.0141, + "step": 14164 + }, + { + "epoch": 0.9597533708245816, + "grad_norm": 4.742366790771484, + "learning_rate": 8.163597782189062e-05, + "loss": 0.8223, + "step": 14165 + }, + { + "epoch": 0.9598211260925537, + "grad_norm": 7.762453079223633, + "learning_rate": 8.16346088027928e-05, + "loss": 0.7047, + "step": 14166 + }, + { + "epoch": 0.9598888813605257, + "grad_norm": 5.143554210662842, + "learning_rate": 8.163323978369498e-05, + "loss": 0.7639, + "step": 14167 + }, + { + "epoch": 0.9599566366284978, + "grad_norm": 6.1197285652160645, + "learning_rate": 8.163187076459718e-05, + "loss": 0.6587, + "step": 14168 + }, + { + "epoch": 0.9600243918964699, + "grad_norm": 5.0258049964904785, + "learning_rate": 8.163050174549936e-05, + "loss": 0.576, + "step": 14169 + }, + { + "epoch": 0.960092147164442, + "grad_norm": 4.709690570831299, + "learning_rate": 8.162913272640154e-05, + "loss": 0.5911, + "step": 14170 + }, + { + "epoch": 0.9601599024324141, + "grad_norm": 5.788050174713135, + "learning_rate": 8.162776370730372e-05, + "loss": 0.7834, + "step": 14171 + }, + { + "epoch": 0.9602276577003862, + "grad_norm": 5.154922008514404, + "learning_rate": 8.162639468820591e-05, + "loss": 0.5965, + "step": 14172 + }, + { + "epoch": 0.9602954129683583, + "grad_norm": 8.04469108581543, + "learning_rate": 8.16250256691081e-05, + "loss": 0.722, + "step": 14173 + }, + { + "epoch": 0.9603631682363304, + "grad_norm": 6.361427307128906, + "learning_rate": 8.162365665001027e-05, + "loss": 0.478, + "step": 14174 + }, + { + "epoch": 0.9604309235043025, + "grad_norm": 6.577165603637695, + "learning_rate": 8.162228763091245e-05, + "loss": 0.9066, + "step": 14175 + }, + { + "epoch": 0.9604986787722746, + "grad_norm": 6.255192756652832, + "learning_rate": 8.162091861181463e-05, + "loss": 0.8372, + "step": 14176 + }, + { + "epoch": 0.9605664340402467, + "grad_norm": 7.014744758605957, + "learning_rate": 8.161954959271683e-05, + "loss": 0.9053, + "step": 14177 + }, + { + "epoch": 0.9606341893082188, + "grad_norm": 5.058319091796875, + "learning_rate": 8.161818057361901e-05, + "loss": 0.7486, + "step": 14178 + }, + { + "epoch": 0.9607019445761908, + "grad_norm": 6.558164119720459, + "learning_rate": 8.161681155452119e-05, + "loss": 0.8439, + "step": 14179 + }, + { + "epoch": 0.9607696998441629, + "grad_norm": 6.055545330047607, + "learning_rate": 8.161544253542337e-05, + "loss": 0.6836, + "step": 14180 + }, + { + "epoch": 0.9608374551121349, + "grad_norm": 6.022161483764648, + "learning_rate": 8.161407351632556e-05, + "loss": 0.7918, + "step": 14181 + }, + { + "epoch": 0.960905210380107, + "grad_norm": 5.69798469543457, + "learning_rate": 8.161270449722774e-05, + "loss": 0.9815, + "step": 14182 + }, + { + "epoch": 0.9609729656480791, + "grad_norm": 4.769881248474121, + "learning_rate": 8.161133547812992e-05, + "loss": 0.7156, + "step": 14183 + }, + { + "epoch": 0.9610407209160512, + "grad_norm": 5.956376552581787, + "learning_rate": 8.16099664590321e-05, + "loss": 0.8751, + "step": 14184 + }, + { + "epoch": 0.9611084761840233, + "grad_norm": 5.9965128898620605, + "learning_rate": 8.160859743993428e-05, + "loss": 0.808, + "step": 14185 + }, + { + "epoch": 0.9611762314519954, + "grad_norm": 5.199033737182617, + "learning_rate": 8.160722842083648e-05, + "loss": 0.6829, + "step": 14186 + }, + { + "epoch": 0.9612439867199675, + "grad_norm": 6.520019054412842, + "learning_rate": 8.160585940173866e-05, + "loss": 0.8844, + "step": 14187 + }, + { + "epoch": 0.9613117419879396, + "grad_norm": 5.944149017333984, + "learning_rate": 8.160449038264084e-05, + "loss": 0.7207, + "step": 14188 + }, + { + "epoch": 0.9613794972559117, + "grad_norm": 5.289858818054199, + "learning_rate": 8.160312136354302e-05, + "loss": 0.7532, + "step": 14189 + }, + { + "epoch": 0.9614472525238837, + "grad_norm": 5.0533223152160645, + "learning_rate": 8.16017523444452e-05, + "loss": 0.9016, + "step": 14190 + }, + { + "epoch": 0.9615150077918558, + "grad_norm": 5.705595016479492, + "learning_rate": 8.16003833253474e-05, + "loss": 0.6944, + "step": 14191 + }, + { + "epoch": 0.9615827630598279, + "grad_norm": 5.3832292556762695, + "learning_rate": 8.159901430624957e-05, + "loss": 0.6348, + "step": 14192 + }, + { + "epoch": 0.9616505183278, + "grad_norm": 5.7924041748046875, + "learning_rate": 8.159764528715175e-05, + "loss": 0.7927, + "step": 14193 + }, + { + "epoch": 0.9617182735957721, + "grad_norm": 5.289419174194336, + "learning_rate": 8.159627626805393e-05, + "loss": 0.7989, + "step": 14194 + }, + { + "epoch": 0.9617860288637442, + "grad_norm": 7.650732040405273, + "learning_rate": 8.159490724895613e-05, + "loss": 0.82, + "step": 14195 + }, + { + "epoch": 0.9618537841317163, + "grad_norm": 6.727295398712158, + "learning_rate": 8.159353822985831e-05, + "loss": 0.7943, + "step": 14196 + }, + { + "epoch": 0.9619215393996883, + "grad_norm": 5.865251064300537, + "learning_rate": 8.159216921076049e-05, + "loss": 0.8241, + "step": 14197 + }, + { + "epoch": 0.9619892946676604, + "grad_norm": 6.261574745178223, + "learning_rate": 8.159080019166267e-05, + "loss": 0.8547, + "step": 14198 + }, + { + "epoch": 0.9620570499356325, + "grad_norm": 5.138889789581299, + "learning_rate": 8.158943117256485e-05, + "loss": 0.806, + "step": 14199 + }, + { + "epoch": 0.9621248052036045, + "grad_norm": 6.540297508239746, + "learning_rate": 8.158806215346704e-05, + "loss": 0.783, + "step": 14200 + }, + { + "epoch": 0.9621925604715766, + "grad_norm": 5.47922945022583, + "learning_rate": 8.158669313436922e-05, + "loss": 0.9455, + "step": 14201 + }, + { + "epoch": 0.9622603157395487, + "grad_norm": 4.53643798828125, + "learning_rate": 8.15853241152714e-05, + "loss": 0.6482, + "step": 14202 + }, + { + "epoch": 0.9623280710075208, + "grad_norm": 7.245009422302246, + "learning_rate": 8.15839550961736e-05, + "loss": 1.0062, + "step": 14203 + }, + { + "epoch": 0.9623958262754929, + "grad_norm": 5.578246116638184, + "learning_rate": 8.158258607707578e-05, + "loss": 0.7564, + "step": 14204 + }, + { + "epoch": 0.962463581543465, + "grad_norm": 5.920526027679443, + "learning_rate": 8.158121705797796e-05, + "loss": 0.6843, + "step": 14205 + }, + { + "epoch": 0.9625313368114371, + "grad_norm": 6.317756652832031, + "learning_rate": 8.157984803888015e-05, + "loss": 0.8409, + "step": 14206 + }, + { + "epoch": 0.9625990920794092, + "grad_norm": 7.818577766418457, + "learning_rate": 8.157847901978233e-05, + "loss": 0.8203, + "step": 14207 + }, + { + "epoch": 0.9626668473473813, + "grad_norm": 6.75308084487915, + "learning_rate": 8.157711000068451e-05, + "loss": 0.6391, + "step": 14208 + }, + { + "epoch": 0.9627346026153534, + "grad_norm": 8.25421142578125, + "learning_rate": 8.157574098158671e-05, + "loss": 0.7555, + "step": 14209 + }, + { + "epoch": 0.9628023578833255, + "grad_norm": 7.660693168640137, + "learning_rate": 8.157437196248889e-05, + "loss": 0.9737, + "step": 14210 + }, + { + "epoch": 0.9628701131512976, + "grad_norm": 5.611537456512451, + "learning_rate": 8.157300294339107e-05, + "loss": 0.835, + "step": 14211 + }, + { + "epoch": 0.9629378684192696, + "grad_norm": 6.340275287628174, + "learning_rate": 8.157163392429325e-05, + "loss": 0.6979, + "step": 14212 + }, + { + "epoch": 0.9630056236872416, + "grad_norm": 7.211668968200684, + "learning_rate": 8.157026490519543e-05, + "loss": 0.8135, + "step": 14213 + }, + { + "epoch": 0.9630733789552137, + "grad_norm": 7.507893085479736, + "learning_rate": 8.156889588609762e-05, + "loss": 1.0498, + "step": 14214 + }, + { + "epoch": 0.9631411342231858, + "grad_norm": 6.938470840454102, + "learning_rate": 8.15675268669998e-05, + "loss": 0.6888, + "step": 14215 + }, + { + "epoch": 0.9632088894911579, + "grad_norm": 6.91562557220459, + "learning_rate": 8.156615784790198e-05, + "loss": 0.6908, + "step": 14216 + }, + { + "epoch": 0.96327664475913, + "grad_norm": 5.775163650512695, + "learning_rate": 8.156478882880416e-05, + "loss": 0.7053, + "step": 14217 + }, + { + "epoch": 0.9633444000271021, + "grad_norm": 6.454747676849365, + "learning_rate": 8.156341980970636e-05, + "loss": 0.7105, + "step": 14218 + }, + { + "epoch": 0.9634121552950742, + "grad_norm": 6.036716461181641, + "learning_rate": 8.156205079060854e-05, + "loss": 1.0272, + "step": 14219 + }, + { + "epoch": 0.9634799105630463, + "grad_norm": 5.749178886413574, + "learning_rate": 8.156068177151072e-05, + "loss": 0.8823, + "step": 14220 + }, + { + "epoch": 0.9635476658310184, + "grad_norm": 6.171823978424072, + "learning_rate": 8.15593127524129e-05, + "loss": 0.6209, + "step": 14221 + }, + { + "epoch": 0.9636154210989905, + "grad_norm": 6.761941432952881, + "learning_rate": 8.155794373331508e-05, + "loss": 0.7299, + "step": 14222 + }, + { + "epoch": 0.9636831763669625, + "grad_norm": 4.620570659637451, + "learning_rate": 8.155657471421727e-05, + "loss": 0.9088, + "step": 14223 + }, + { + "epoch": 0.9637509316349346, + "grad_norm": 5.813077449798584, + "learning_rate": 8.155520569511945e-05, + "loss": 0.7737, + "step": 14224 + }, + { + "epoch": 0.9638186869029067, + "grad_norm": 6.989836692810059, + "learning_rate": 8.155383667602163e-05, + "loss": 1.0168, + "step": 14225 + }, + { + "epoch": 0.9638864421708788, + "grad_norm": 8.76766586303711, + "learning_rate": 8.155246765692381e-05, + "loss": 0.7488, + "step": 14226 + }, + { + "epoch": 0.9639541974388509, + "grad_norm": 6.669302940368652, + "learning_rate": 8.155109863782601e-05, + "loss": 0.8634, + "step": 14227 + }, + { + "epoch": 0.964021952706823, + "grad_norm": 6.353033065795898, + "learning_rate": 8.154972961872819e-05, + "loss": 0.8018, + "step": 14228 + }, + { + "epoch": 0.964089707974795, + "grad_norm": 7.134089946746826, + "learning_rate": 8.154836059963037e-05, + "loss": 0.7599, + "step": 14229 + }, + { + "epoch": 0.9641574632427671, + "grad_norm": 5.467618465423584, + "learning_rate": 8.154699158053255e-05, + "loss": 0.7725, + "step": 14230 + }, + { + "epoch": 0.9642252185107392, + "grad_norm": 7.044497489929199, + "learning_rate": 8.154562256143473e-05, + "loss": 0.9365, + "step": 14231 + }, + { + "epoch": 0.9642929737787113, + "grad_norm": 5.421668529510498, + "learning_rate": 8.154425354233692e-05, + "loss": 0.7032, + "step": 14232 + }, + { + "epoch": 0.9643607290466834, + "grad_norm": 7.38834285736084, + "learning_rate": 8.15428845232391e-05, + "loss": 0.8996, + "step": 14233 + }, + { + "epoch": 0.9644284843146554, + "grad_norm": 6.052585124969482, + "learning_rate": 8.154151550414128e-05, + "loss": 0.7436, + "step": 14234 + }, + { + "epoch": 0.9644962395826275, + "grad_norm": 5.307525634765625, + "learning_rate": 8.154014648504346e-05, + "loss": 0.6881, + "step": 14235 + }, + { + "epoch": 0.9645639948505996, + "grad_norm": 5.4196062088012695, + "learning_rate": 8.153877746594566e-05, + "loss": 0.8157, + "step": 14236 + }, + { + "epoch": 0.9646317501185717, + "grad_norm": 7.922184467315674, + "learning_rate": 8.153740844684784e-05, + "loss": 0.7146, + "step": 14237 + }, + { + "epoch": 0.9646995053865438, + "grad_norm": 6.831099510192871, + "learning_rate": 8.153603942775002e-05, + "loss": 0.7294, + "step": 14238 + }, + { + "epoch": 0.9647672606545159, + "grad_norm": 4.776399612426758, + "learning_rate": 8.15346704086522e-05, + "loss": 0.6228, + "step": 14239 + }, + { + "epoch": 0.964835015922488, + "grad_norm": 6.230729103088379, + "learning_rate": 8.153330138955438e-05, + "loss": 0.8572, + "step": 14240 + }, + { + "epoch": 0.9649027711904601, + "grad_norm": 5.276001453399658, + "learning_rate": 8.153193237045657e-05, + "loss": 0.7314, + "step": 14241 + }, + { + "epoch": 0.9649705264584322, + "grad_norm": 7.109437465667725, + "learning_rate": 8.153056335135875e-05, + "loss": 1.0352, + "step": 14242 + }, + { + "epoch": 0.9650382817264043, + "grad_norm": 7.070680141448975, + "learning_rate": 8.152919433226093e-05, + "loss": 0.8926, + "step": 14243 + }, + { + "epoch": 0.9651060369943764, + "grad_norm": 6.073431015014648, + "learning_rate": 8.152782531316311e-05, + "loss": 0.7816, + "step": 14244 + }, + { + "epoch": 0.9651737922623485, + "grad_norm": 8.69691276550293, + "learning_rate": 8.15264562940653e-05, + "loss": 0.9329, + "step": 14245 + }, + { + "epoch": 0.9652415475303204, + "grad_norm": 5.673532962799072, + "learning_rate": 8.152508727496749e-05, + "loss": 0.7321, + "step": 14246 + }, + { + "epoch": 0.9653093027982925, + "grad_norm": 5.062224864959717, + "learning_rate": 8.152371825586967e-05, + "loss": 0.7234, + "step": 14247 + }, + { + "epoch": 0.9653770580662646, + "grad_norm": 7.115236282348633, + "learning_rate": 8.152234923677185e-05, + "loss": 0.6885, + "step": 14248 + }, + { + "epoch": 0.9654448133342367, + "grad_norm": 5.9798173904418945, + "learning_rate": 8.152098021767404e-05, + "loss": 0.8179, + "step": 14249 + }, + { + "epoch": 0.9655125686022088, + "grad_norm": 6.661346435546875, + "learning_rate": 8.151961119857622e-05, + "loss": 0.7467, + "step": 14250 + }, + { + "epoch": 0.9655803238701809, + "grad_norm": 6.935898780822754, + "learning_rate": 8.15182421794784e-05, + "loss": 0.6645, + "step": 14251 + }, + { + "epoch": 0.965648079138153, + "grad_norm": 4.401814937591553, + "learning_rate": 8.15168731603806e-05, + "loss": 0.5323, + "step": 14252 + }, + { + "epoch": 0.9657158344061251, + "grad_norm": 5.884891033172607, + "learning_rate": 8.151550414128278e-05, + "loss": 0.7988, + "step": 14253 + }, + { + "epoch": 0.9657835896740972, + "grad_norm": 5.859108924865723, + "learning_rate": 8.151413512218496e-05, + "loss": 0.7108, + "step": 14254 + }, + { + "epoch": 0.9658513449420693, + "grad_norm": 6.580816745758057, + "learning_rate": 8.151276610308715e-05, + "loss": 0.8701, + "step": 14255 + }, + { + "epoch": 0.9659191002100413, + "grad_norm": 5.882786750793457, + "learning_rate": 8.151139708398933e-05, + "loss": 0.735, + "step": 14256 + }, + { + "epoch": 0.9659868554780134, + "grad_norm": 6.60660982131958, + "learning_rate": 8.151002806489151e-05, + "loss": 0.7883, + "step": 14257 + }, + { + "epoch": 0.9660546107459855, + "grad_norm": 6.070260047912598, + "learning_rate": 8.15086590457937e-05, + "loss": 0.763, + "step": 14258 + }, + { + "epoch": 0.9661223660139576, + "grad_norm": 5.992532730102539, + "learning_rate": 8.150729002669589e-05, + "loss": 1.0009, + "step": 14259 + }, + { + "epoch": 0.9661901212819297, + "grad_norm": 5.648770332336426, + "learning_rate": 8.150592100759807e-05, + "loss": 0.6503, + "step": 14260 + }, + { + "epoch": 0.9662578765499018, + "grad_norm": 7.13828706741333, + "learning_rate": 8.150455198850025e-05, + "loss": 0.6251, + "step": 14261 + }, + { + "epoch": 0.9663256318178738, + "grad_norm": 5.505036354064941, + "learning_rate": 8.150318296940243e-05, + "loss": 0.5837, + "step": 14262 + }, + { + "epoch": 0.9663933870858459, + "grad_norm": 7.71685791015625, + "learning_rate": 8.150181395030461e-05, + "loss": 0.6676, + "step": 14263 + }, + { + "epoch": 0.966461142353818, + "grad_norm": 5.185730934143066, + "learning_rate": 8.15004449312068e-05, + "loss": 0.6651, + "step": 14264 + }, + { + "epoch": 0.9665288976217901, + "grad_norm": 6.014042854309082, + "learning_rate": 8.149907591210898e-05, + "loss": 0.6928, + "step": 14265 + }, + { + "epoch": 0.9665966528897622, + "grad_norm": 6.797776699066162, + "learning_rate": 8.149770689301116e-05, + "loss": 0.6754, + "step": 14266 + }, + { + "epoch": 0.9666644081577342, + "grad_norm": 4.643877029418945, + "learning_rate": 8.149633787391334e-05, + "loss": 0.4717, + "step": 14267 + }, + { + "epoch": 0.9667321634257063, + "grad_norm": 5.930227756500244, + "learning_rate": 8.149496885481552e-05, + "loss": 0.6709, + "step": 14268 + }, + { + "epoch": 0.9667999186936784, + "grad_norm": 6.046914100646973, + "learning_rate": 8.149359983571772e-05, + "loss": 0.7147, + "step": 14269 + }, + { + "epoch": 0.9668676739616505, + "grad_norm": 6.127531051635742, + "learning_rate": 8.14922308166199e-05, + "loss": 0.6944, + "step": 14270 + }, + { + "epoch": 0.9669354292296226, + "grad_norm": 7.445454120635986, + "learning_rate": 8.149086179752208e-05, + "loss": 0.7084, + "step": 14271 + }, + { + "epoch": 0.9670031844975947, + "grad_norm": 5.6586995124816895, + "learning_rate": 8.148949277842426e-05, + "loss": 0.7989, + "step": 14272 + }, + { + "epoch": 0.9670709397655668, + "grad_norm": 7.164182186126709, + "learning_rate": 8.148812375932645e-05, + "loss": 0.9223, + "step": 14273 + }, + { + "epoch": 0.9671386950335389, + "grad_norm": 5.000169277191162, + "learning_rate": 8.148675474022863e-05, + "loss": 0.723, + "step": 14274 + }, + { + "epoch": 0.967206450301511, + "grad_norm": 6.657342433929443, + "learning_rate": 8.148538572113081e-05, + "loss": 0.6303, + "step": 14275 + }, + { + "epoch": 0.9672742055694831, + "grad_norm": 6.077153205871582, + "learning_rate": 8.1484016702033e-05, + "loss": 0.8003, + "step": 14276 + }, + { + "epoch": 0.9673419608374552, + "grad_norm": 6.259696960449219, + "learning_rate": 8.148264768293517e-05, + "loss": 0.8303, + "step": 14277 + }, + { + "epoch": 0.9674097161054271, + "grad_norm": 5.3037190437316895, + "learning_rate": 8.148127866383737e-05, + "loss": 0.5536, + "step": 14278 + }, + { + "epoch": 0.9674774713733992, + "grad_norm": 4.652920722961426, + "learning_rate": 8.147990964473955e-05, + "loss": 0.5946, + "step": 14279 + }, + { + "epoch": 0.9675452266413713, + "grad_norm": 8.112478256225586, + "learning_rate": 8.147854062564173e-05, + "loss": 0.7329, + "step": 14280 + }, + { + "epoch": 0.9676129819093434, + "grad_norm": 6.562613487243652, + "learning_rate": 8.147717160654391e-05, + "loss": 0.81, + "step": 14281 + }, + { + "epoch": 0.9676807371773155, + "grad_norm": 4.556642532348633, + "learning_rate": 8.14758025874461e-05, + "loss": 0.925, + "step": 14282 + }, + { + "epoch": 0.9677484924452876, + "grad_norm": 8.244071006774902, + "learning_rate": 8.147443356834828e-05, + "loss": 1.117, + "step": 14283 + }, + { + "epoch": 0.9678162477132597, + "grad_norm": 7.206400394439697, + "learning_rate": 8.147306454925046e-05, + "loss": 0.5954, + "step": 14284 + }, + { + "epoch": 0.9678840029812318, + "grad_norm": 6.174105644226074, + "learning_rate": 8.147169553015264e-05, + "loss": 0.8997, + "step": 14285 + }, + { + "epoch": 0.9679517582492039, + "grad_norm": 6.076737880706787, + "learning_rate": 8.147032651105482e-05, + "loss": 0.7188, + "step": 14286 + }, + { + "epoch": 0.968019513517176, + "grad_norm": 6.2391462326049805, + "learning_rate": 8.146895749195702e-05, + "loss": 0.8216, + "step": 14287 + }, + { + "epoch": 0.9680872687851481, + "grad_norm": 6.028003215789795, + "learning_rate": 8.14675884728592e-05, + "loss": 0.8959, + "step": 14288 + }, + { + "epoch": 0.9681550240531202, + "grad_norm": 7.618561744689941, + "learning_rate": 8.146621945376138e-05, + "loss": 0.7853, + "step": 14289 + }, + { + "epoch": 0.9682227793210922, + "grad_norm": 5.370663642883301, + "learning_rate": 8.146485043466356e-05, + "loss": 0.7142, + "step": 14290 + }, + { + "epoch": 0.9682905345890643, + "grad_norm": 5.558692455291748, + "learning_rate": 8.146348141556574e-05, + "loss": 0.7017, + "step": 14291 + }, + { + "epoch": 0.9683582898570364, + "grad_norm": 4.1195902824401855, + "learning_rate": 8.146211239646793e-05, + "loss": 0.6091, + "step": 14292 + }, + { + "epoch": 0.9684260451250085, + "grad_norm": 4.796550750732422, + "learning_rate": 8.146074337737011e-05, + "loss": 0.6297, + "step": 14293 + }, + { + "epoch": 0.9684938003929806, + "grad_norm": 5.585738658905029, + "learning_rate": 8.14593743582723e-05, + "loss": 0.7508, + "step": 14294 + }, + { + "epoch": 0.9685615556609526, + "grad_norm": 7.204619407653809, + "learning_rate": 8.145800533917449e-05, + "loss": 0.8782, + "step": 14295 + }, + { + "epoch": 0.9686293109289247, + "grad_norm": 6.494380474090576, + "learning_rate": 8.145663632007667e-05, + "loss": 0.8831, + "step": 14296 + }, + { + "epoch": 0.9686970661968968, + "grad_norm": 4.968986511230469, + "learning_rate": 8.145526730097885e-05, + "loss": 0.6463, + "step": 14297 + }, + { + "epoch": 0.9687648214648689, + "grad_norm": 4.912354946136475, + "learning_rate": 8.145389828188104e-05, + "loss": 0.8504, + "step": 14298 + }, + { + "epoch": 0.968832576732841, + "grad_norm": 8.093084335327148, + "learning_rate": 8.145252926278322e-05, + "loss": 0.7451, + "step": 14299 + }, + { + "epoch": 0.968900332000813, + "grad_norm": 4.853938579559326, + "learning_rate": 8.14511602436854e-05, + "loss": 0.7233, + "step": 14300 + }, + { + "epoch": 0.9689680872687851, + "grad_norm": 5.394782066345215, + "learning_rate": 8.14497912245876e-05, + "loss": 0.9122, + "step": 14301 + }, + { + "epoch": 0.9690358425367572, + "grad_norm": 5.5748820304870605, + "learning_rate": 8.144842220548978e-05, + "loss": 0.9087, + "step": 14302 + }, + { + "epoch": 0.9691035978047293, + "grad_norm": 5.095332145690918, + "learning_rate": 8.144705318639196e-05, + "loss": 0.8094, + "step": 14303 + }, + { + "epoch": 0.9691713530727014, + "grad_norm": 6.252867221832275, + "learning_rate": 8.144568416729414e-05, + "loss": 0.8471, + "step": 14304 + }, + { + "epoch": 0.9692391083406735, + "grad_norm": 5.3329949378967285, + "learning_rate": 8.144431514819633e-05, + "loss": 0.7303, + "step": 14305 + }, + { + "epoch": 0.9693068636086456, + "grad_norm": 6.976050853729248, + "learning_rate": 8.144294612909851e-05, + "loss": 0.7868, + "step": 14306 + }, + { + "epoch": 0.9693746188766177, + "grad_norm": 6.1792073249816895, + "learning_rate": 8.144157711000069e-05, + "loss": 0.9973, + "step": 14307 + }, + { + "epoch": 0.9694423741445898, + "grad_norm": 4.349170684814453, + "learning_rate": 8.144020809090287e-05, + "loss": 0.7458, + "step": 14308 + }, + { + "epoch": 0.9695101294125619, + "grad_norm": 7.618136405944824, + "learning_rate": 8.143883907180505e-05, + "loss": 0.6679, + "step": 14309 + }, + { + "epoch": 0.969577884680534, + "grad_norm": 5.811389923095703, + "learning_rate": 8.143747005270725e-05, + "loss": 0.6621, + "step": 14310 + }, + { + "epoch": 0.9696456399485059, + "grad_norm": 7.1004252433776855, + "learning_rate": 8.143610103360943e-05, + "loss": 0.9404, + "step": 14311 + }, + { + "epoch": 0.969713395216478, + "grad_norm": 6.3730998039245605, + "learning_rate": 8.143473201451161e-05, + "loss": 0.9286, + "step": 14312 + }, + { + "epoch": 0.9697811504844501, + "grad_norm": 5.840987205505371, + "learning_rate": 8.143336299541379e-05, + "loss": 0.6846, + "step": 14313 + }, + { + "epoch": 0.9698489057524222, + "grad_norm": 6.054294109344482, + "learning_rate": 8.143199397631598e-05, + "loss": 0.836, + "step": 14314 + }, + { + "epoch": 0.9699166610203943, + "grad_norm": 5.046802043914795, + "learning_rate": 8.143062495721816e-05, + "loss": 0.7202, + "step": 14315 + }, + { + "epoch": 0.9699844162883664, + "grad_norm": 4.955052375793457, + "learning_rate": 8.142925593812034e-05, + "loss": 0.7018, + "step": 14316 + }, + { + "epoch": 0.9700521715563385, + "grad_norm": 5.141872882843018, + "learning_rate": 8.142788691902252e-05, + "loss": 0.817, + "step": 14317 + }, + { + "epoch": 0.9701199268243106, + "grad_norm": 9.178304672241211, + "learning_rate": 8.14265178999247e-05, + "loss": 0.6665, + "step": 14318 + }, + { + "epoch": 0.9701876820922827, + "grad_norm": 9.323168754577637, + "learning_rate": 8.14251488808269e-05, + "loss": 0.997, + "step": 14319 + }, + { + "epoch": 0.9702554373602548, + "grad_norm": 5.158806800842285, + "learning_rate": 8.142377986172908e-05, + "loss": 0.619, + "step": 14320 + }, + { + "epoch": 0.9703231926282269, + "grad_norm": 4.141146183013916, + "learning_rate": 8.142241084263126e-05, + "loss": 0.6202, + "step": 14321 + }, + { + "epoch": 0.970390947896199, + "grad_norm": 6.7444634437561035, + "learning_rate": 8.142104182353344e-05, + "loss": 0.9239, + "step": 14322 + }, + { + "epoch": 0.970458703164171, + "grad_norm": 6.318787574768066, + "learning_rate": 8.141967280443562e-05, + "loss": 0.6277, + "step": 14323 + }, + { + "epoch": 0.9705264584321431, + "grad_norm": 7.356907844543457, + "learning_rate": 8.141830378533781e-05, + "loss": 0.8639, + "step": 14324 + }, + { + "epoch": 0.9705942137001152, + "grad_norm": 6.62352180480957, + "learning_rate": 8.141693476623999e-05, + "loss": 0.8497, + "step": 14325 + }, + { + "epoch": 0.9706619689680873, + "grad_norm": 7.7815093994140625, + "learning_rate": 8.141556574714217e-05, + "loss": 0.788, + "step": 14326 + }, + { + "epoch": 0.9707297242360593, + "grad_norm": 5.715222358703613, + "learning_rate": 8.141419672804435e-05, + "loss": 0.6403, + "step": 14327 + }, + { + "epoch": 0.9707974795040314, + "grad_norm": 6.145988464355469, + "learning_rate": 8.141282770894655e-05, + "loss": 0.4776, + "step": 14328 + }, + { + "epoch": 0.9708652347720035, + "grad_norm": 5.906881809234619, + "learning_rate": 8.141145868984873e-05, + "loss": 0.6636, + "step": 14329 + }, + { + "epoch": 0.9709329900399756, + "grad_norm": 4.619365215301514, + "learning_rate": 8.141008967075091e-05, + "loss": 0.678, + "step": 14330 + }, + { + "epoch": 0.9710007453079477, + "grad_norm": 6.293912410736084, + "learning_rate": 8.140872065165309e-05, + "loss": 0.6734, + "step": 14331 + }, + { + "epoch": 0.9710685005759198, + "grad_norm": 8.143105506896973, + "learning_rate": 8.140735163255527e-05, + "loss": 0.708, + "step": 14332 + }, + { + "epoch": 0.9711362558438918, + "grad_norm": 5.470721244812012, + "learning_rate": 8.140598261345746e-05, + "loss": 0.9017, + "step": 14333 + }, + { + "epoch": 0.9712040111118639, + "grad_norm": 6.059875011444092, + "learning_rate": 8.140461359435964e-05, + "loss": 0.8164, + "step": 14334 + }, + { + "epoch": 0.971271766379836, + "grad_norm": 6.589235782623291, + "learning_rate": 8.140324457526182e-05, + "loss": 0.9063, + "step": 14335 + }, + { + "epoch": 0.9713395216478081, + "grad_norm": 5.400428771972656, + "learning_rate": 8.1401875556164e-05, + "loss": 0.5756, + "step": 14336 + }, + { + "epoch": 0.9714072769157802, + "grad_norm": 4.511440277099609, + "learning_rate": 8.14005065370662e-05, + "loss": 0.6725, + "step": 14337 + }, + { + "epoch": 0.9714750321837523, + "grad_norm": 8.179146766662598, + "learning_rate": 8.139913751796838e-05, + "loss": 0.8878, + "step": 14338 + }, + { + "epoch": 0.9715427874517244, + "grad_norm": 9.839139938354492, + "learning_rate": 8.139776849887056e-05, + "loss": 0.9057, + "step": 14339 + }, + { + "epoch": 0.9716105427196965, + "grad_norm": 4.975765705108643, + "learning_rate": 8.139639947977274e-05, + "loss": 0.694, + "step": 14340 + }, + { + "epoch": 0.9716782979876686, + "grad_norm": 4.968737602233887, + "learning_rate": 8.139503046067492e-05, + "loss": 0.5297, + "step": 14341 + }, + { + "epoch": 0.9717460532556407, + "grad_norm": 7.143984317779541, + "learning_rate": 8.139366144157711e-05, + "loss": 0.8471, + "step": 14342 + }, + { + "epoch": 0.9718138085236128, + "grad_norm": 6.1333818435668945, + "learning_rate": 8.13922924224793e-05, + "loss": 0.8179, + "step": 14343 + }, + { + "epoch": 0.9718815637915847, + "grad_norm": 5.501560688018799, + "learning_rate": 8.139092340338147e-05, + "loss": 0.6908, + "step": 14344 + }, + { + "epoch": 0.9719493190595568, + "grad_norm": 5.844865322113037, + "learning_rate": 8.138955438428367e-05, + "loss": 0.7405, + "step": 14345 + }, + { + "epoch": 0.9720170743275289, + "grad_norm": 8.380400657653809, + "learning_rate": 8.138818536518585e-05, + "loss": 0.9078, + "step": 14346 + }, + { + "epoch": 0.972084829595501, + "grad_norm": 5.638479232788086, + "learning_rate": 8.138681634608803e-05, + "loss": 0.7455, + "step": 14347 + }, + { + "epoch": 0.9721525848634731, + "grad_norm": 8.631559371948242, + "learning_rate": 8.138544732699022e-05, + "loss": 0.7743, + "step": 14348 + }, + { + "epoch": 0.9722203401314452, + "grad_norm": 4.673583507537842, + "learning_rate": 8.13840783078924e-05, + "loss": 0.5662, + "step": 14349 + }, + { + "epoch": 0.9722880953994173, + "grad_norm": 4.844860553741455, + "learning_rate": 8.138270928879458e-05, + "loss": 0.6779, + "step": 14350 + }, + { + "epoch": 0.9723558506673894, + "grad_norm": 5.8541154861450195, + "learning_rate": 8.138134026969678e-05, + "loss": 0.7505, + "step": 14351 + }, + { + "epoch": 0.9724236059353615, + "grad_norm": 4.8828654289245605, + "learning_rate": 8.137997125059896e-05, + "loss": 0.7733, + "step": 14352 + }, + { + "epoch": 0.9724913612033336, + "grad_norm": 5.3108086585998535, + "learning_rate": 8.137860223150114e-05, + "loss": 0.7705, + "step": 14353 + }, + { + "epoch": 0.9725591164713057, + "grad_norm": 6.572815418243408, + "learning_rate": 8.137723321240332e-05, + "loss": 1.0102, + "step": 14354 + }, + { + "epoch": 0.9726268717392778, + "grad_norm": 6.047853946685791, + "learning_rate": 8.13758641933055e-05, + "loss": 0.9682, + "step": 14355 + }, + { + "epoch": 0.9726946270072498, + "grad_norm": 5.171698093414307, + "learning_rate": 8.137449517420769e-05, + "loss": 0.8997, + "step": 14356 + }, + { + "epoch": 0.9727623822752219, + "grad_norm": 5.896270275115967, + "learning_rate": 8.137312615510987e-05, + "loss": 0.9272, + "step": 14357 + }, + { + "epoch": 0.972830137543194, + "grad_norm": 6.361771583557129, + "learning_rate": 8.137175713601205e-05, + "loss": 0.8364, + "step": 14358 + }, + { + "epoch": 0.9728978928111661, + "grad_norm": 6.05178165435791, + "learning_rate": 8.137038811691423e-05, + "loss": 0.9812, + "step": 14359 + }, + { + "epoch": 0.9729656480791381, + "grad_norm": 5.797706127166748, + "learning_rate": 8.136901909781643e-05, + "loss": 0.8025, + "step": 14360 + }, + { + "epoch": 0.9730334033471102, + "grad_norm": 5.319764614105225, + "learning_rate": 8.136765007871861e-05, + "loss": 0.708, + "step": 14361 + }, + { + "epoch": 0.9731011586150823, + "grad_norm": 6.920981407165527, + "learning_rate": 8.136628105962079e-05, + "loss": 0.6947, + "step": 14362 + }, + { + "epoch": 0.9731689138830544, + "grad_norm": 8.11839771270752, + "learning_rate": 8.136491204052297e-05, + "loss": 0.8238, + "step": 14363 + }, + { + "epoch": 0.9732366691510265, + "grad_norm": 5.530993938446045, + "learning_rate": 8.136354302142515e-05, + "loss": 0.5501, + "step": 14364 + }, + { + "epoch": 0.9733044244189986, + "grad_norm": 5.251955986022949, + "learning_rate": 8.136217400232734e-05, + "loss": 0.706, + "step": 14365 + }, + { + "epoch": 0.9733721796869707, + "grad_norm": 6.496428966522217, + "learning_rate": 8.136080498322952e-05, + "loss": 0.8281, + "step": 14366 + }, + { + "epoch": 0.9734399349549427, + "grad_norm": 4.871181488037109, + "learning_rate": 8.13594359641317e-05, + "loss": 0.7147, + "step": 14367 + }, + { + "epoch": 0.9735076902229148, + "grad_norm": 5.397392272949219, + "learning_rate": 8.135806694503388e-05, + "loss": 0.733, + "step": 14368 + }, + { + "epoch": 0.9735754454908869, + "grad_norm": 5.09192419052124, + "learning_rate": 8.135669792593608e-05, + "loss": 0.8182, + "step": 14369 + }, + { + "epoch": 0.973643200758859, + "grad_norm": 7.045880317687988, + "learning_rate": 8.135532890683826e-05, + "loss": 0.8304, + "step": 14370 + }, + { + "epoch": 0.9737109560268311, + "grad_norm": 5.559905529022217, + "learning_rate": 8.135395988774044e-05, + "loss": 0.7637, + "step": 14371 + }, + { + "epoch": 0.9737787112948032, + "grad_norm": 4.846694469451904, + "learning_rate": 8.135259086864262e-05, + "loss": 0.7953, + "step": 14372 + }, + { + "epoch": 0.9738464665627753, + "grad_norm": 4.448090076446533, + "learning_rate": 8.13512218495448e-05, + "loss": 0.7446, + "step": 14373 + }, + { + "epoch": 0.9739142218307474, + "grad_norm": 5.447312355041504, + "learning_rate": 8.134985283044699e-05, + "loss": 0.8034, + "step": 14374 + }, + { + "epoch": 0.9739819770987195, + "grad_norm": 6.372121334075928, + "learning_rate": 8.134848381134917e-05, + "loss": 0.7034, + "step": 14375 + }, + { + "epoch": 0.9740497323666915, + "grad_norm": 6.085412979125977, + "learning_rate": 8.134711479225135e-05, + "loss": 0.7508, + "step": 14376 + }, + { + "epoch": 0.9741174876346635, + "grad_norm": 5.742150783538818, + "learning_rate": 8.134574577315353e-05, + "loss": 0.8239, + "step": 14377 + }, + { + "epoch": 0.9741852429026356, + "grad_norm": 5.154967308044434, + "learning_rate": 8.134437675405571e-05, + "loss": 0.6667, + "step": 14378 + }, + { + "epoch": 0.9742529981706077, + "grad_norm": 6.060741901397705, + "learning_rate": 8.134300773495791e-05, + "loss": 0.803, + "step": 14379 + }, + { + "epoch": 0.9743207534385798, + "grad_norm": 6.090625286102295, + "learning_rate": 8.134163871586009e-05, + "loss": 0.8115, + "step": 14380 + }, + { + "epoch": 0.9743885087065519, + "grad_norm": 6.097843170166016, + "learning_rate": 8.134026969676227e-05, + "loss": 0.7386, + "step": 14381 + }, + { + "epoch": 0.974456263974524, + "grad_norm": 6.539214611053467, + "learning_rate": 8.133890067766445e-05, + "loss": 0.6162, + "step": 14382 + }, + { + "epoch": 0.9745240192424961, + "grad_norm": 5.923181056976318, + "learning_rate": 8.133753165856664e-05, + "loss": 1.0001, + "step": 14383 + }, + { + "epoch": 0.9745917745104682, + "grad_norm": 6.29371976852417, + "learning_rate": 8.133616263946882e-05, + "loss": 0.7975, + "step": 14384 + }, + { + "epoch": 0.9746595297784403, + "grad_norm": 6.171021461486816, + "learning_rate": 8.1334793620371e-05, + "loss": 0.8991, + "step": 14385 + }, + { + "epoch": 0.9747272850464124, + "grad_norm": 5.814321041107178, + "learning_rate": 8.133342460127318e-05, + "loss": 0.525, + "step": 14386 + }, + { + "epoch": 0.9747950403143845, + "grad_norm": 5.774600028991699, + "learning_rate": 8.133205558217536e-05, + "loss": 0.7866, + "step": 14387 + }, + { + "epoch": 0.9748627955823566, + "grad_norm": 5.909730434417725, + "learning_rate": 8.133068656307756e-05, + "loss": 0.6867, + "step": 14388 + }, + { + "epoch": 0.9749305508503286, + "grad_norm": 6.84520149230957, + "learning_rate": 8.132931754397974e-05, + "loss": 0.7958, + "step": 14389 + }, + { + "epoch": 0.9749983061183007, + "grad_norm": 6.0649518966674805, + "learning_rate": 8.132794852488192e-05, + "loss": 0.8553, + "step": 14390 + }, + { + "epoch": 0.9750660613862728, + "grad_norm": 6.470561504364014, + "learning_rate": 8.132657950578411e-05, + "loss": 0.7563, + "step": 14391 + }, + { + "epoch": 0.9751338166542449, + "grad_norm": 5.470592021942139, + "learning_rate": 8.132521048668629e-05, + "loss": 0.7057, + "step": 14392 + }, + { + "epoch": 0.9752015719222169, + "grad_norm": 5.857933044433594, + "learning_rate": 8.132384146758847e-05, + "loss": 0.6859, + "step": 14393 + }, + { + "epoch": 0.975269327190189, + "grad_norm": 6.267986297607422, + "learning_rate": 8.132247244849067e-05, + "loss": 0.6744, + "step": 14394 + }, + { + "epoch": 0.9753370824581611, + "grad_norm": 6.363813400268555, + "learning_rate": 8.132110342939285e-05, + "loss": 0.7194, + "step": 14395 + }, + { + "epoch": 0.9754048377261332, + "grad_norm": 5.119122505187988, + "learning_rate": 8.131973441029503e-05, + "loss": 0.7292, + "step": 14396 + }, + { + "epoch": 0.9754725929941053, + "grad_norm": 5.589879035949707, + "learning_rate": 8.131836539119722e-05, + "loss": 0.786, + "step": 14397 + }, + { + "epoch": 0.9755403482620774, + "grad_norm": 6.809702396392822, + "learning_rate": 8.13169963720994e-05, + "loss": 0.7465, + "step": 14398 + }, + { + "epoch": 0.9756081035300495, + "grad_norm": 5.51494026184082, + "learning_rate": 8.131562735300158e-05, + "loss": 0.7014, + "step": 14399 + }, + { + "epoch": 0.9756758587980215, + "grad_norm": 5.632194519042969, + "learning_rate": 8.131425833390376e-05, + "loss": 0.7217, + "step": 14400 + }, + { + "epoch": 0.9757436140659936, + "grad_norm": 4.676552772521973, + "learning_rate": 8.131288931480594e-05, + "loss": 0.6789, + "step": 14401 + }, + { + "epoch": 0.9758113693339657, + "grad_norm": 5.2599945068359375, + "learning_rate": 8.131152029570814e-05, + "loss": 0.6691, + "step": 14402 + }, + { + "epoch": 0.9758791246019378, + "grad_norm": 5.05983829498291, + "learning_rate": 8.131015127661032e-05, + "loss": 0.6848, + "step": 14403 + }, + { + "epoch": 0.9759468798699099, + "grad_norm": 5.274669170379639, + "learning_rate": 8.13087822575125e-05, + "loss": 0.7596, + "step": 14404 + }, + { + "epoch": 0.976014635137882, + "grad_norm": 6.312376976013184, + "learning_rate": 8.130741323841468e-05, + "loss": 0.6008, + "step": 14405 + }, + { + "epoch": 0.9760823904058541, + "grad_norm": 5.092833995819092, + "learning_rate": 8.130604421931687e-05, + "loss": 0.7497, + "step": 14406 + }, + { + "epoch": 0.9761501456738262, + "grad_norm": 4.330989837646484, + "learning_rate": 8.130467520021905e-05, + "loss": 0.6639, + "step": 14407 + }, + { + "epoch": 0.9762179009417983, + "grad_norm": 6.680692195892334, + "learning_rate": 8.130330618112123e-05, + "loss": 0.9758, + "step": 14408 + }, + { + "epoch": 0.9762856562097703, + "grad_norm": 4.872468948364258, + "learning_rate": 8.130193716202341e-05, + "loss": 0.5678, + "step": 14409 + }, + { + "epoch": 0.9763534114777424, + "grad_norm": 5.9608659744262695, + "learning_rate": 8.130056814292559e-05, + "loss": 0.766, + "step": 14410 + }, + { + "epoch": 0.9764211667457144, + "grad_norm": 8.0074462890625, + "learning_rate": 8.129919912382779e-05, + "loss": 0.808, + "step": 14411 + }, + { + "epoch": 0.9764889220136865, + "grad_norm": 6.391083240509033, + "learning_rate": 8.129783010472997e-05, + "loss": 0.9077, + "step": 14412 + }, + { + "epoch": 0.9765566772816586, + "grad_norm": 9.006216049194336, + "learning_rate": 8.129646108563215e-05, + "loss": 0.8002, + "step": 14413 + }, + { + "epoch": 0.9766244325496307, + "grad_norm": 5.454526901245117, + "learning_rate": 8.129509206653433e-05, + "loss": 0.662, + "step": 14414 + }, + { + "epoch": 0.9766921878176028, + "grad_norm": 5.372074604034424, + "learning_rate": 8.129372304743652e-05, + "loss": 0.7083, + "step": 14415 + }, + { + "epoch": 0.9767599430855749, + "grad_norm": 5.040616035461426, + "learning_rate": 8.12923540283387e-05, + "loss": 0.7178, + "step": 14416 + }, + { + "epoch": 0.976827698353547, + "grad_norm": 6.438399314880371, + "learning_rate": 8.129098500924088e-05, + "loss": 0.9576, + "step": 14417 + }, + { + "epoch": 0.9768954536215191, + "grad_norm": 5.687475204467773, + "learning_rate": 8.128961599014306e-05, + "loss": 0.8086, + "step": 14418 + }, + { + "epoch": 0.9769632088894912, + "grad_norm": 5.61614990234375, + "learning_rate": 8.128824697104524e-05, + "loss": 0.7917, + "step": 14419 + }, + { + "epoch": 0.9770309641574633, + "grad_norm": 6.389354228973389, + "learning_rate": 8.128687795194744e-05, + "loss": 0.7769, + "step": 14420 + }, + { + "epoch": 0.9770987194254354, + "grad_norm": 7.1694231033325195, + "learning_rate": 8.128550893284962e-05, + "loss": 0.6114, + "step": 14421 + }, + { + "epoch": 0.9771664746934074, + "grad_norm": 6.265122890472412, + "learning_rate": 8.12841399137518e-05, + "loss": 0.6074, + "step": 14422 + }, + { + "epoch": 0.9772342299613795, + "grad_norm": 9.472161293029785, + "learning_rate": 8.128277089465398e-05, + "loss": 0.5208, + "step": 14423 + }, + { + "epoch": 0.9773019852293516, + "grad_norm": 4.76262903213501, + "learning_rate": 8.128140187555616e-05, + "loss": 0.7851, + "step": 14424 + }, + { + "epoch": 0.9773697404973236, + "grad_norm": 4.94804048538208, + "learning_rate": 8.128003285645835e-05, + "loss": 0.7913, + "step": 14425 + }, + { + "epoch": 0.9774374957652957, + "grad_norm": 5.631475448608398, + "learning_rate": 8.127866383736053e-05, + "loss": 0.7001, + "step": 14426 + }, + { + "epoch": 0.9775052510332678, + "grad_norm": 6.504068851470947, + "learning_rate": 8.127729481826271e-05, + "loss": 0.6949, + "step": 14427 + }, + { + "epoch": 0.9775730063012399, + "grad_norm": 4.534459114074707, + "learning_rate": 8.12759257991649e-05, + "loss": 0.6107, + "step": 14428 + }, + { + "epoch": 0.977640761569212, + "grad_norm": 5.583250045776367, + "learning_rate": 8.127455678006709e-05, + "loss": 0.6784, + "step": 14429 + }, + { + "epoch": 0.9777085168371841, + "grad_norm": 5.706220626831055, + "learning_rate": 8.127318776096927e-05, + "loss": 0.7571, + "step": 14430 + }, + { + "epoch": 0.9777762721051562, + "grad_norm": 5.701255798339844, + "learning_rate": 8.127181874187145e-05, + "loss": 0.7379, + "step": 14431 + }, + { + "epoch": 0.9778440273731283, + "grad_norm": 5.034109592437744, + "learning_rate": 8.127044972277363e-05, + "loss": 0.6254, + "step": 14432 + }, + { + "epoch": 0.9779117826411003, + "grad_norm": 7.113419532775879, + "learning_rate": 8.126908070367581e-05, + "loss": 0.7667, + "step": 14433 + }, + { + "epoch": 0.9779795379090724, + "grad_norm": 5.958342552185059, + "learning_rate": 8.1267711684578e-05, + "loss": 0.8242, + "step": 14434 + }, + { + "epoch": 0.9780472931770445, + "grad_norm": 5.358188629150391, + "learning_rate": 8.126634266548018e-05, + "loss": 0.8029, + "step": 14435 + }, + { + "epoch": 0.9781150484450166, + "grad_norm": 5.703382968902588, + "learning_rate": 8.126497364638236e-05, + "loss": 0.64, + "step": 14436 + }, + { + "epoch": 0.9781828037129887, + "grad_norm": 6.205333709716797, + "learning_rate": 8.126360462728456e-05, + "loss": 0.7054, + "step": 14437 + }, + { + "epoch": 0.9782505589809608, + "grad_norm": 6.106006145477295, + "learning_rate": 8.126223560818674e-05, + "loss": 0.7458, + "step": 14438 + }, + { + "epoch": 0.9783183142489329, + "grad_norm": 7.8420305252075195, + "learning_rate": 8.126086658908892e-05, + "loss": 0.9453, + "step": 14439 + }, + { + "epoch": 0.978386069516905, + "grad_norm": 5.559987545013428, + "learning_rate": 8.125949756999111e-05, + "loss": 0.5765, + "step": 14440 + }, + { + "epoch": 0.978453824784877, + "grad_norm": 6.2737040519714355, + "learning_rate": 8.125812855089329e-05, + "loss": 0.6614, + "step": 14441 + }, + { + "epoch": 0.9785215800528491, + "grad_norm": 6.900593280792236, + "learning_rate": 8.125675953179547e-05, + "loss": 0.8438, + "step": 14442 + }, + { + "epoch": 0.9785893353208212, + "grad_norm": 5.86058235168457, + "learning_rate": 8.125539051269767e-05, + "loss": 0.7445, + "step": 14443 + }, + { + "epoch": 0.9786570905887932, + "grad_norm": 4.0302348136901855, + "learning_rate": 8.125402149359985e-05, + "loss": 0.767, + "step": 14444 + }, + { + "epoch": 0.9787248458567653, + "grad_norm": 5.090617656707764, + "learning_rate": 8.125265247450203e-05, + "loss": 0.6529, + "step": 14445 + }, + { + "epoch": 0.9787926011247374, + "grad_norm": 5.470541477203369, + "learning_rate": 8.125128345540421e-05, + "loss": 0.7689, + "step": 14446 + }, + { + "epoch": 0.9788603563927095, + "grad_norm": 5.9749627113342285, + "learning_rate": 8.12499144363064e-05, + "loss": 1.0099, + "step": 14447 + }, + { + "epoch": 0.9789281116606816, + "grad_norm": 5.537027359008789, + "learning_rate": 8.124854541720858e-05, + "loss": 0.6033, + "step": 14448 + }, + { + "epoch": 0.9789958669286537, + "grad_norm": 4.773642063140869, + "learning_rate": 8.124717639811076e-05, + "loss": 0.6899, + "step": 14449 + }, + { + "epoch": 0.9790636221966258, + "grad_norm": 6.153696537017822, + "learning_rate": 8.124580737901294e-05, + "loss": 0.9249, + "step": 14450 + }, + { + "epoch": 0.9791313774645979, + "grad_norm": 5.920269966125488, + "learning_rate": 8.124443835991512e-05, + "loss": 0.7868, + "step": 14451 + }, + { + "epoch": 0.97919913273257, + "grad_norm": 5.916412830352783, + "learning_rate": 8.124306934081732e-05, + "loss": 0.6885, + "step": 14452 + }, + { + "epoch": 0.9792668880005421, + "grad_norm": 7.684385299682617, + "learning_rate": 8.12417003217195e-05, + "loss": 0.8591, + "step": 14453 + }, + { + "epoch": 0.9793346432685142, + "grad_norm": 4.769680500030518, + "learning_rate": 8.124033130262168e-05, + "loss": 0.59, + "step": 14454 + }, + { + "epoch": 0.9794023985364863, + "grad_norm": 6.372145175933838, + "learning_rate": 8.123896228352386e-05, + "loss": 0.9633, + "step": 14455 + }, + { + "epoch": 0.9794701538044583, + "grad_norm": 4.667510509490967, + "learning_rate": 8.123759326442604e-05, + "loss": 0.6554, + "step": 14456 + }, + { + "epoch": 0.9795379090724304, + "grad_norm": 6.719541549682617, + "learning_rate": 8.123622424532823e-05, + "loss": 0.6992, + "step": 14457 + }, + { + "epoch": 0.9796056643404024, + "grad_norm": 5.012288570404053, + "learning_rate": 8.123485522623041e-05, + "loss": 0.6368, + "step": 14458 + }, + { + "epoch": 0.9796734196083745, + "grad_norm": 5.666974067687988, + "learning_rate": 8.123348620713259e-05, + "loss": 0.737, + "step": 14459 + }, + { + "epoch": 0.9797411748763466, + "grad_norm": 6.177326679229736, + "learning_rate": 8.123211718803477e-05, + "loss": 0.8604, + "step": 14460 + }, + { + "epoch": 0.9798089301443187, + "grad_norm": 6.877957344055176, + "learning_rate": 8.123074816893697e-05, + "loss": 0.7917, + "step": 14461 + }, + { + "epoch": 0.9798766854122908, + "grad_norm": 7.843241214752197, + "learning_rate": 8.122937914983915e-05, + "loss": 0.68, + "step": 14462 + }, + { + "epoch": 0.9799444406802629, + "grad_norm": 5.025186061859131, + "learning_rate": 8.122801013074133e-05, + "loss": 0.7137, + "step": 14463 + }, + { + "epoch": 0.980012195948235, + "grad_norm": 5.27938175201416, + "learning_rate": 8.122664111164351e-05, + "loss": 0.7029, + "step": 14464 + }, + { + "epoch": 0.9800799512162071, + "grad_norm": 6.0738301277160645, + "learning_rate": 8.122527209254569e-05, + "loss": 0.7922, + "step": 14465 + }, + { + "epoch": 0.9801477064841791, + "grad_norm": 7.217299461364746, + "learning_rate": 8.122390307344788e-05, + "loss": 0.6941, + "step": 14466 + }, + { + "epoch": 0.9802154617521512, + "grad_norm": 4.96075439453125, + "learning_rate": 8.122253405435006e-05, + "loss": 0.6182, + "step": 14467 + }, + { + "epoch": 0.9802832170201233, + "grad_norm": 5.550745964050293, + "learning_rate": 8.122116503525224e-05, + "loss": 0.7081, + "step": 14468 + }, + { + "epoch": 0.9803509722880954, + "grad_norm": 6.0965752601623535, + "learning_rate": 8.121979601615442e-05, + "loss": 0.6694, + "step": 14469 + }, + { + "epoch": 0.9804187275560675, + "grad_norm": 6.101776123046875, + "learning_rate": 8.121842699705662e-05, + "loss": 0.7803, + "step": 14470 + }, + { + "epoch": 0.9804864828240396, + "grad_norm": 5.098122596740723, + "learning_rate": 8.12170579779588e-05, + "loss": 0.8434, + "step": 14471 + }, + { + "epoch": 0.9805542380920117, + "grad_norm": 4.281574726104736, + "learning_rate": 8.121568895886098e-05, + "loss": 0.7203, + "step": 14472 + }, + { + "epoch": 0.9806219933599838, + "grad_norm": 7.562485694885254, + "learning_rate": 8.121431993976316e-05, + "loss": 0.8041, + "step": 14473 + }, + { + "epoch": 0.9806897486279558, + "grad_norm": 8.38601303100586, + "learning_rate": 8.121295092066534e-05, + "loss": 0.6553, + "step": 14474 + }, + { + "epoch": 0.9807575038959279, + "grad_norm": 7.885288715362549, + "learning_rate": 8.121158190156753e-05, + "loss": 0.9021, + "step": 14475 + }, + { + "epoch": 0.9808252591639, + "grad_norm": 5.9442877769470215, + "learning_rate": 8.121021288246971e-05, + "loss": 0.7343, + "step": 14476 + }, + { + "epoch": 0.980893014431872, + "grad_norm": 7.727574348449707, + "learning_rate": 8.120884386337189e-05, + "loss": 0.9092, + "step": 14477 + }, + { + "epoch": 0.9809607696998441, + "grad_norm": 7.154834270477295, + "learning_rate": 8.120747484427407e-05, + "loss": 0.824, + "step": 14478 + }, + { + "epoch": 0.9810285249678162, + "grad_norm": 5.356253623962402, + "learning_rate": 8.120610582517625e-05, + "loss": 0.7594, + "step": 14479 + }, + { + "epoch": 0.9810962802357883, + "grad_norm": 6.602542400360107, + "learning_rate": 8.120473680607845e-05, + "loss": 0.7641, + "step": 14480 + }, + { + "epoch": 0.9811640355037604, + "grad_norm": 5.2316060066223145, + "learning_rate": 8.120336778698063e-05, + "loss": 0.7131, + "step": 14481 + }, + { + "epoch": 0.9812317907717325, + "grad_norm": 5.679458141326904, + "learning_rate": 8.120199876788281e-05, + "loss": 0.7762, + "step": 14482 + }, + { + "epoch": 0.9812995460397046, + "grad_norm": 6.47125244140625, + "learning_rate": 8.1200629748785e-05, + "loss": 0.7332, + "step": 14483 + }, + { + "epoch": 0.9813673013076767, + "grad_norm": 4.7544169425964355, + "learning_rate": 8.119926072968718e-05, + "loss": 0.7308, + "step": 14484 + }, + { + "epoch": 0.9814350565756488, + "grad_norm": 8.939432144165039, + "learning_rate": 8.119789171058936e-05, + "loss": 0.9312, + "step": 14485 + }, + { + "epoch": 0.9815028118436209, + "grad_norm": 6.649098873138428, + "learning_rate": 8.119652269149156e-05, + "loss": 0.7315, + "step": 14486 + }, + { + "epoch": 0.981570567111593, + "grad_norm": 6.609123706817627, + "learning_rate": 8.119515367239374e-05, + "loss": 0.7891, + "step": 14487 + }, + { + "epoch": 0.981638322379565, + "grad_norm": 4.026298522949219, + "learning_rate": 8.119378465329592e-05, + "loss": 0.6256, + "step": 14488 + }, + { + "epoch": 0.9817060776475371, + "grad_norm": 5.0410075187683105, + "learning_rate": 8.119241563419811e-05, + "loss": 0.5491, + "step": 14489 + }, + { + "epoch": 0.9817738329155091, + "grad_norm": 6.390183448791504, + "learning_rate": 8.119104661510029e-05, + "loss": 0.6793, + "step": 14490 + }, + { + "epoch": 0.9818415881834812, + "grad_norm": 5.242403984069824, + "learning_rate": 8.118967759600247e-05, + "loss": 0.6271, + "step": 14491 + }, + { + "epoch": 0.9819093434514533, + "grad_norm": 6.249171257019043, + "learning_rate": 8.118830857690465e-05, + "loss": 0.8151, + "step": 14492 + }, + { + "epoch": 0.9819770987194254, + "grad_norm": 6.398001194000244, + "learning_rate": 8.118693955780685e-05, + "loss": 0.6908, + "step": 14493 + }, + { + "epoch": 0.9820448539873975, + "grad_norm": 8.341864585876465, + "learning_rate": 8.118557053870903e-05, + "loss": 0.9941, + "step": 14494 + }, + { + "epoch": 0.9821126092553696, + "grad_norm": 7.000919818878174, + "learning_rate": 8.11842015196112e-05, + "loss": 0.751, + "step": 14495 + }, + { + "epoch": 0.9821803645233417, + "grad_norm": 6.331650257110596, + "learning_rate": 8.118283250051339e-05, + "loss": 0.6585, + "step": 14496 + }, + { + "epoch": 0.9822481197913138, + "grad_norm": 6.7721848487854, + "learning_rate": 8.118146348141557e-05, + "loss": 0.8229, + "step": 14497 + }, + { + "epoch": 0.9823158750592859, + "grad_norm": 6.479053497314453, + "learning_rate": 8.118009446231776e-05, + "loss": 0.8373, + "step": 14498 + }, + { + "epoch": 0.982383630327258, + "grad_norm": 6.043035984039307, + "learning_rate": 8.117872544321994e-05, + "loss": 0.7682, + "step": 14499 + }, + { + "epoch": 0.98245138559523, + "grad_norm": 6.7178778648376465, + "learning_rate": 8.117735642412212e-05, + "loss": 0.6542, + "step": 14500 + }, + { + "epoch": 0.9825191408632021, + "grad_norm": 9.148183822631836, + "learning_rate": 8.11759874050243e-05, + "loss": 0.7574, + "step": 14501 + }, + { + "epoch": 0.9825868961311742, + "grad_norm": 5.086976051330566, + "learning_rate": 8.11746183859265e-05, + "loss": 0.6415, + "step": 14502 + }, + { + "epoch": 0.9826546513991463, + "grad_norm": 6.981233596801758, + "learning_rate": 8.117324936682868e-05, + "loss": 1.0351, + "step": 14503 + }, + { + "epoch": 0.9827224066671184, + "grad_norm": 6.20076847076416, + "learning_rate": 8.117188034773086e-05, + "loss": 0.734, + "step": 14504 + }, + { + "epoch": 0.9827901619350905, + "grad_norm": 6.142386436462402, + "learning_rate": 8.117051132863304e-05, + "loss": 0.9636, + "step": 14505 + }, + { + "epoch": 0.9828579172030626, + "grad_norm": 5.595977306365967, + "learning_rate": 8.116914230953522e-05, + "loss": 0.8142, + "step": 14506 + }, + { + "epoch": 0.9829256724710346, + "grad_norm": 5.602009296417236, + "learning_rate": 8.116777329043741e-05, + "loss": 0.8624, + "step": 14507 + }, + { + "epoch": 0.9829934277390067, + "grad_norm": 5.37421989440918, + "learning_rate": 8.116640427133959e-05, + "loss": 0.9434, + "step": 14508 + }, + { + "epoch": 0.9830611830069788, + "grad_norm": 6.307192325592041, + "learning_rate": 8.116503525224177e-05, + "loss": 0.7215, + "step": 14509 + }, + { + "epoch": 0.9831289382749508, + "grad_norm": 5.990005970001221, + "learning_rate": 8.116366623314395e-05, + "loss": 0.6622, + "step": 14510 + }, + { + "epoch": 0.9831966935429229, + "grad_norm": 6.682214736938477, + "learning_rate": 8.116229721404613e-05, + "loss": 0.8302, + "step": 14511 + }, + { + "epoch": 0.983264448810895, + "grad_norm": 8.388869285583496, + "learning_rate": 8.116092819494833e-05, + "loss": 0.948, + "step": 14512 + }, + { + "epoch": 0.9833322040788671, + "grad_norm": 6.352821350097656, + "learning_rate": 8.115955917585051e-05, + "loss": 0.8845, + "step": 14513 + }, + { + "epoch": 0.9833999593468392, + "grad_norm": 5.503759384155273, + "learning_rate": 8.115819015675269e-05, + "loss": 0.6619, + "step": 14514 + }, + { + "epoch": 0.9834677146148113, + "grad_norm": 8.024614334106445, + "learning_rate": 8.115682113765487e-05, + "loss": 0.6846, + "step": 14515 + }, + { + "epoch": 0.9835354698827834, + "grad_norm": 5.9330902099609375, + "learning_rate": 8.115545211855706e-05, + "loss": 0.7577, + "step": 14516 + }, + { + "epoch": 0.9836032251507555, + "grad_norm": 5.600277423858643, + "learning_rate": 8.115408309945924e-05, + "loss": 0.8694, + "step": 14517 + }, + { + "epoch": 0.9836709804187276, + "grad_norm": 5.870060443878174, + "learning_rate": 8.115271408036142e-05, + "loss": 0.7465, + "step": 14518 + }, + { + "epoch": 0.9837387356866997, + "grad_norm": 5.257311820983887, + "learning_rate": 8.11513450612636e-05, + "loss": 0.58, + "step": 14519 + }, + { + "epoch": 0.9838064909546718, + "grad_norm": 5.327232837677002, + "learning_rate": 8.114997604216578e-05, + "loss": 0.823, + "step": 14520 + }, + { + "epoch": 0.9838742462226439, + "grad_norm": 7.09609842300415, + "learning_rate": 8.114860702306798e-05, + "loss": 0.7926, + "step": 14521 + }, + { + "epoch": 0.983942001490616, + "grad_norm": 5.685835838317871, + "learning_rate": 8.114723800397016e-05, + "loss": 0.8741, + "step": 14522 + }, + { + "epoch": 0.9840097567585879, + "grad_norm": 5.486968517303467, + "learning_rate": 8.114586898487234e-05, + "loss": 0.8518, + "step": 14523 + }, + { + "epoch": 0.98407751202656, + "grad_norm": 4.868188858032227, + "learning_rate": 8.114449996577452e-05, + "loss": 0.6013, + "step": 14524 + }, + { + "epoch": 0.9841452672945321, + "grad_norm": 6.388209819793701, + "learning_rate": 8.114313094667671e-05, + "loss": 0.827, + "step": 14525 + }, + { + "epoch": 0.9842130225625042, + "grad_norm": 6.438693523406982, + "learning_rate": 8.114176192757889e-05, + "loss": 0.9702, + "step": 14526 + }, + { + "epoch": 0.9842807778304763, + "grad_norm": 4.985934734344482, + "learning_rate": 8.114039290848107e-05, + "loss": 0.6018, + "step": 14527 + }, + { + "epoch": 0.9843485330984484, + "grad_norm": 6.320969581604004, + "learning_rate": 8.113902388938325e-05, + "loss": 0.7803, + "step": 14528 + }, + { + "epoch": 0.9844162883664205, + "grad_norm": 7.519181728363037, + "learning_rate": 8.113765487028545e-05, + "loss": 0.7211, + "step": 14529 + }, + { + "epoch": 0.9844840436343926, + "grad_norm": 5.232812404632568, + "learning_rate": 8.113628585118763e-05, + "loss": 1.0151, + "step": 14530 + }, + { + "epoch": 0.9845517989023647, + "grad_norm": 6.038102149963379, + "learning_rate": 8.113491683208981e-05, + "loss": 0.8097, + "step": 14531 + }, + { + "epoch": 0.9846195541703368, + "grad_norm": 6.541941165924072, + "learning_rate": 8.1133547812992e-05, + "loss": 0.704, + "step": 14532 + }, + { + "epoch": 0.9846873094383088, + "grad_norm": 6.207378387451172, + "learning_rate": 8.113217879389418e-05, + "loss": 0.6292, + "step": 14533 + }, + { + "epoch": 0.9847550647062809, + "grad_norm": 7.586195945739746, + "learning_rate": 8.113080977479636e-05, + "loss": 0.8351, + "step": 14534 + }, + { + "epoch": 0.984822819974253, + "grad_norm": 5.649599075317383, + "learning_rate": 8.112944075569856e-05, + "loss": 0.7277, + "step": 14535 + }, + { + "epoch": 0.9848905752422251, + "grad_norm": 6.21948766708374, + "learning_rate": 8.112807173660074e-05, + "loss": 0.8101, + "step": 14536 + }, + { + "epoch": 0.9849583305101972, + "grad_norm": 5.665065765380859, + "learning_rate": 8.112670271750292e-05, + "loss": 0.9378, + "step": 14537 + }, + { + "epoch": 0.9850260857781693, + "grad_norm": 6.819377899169922, + "learning_rate": 8.11253336984051e-05, + "loss": 0.8962, + "step": 14538 + }, + { + "epoch": 0.9850938410461413, + "grad_norm": 5.853285789489746, + "learning_rate": 8.112396467930729e-05, + "loss": 0.7442, + "step": 14539 + }, + { + "epoch": 0.9851615963141134, + "grad_norm": 6.921646595001221, + "learning_rate": 8.112259566020947e-05, + "loss": 0.8819, + "step": 14540 + }, + { + "epoch": 0.9852293515820855, + "grad_norm": 5.253473281860352, + "learning_rate": 8.112122664111165e-05, + "loss": 0.8615, + "step": 14541 + }, + { + "epoch": 0.9852971068500576, + "grad_norm": 6.091032981872559, + "learning_rate": 8.111985762201383e-05, + "loss": 0.9232, + "step": 14542 + }, + { + "epoch": 0.9853648621180296, + "grad_norm": 4.946970462799072, + "learning_rate": 8.111848860291601e-05, + "loss": 0.5256, + "step": 14543 + }, + { + "epoch": 0.9854326173860017, + "grad_norm": 7.805113792419434, + "learning_rate": 8.11171195838182e-05, + "loss": 0.7799, + "step": 14544 + }, + { + "epoch": 0.9855003726539738, + "grad_norm": 5.948545932769775, + "learning_rate": 8.111575056472039e-05, + "loss": 0.8107, + "step": 14545 + }, + { + "epoch": 0.9855681279219459, + "grad_norm": 4.266178131103516, + "learning_rate": 8.111438154562257e-05, + "loss": 0.6324, + "step": 14546 + }, + { + "epoch": 0.985635883189918, + "grad_norm": 5.126527309417725, + "learning_rate": 8.111301252652475e-05, + "loss": 0.6852, + "step": 14547 + }, + { + "epoch": 0.9857036384578901, + "grad_norm": 7.069212436676025, + "learning_rate": 8.111164350742694e-05, + "loss": 0.7833, + "step": 14548 + }, + { + "epoch": 0.9857713937258622, + "grad_norm": 4.909544944763184, + "learning_rate": 8.111027448832912e-05, + "loss": 0.7001, + "step": 14549 + }, + { + "epoch": 0.9858391489938343, + "grad_norm": 8.045904159545898, + "learning_rate": 8.11089054692313e-05, + "loss": 0.7565, + "step": 14550 + }, + { + "epoch": 0.9859069042618064, + "grad_norm": 6.734328269958496, + "learning_rate": 8.110753645013348e-05, + "loss": 0.6459, + "step": 14551 + }, + { + "epoch": 0.9859746595297785, + "grad_norm": 6.844288349151611, + "learning_rate": 8.110616743103566e-05, + "loss": 0.6501, + "step": 14552 + }, + { + "epoch": 0.9860424147977506, + "grad_norm": 6.474472522735596, + "learning_rate": 8.110479841193786e-05, + "loss": 0.8652, + "step": 14553 + }, + { + "epoch": 0.9861101700657227, + "grad_norm": 5.087035655975342, + "learning_rate": 8.110342939284004e-05, + "loss": 0.6611, + "step": 14554 + }, + { + "epoch": 0.9861779253336947, + "grad_norm": 5.499162673950195, + "learning_rate": 8.110206037374222e-05, + "loss": 0.7477, + "step": 14555 + }, + { + "epoch": 0.9862456806016667, + "grad_norm": 6.489079475402832, + "learning_rate": 8.11006913546444e-05, + "loss": 0.6523, + "step": 14556 + }, + { + "epoch": 0.9863134358696388, + "grad_norm": 5.147678375244141, + "learning_rate": 8.109932233554658e-05, + "loss": 0.7646, + "step": 14557 + }, + { + "epoch": 0.9863811911376109, + "grad_norm": 6.976253986358643, + "learning_rate": 8.109795331644877e-05, + "loss": 0.7634, + "step": 14558 + }, + { + "epoch": 0.986448946405583, + "grad_norm": 6.211210250854492, + "learning_rate": 8.109658429735095e-05, + "loss": 0.788, + "step": 14559 + }, + { + "epoch": 0.9865167016735551, + "grad_norm": 5.917699813842773, + "learning_rate": 8.109521527825313e-05, + "loss": 0.8491, + "step": 14560 + }, + { + "epoch": 0.9865844569415272, + "grad_norm": 9.201217651367188, + "learning_rate": 8.109384625915531e-05, + "loss": 0.8377, + "step": 14561 + }, + { + "epoch": 0.9866522122094993, + "grad_norm": 6.403718948364258, + "learning_rate": 8.10924772400575e-05, + "loss": 0.991, + "step": 14562 + }, + { + "epoch": 0.9867199674774714, + "grad_norm": 4.908394813537598, + "learning_rate": 8.109110822095969e-05, + "loss": 0.7742, + "step": 14563 + }, + { + "epoch": 0.9867877227454435, + "grad_norm": 5.759329795837402, + "learning_rate": 8.108973920186187e-05, + "loss": 0.682, + "step": 14564 + }, + { + "epoch": 0.9868554780134156, + "grad_norm": 5.077083587646484, + "learning_rate": 8.108837018276405e-05, + "loss": 0.6575, + "step": 14565 + }, + { + "epoch": 0.9869232332813876, + "grad_norm": 6.402769088745117, + "learning_rate": 8.108700116366623e-05, + "loss": 0.6313, + "step": 14566 + }, + { + "epoch": 0.9869909885493597, + "grad_norm": 6.894180774688721, + "learning_rate": 8.108563214456842e-05, + "loss": 0.779, + "step": 14567 + }, + { + "epoch": 0.9870587438173318, + "grad_norm": 7.2585062980651855, + "learning_rate": 8.10842631254706e-05, + "loss": 0.8281, + "step": 14568 + }, + { + "epoch": 0.9871264990853039, + "grad_norm": 4.876138210296631, + "learning_rate": 8.108289410637278e-05, + "loss": 0.6501, + "step": 14569 + }, + { + "epoch": 0.987194254353276, + "grad_norm": 6.457757949829102, + "learning_rate": 8.108152508727496e-05, + "loss": 0.6609, + "step": 14570 + }, + { + "epoch": 0.9872620096212481, + "grad_norm": 5.902544021606445, + "learning_rate": 8.108015606817716e-05, + "loss": 0.8231, + "step": 14571 + }, + { + "epoch": 0.9873297648892201, + "grad_norm": 7.340898513793945, + "learning_rate": 8.107878704907934e-05, + "loss": 0.9017, + "step": 14572 + }, + { + "epoch": 0.9873975201571922, + "grad_norm": 4.826013088226318, + "learning_rate": 8.107741802998152e-05, + "loss": 0.7838, + "step": 14573 + }, + { + "epoch": 0.9874652754251643, + "grad_norm": 6.566142559051514, + "learning_rate": 8.10760490108837e-05, + "loss": 0.8644, + "step": 14574 + }, + { + "epoch": 0.9875330306931364, + "grad_norm": 6.540748119354248, + "learning_rate": 8.107467999178589e-05, + "loss": 0.7461, + "step": 14575 + }, + { + "epoch": 0.9876007859611085, + "grad_norm": 8.179930686950684, + "learning_rate": 8.107331097268807e-05, + "loss": 0.6283, + "step": 14576 + }, + { + "epoch": 0.9876685412290805, + "grad_norm": 6.814093112945557, + "learning_rate": 8.107194195359025e-05, + "loss": 0.6905, + "step": 14577 + }, + { + "epoch": 0.9877362964970526, + "grad_norm": 5.342179775238037, + "learning_rate": 8.107057293449245e-05, + "loss": 0.636, + "step": 14578 + }, + { + "epoch": 0.9878040517650247, + "grad_norm": 4.811430931091309, + "learning_rate": 8.106920391539463e-05, + "loss": 0.7746, + "step": 14579 + }, + { + "epoch": 0.9878718070329968, + "grad_norm": 10.804598808288574, + "learning_rate": 8.10678348962968e-05, + "loss": 0.7885, + "step": 14580 + }, + { + "epoch": 0.9879395623009689, + "grad_norm": 5.6060709953308105, + "learning_rate": 8.1066465877199e-05, + "loss": 1.06, + "step": 14581 + }, + { + "epoch": 0.988007317568941, + "grad_norm": 5.063103675842285, + "learning_rate": 8.106509685810118e-05, + "loss": 0.8395, + "step": 14582 + }, + { + "epoch": 0.9880750728369131, + "grad_norm": 6.959988594055176, + "learning_rate": 8.106372783900336e-05, + "loss": 0.6454, + "step": 14583 + }, + { + "epoch": 0.9881428281048852, + "grad_norm": 8.020045280456543, + "learning_rate": 8.106235881990554e-05, + "loss": 0.7632, + "step": 14584 + }, + { + "epoch": 0.9882105833728573, + "grad_norm": 5.018332481384277, + "learning_rate": 8.106098980080774e-05, + "loss": 0.5049, + "step": 14585 + }, + { + "epoch": 0.9882783386408294, + "grad_norm": 6.3518967628479, + "learning_rate": 8.105962078170992e-05, + "loss": 0.7407, + "step": 14586 + }, + { + "epoch": 0.9883460939088015, + "grad_norm": 6.3573899269104, + "learning_rate": 8.10582517626121e-05, + "loss": 0.8245, + "step": 14587 + }, + { + "epoch": 0.9884138491767734, + "grad_norm": 7.538825988769531, + "learning_rate": 8.105688274351428e-05, + "loss": 1.0365, + "step": 14588 + }, + { + "epoch": 0.9884816044447455, + "grad_norm": 6.961862564086914, + "learning_rate": 8.105551372441646e-05, + "loss": 0.688, + "step": 14589 + }, + { + "epoch": 0.9885493597127176, + "grad_norm": 4.587418556213379, + "learning_rate": 8.105414470531865e-05, + "loss": 0.565, + "step": 14590 + }, + { + "epoch": 0.9886171149806897, + "grad_norm": 6.72902774810791, + "learning_rate": 8.105277568622083e-05, + "loss": 0.7931, + "step": 14591 + }, + { + "epoch": 0.9886848702486618, + "grad_norm": 5.065114974975586, + "learning_rate": 8.105140666712301e-05, + "loss": 0.6943, + "step": 14592 + }, + { + "epoch": 0.9887526255166339, + "grad_norm": 4.853360652923584, + "learning_rate": 8.105003764802519e-05, + "loss": 0.6457, + "step": 14593 + }, + { + "epoch": 0.988820380784606, + "grad_norm": 8.033453941345215, + "learning_rate": 8.104866862892739e-05, + "loss": 0.7982, + "step": 14594 + }, + { + "epoch": 0.9888881360525781, + "grad_norm": 5.311244487762451, + "learning_rate": 8.104729960982957e-05, + "loss": 0.8363, + "step": 14595 + }, + { + "epoch": 0.9889558913205502, + "grad_norm": 6.700189590454102, + "learning_rate": 8.104593059073175e-05, + "loss": 0.9098, + "step": 14596 + }, + { + "epoch": 0.9890236465885223, + "grad_norm": 6.449888706207275, + "learning_rate": 8.104456157163393e-05, + "loss": 0.7272, + "step": 14597 + }, + { + "epoch": 0.9890914018564944, + "grad_norm": 6.6049909591674805, + "learning_rate": 8.104319255253611e-05, + "loss": 0.7782, + "step": 14598 + }, + { + "epoch": 0.9891591571244664, + "grad_norm": 5.873762130737305, + "learning_rate": 8.10418235334383e-05, + "loss": 0.7901, + "step": 14599 + }, + { + "epoch": 0.9892269123924385, + "grad_norm": 5.933529376983643, + "learning_rate": 8.104045451434048e-05, + "loss": 0.7743, + "step": 14600 + }, + { + "epoch": 0.9892946676604106, + "grad_norm": 6.905478477478027, + "learning_rate": 8.103908549524266e-05, + "loss": 0.7662, + "step": 14601 + }, + { + "epoch": 0.9893624229283827, + "grad_norm": 5.39354944229126, + "learning_rate": 8.103771647614484e-05, + "loss": 0.6157, + "step": 14602 + }, + { + "epoch": 0.9894301781963548, + "grad_norm": 6.440433979034424, + "learning_rate": 8.103634745704704e-05, + "loss": 0.8263, + "step": 14603 + }, + { + "epoch": 0.9894979334643269, + "grad_norm": 5.653606414794922, + "learning_rate": 8.103497843794922e-05, + "loss": 0.7081, + "step": 14604 + }, + { + "epoch": 0.9895656887322989, + "grad_norm": 5.021646499633789, + "learning_rate": 8.10336094188514e-05, + "loss": 0.7988, + "step": 14605 + }, + { + "epoch": 0.989633444000271, + "grad_norm": 7.331817150115967, + "learning_rate": 8.103224039975358e-05, + "loss": 0.9294, + "step": 14606 + }, + { + "epoch": 0.9897011992682431, + "grad_norm": 6.1719584465026855, + "learning_rate": 8.103087138065576e-05, + "loss": 0.7923, + "step": 14607 + }, + { + "epoch": 0.9897689545362152, + "grad_norm": 7.179246425628662, + "learning_rate": 8.102950236155795e-05, + "loss": 0.9297, + "step": 14608 + }, + { + "epoch": 0.9898367098041873, + "grad_norm": 5.388153076171875, + "learning_rate": 8.102813334246013e-05, + "loss": 0.7016, + "step": 14609 + }, + { + "epoch": 0.9899044650721593, + "grad_norm": 5.60443115234375, + "learning_rate": 8.102676432336231e-05, + "loss": 0.7853, + "step": 14610 + }, + { + "epoch": 0.9899722203401314, + "grad_norm": 6.330133438110352, + "learning_rate": 8.102539530426449e-05, + "loss": 0.6971, + "step": 14611 + }, + { + "epoch": 0.9900399756081035, + "grad_norm": 8.957503318786621, + "learning_rate": 8.102402628516667e-05, + "loss": 0.7834, + "step": 14612 + }, + { + "epoch": 0.9901077308760756, + "grad_norm": 6.6696929931640625, + "learning_rate": 8.102265726606887e-05, + "loss": 0.8092, + "step": 14613 + }, + { + "epoch": 0.9901754861440477, + "grad_norm": 6.236315727233887, + "learning_rate": 8.102128824697105e-05, + "loss": 0.8904, + "step": 14614 + }, + { + "epoch": 0.9902432414120198, + "grad_norm": 4.87846040725708, + "learning_rate": 8.101991922787323e-05, + "loss": 0.6445, + "step": 14615 + }, + { + "epoch": 0.9903109966799919, + "grad_norm": 6.903649806976318, + "learning_rate": 8.101855020877541e-05, + "loss": 0.6749, + "step": 14616 + }, + { + "epoch": 0.990378751947964, + "grad_norm": 5.653696537017822, + "learning_rate": 8.10171811896776e-05, + "loss": 0.4365, + "step": 14617 + }, + { + "epoch": 0.9904465072159361, + "grad_norm": 6.188183784484863, + "learning_rate": 8.101581217057978e-05, + "loss": 0.8174, + "step": 14618 + }, + { + "epoch": 0.9905142624839082, + "grad_norm": 5.165365695953369, + "learning_rate": 8.101444315148196e-05, + "loss": 0.6121, + "step": 14619 + }, + { + "epoch": 0.9905820177518803, + "grad_norm": 7.228616237640381, + "learning_rate": 8.101307413238414e-05, + "loss": 0.5768, + "step": 14620 + }, + { + "epoch": 0.9906497730198522, + "grad_norm": 6.415277004241943, + "learning_rate": 8.101170511328632e-05, + "loss": 0.67, + "step": 14621 + }, + { + "epoch": 0.9907175282878243, + "grad_norm": 6.051969528198242, + "learning_rate": 8.101033609418852e-05, + "loss": 0.8459, + "step": 14622 + }, + { + "epoch": 0.9907852835557964, + "grad_norm": 5.808727741241455, + "learning_rate": 8.10089670750907e-05, + "loss": 0.738, + "step": 14623 + }, + { + "epoch": 0.9908530388237685, + "grad_norm": 7.00636625289917, + "learning_rate": 8.100759805599288e-05, + "loss": 0.6573, + "step": 14624 + }, + { + "epoch": 0.9909207940917406, + "grad_norm": 6.8472900390625, + "learning_rate": 8.100622903689507e-05, + "loss": 1.0119, + "step": 14625 + }, + { + "epoch": 0.9909885493597127, + "grad_norm": 6.012302875518799, + "learning_rate": 8.100486001779725e-05, + "loss": 0.5468, + "step": 14626 + }, + { + "epoch": 0.9910563046276848, + "grad_norm": 4.420849323272705, + "learning_rate": 8.100349099869943e-05, + "loss": 0.5134, + "step": 14627 + }, + { + "epoch": 0.9911240598956569, + "grad_norm": 5.315083980560303, + "learning_rate": 8.100212197960163e-05, + "loss": 0.803, + "step": 14628 + }, + { + "epoch": 0.991191815163629, + "grad_norm": 8.18484115600586, + "learning_rate": 8.10007529605038e-05, + "loss": 1.0685, + "step": 14629 + }, + { + "epoch": 0.9912595704316011, + "grad_norm": 6.134372711181641, + "learning_rate": 8.099938394140599e-05, + "loss": 0.8893, + "step": 14630 + }, + { + "epoch": 0.9913273256995732, + "grad_norm": 7.056315898895264, + "learning_rate": 8.099801492230818e-05, + "loss": 1.0268, + "step": 14631 + }, + { + "epoch": 0.9913950809675453, + "grad_norm": 7.75009298324585, + "learning_rate": 8.099664590321036e-05, + "loss": 0.8632, + "step": 14632 + }, + { + "epoch": 0.9914628362355173, + "grad_norm": 5.707569122314453, + "learning_rate": 8.099527688411254e-05, + "loss": 0.7005, + "step": 14633 + }, + { + "epoch": 0.9915305915034894, + "grad_norm": 5.789709091186523, + "learning_rate": 8.099390786501472e-05, + "loss": 0.9823, + "step": 14634 + }, + { + "epoch": 0.9915983467714615, + "grad_norm": 7.6483941078186035, + "learning_rate": 8.099253884591692e-05, + "loss": 0.7865, + "step": 14635 + }, + { + "epoch": 0.9916661020394336, + "grad_norm": 6.485616683959961, + "learning_rate": 8.09911698268191e-05, + "loss": 0.7863, + "step": 14636 + }, + { + "epoch": 0.9917338573074056, + "grad_norm": 4.72672176361084, + "learning_rate": 8.098980080772128e-05, + "loss": 0.6081, + "step": 14637 + }, + { + "epoch": 0.9918016125753777, + "grad_norm": 5.638705253601074, + "learning_rate": 8.098843178862346e-05, + "loss": 0.773, + "step": 14638 + }, + { + "epoch": 0.9918693678433498, + "grad_norm": 5.565254211425781, + "learning_rate": 8.098706276952564e-05, + "loss": 0.7205, + "step": 14639 + }, + { + "epoch": 0.9919371231113219, + "grad_norm": 5.114184379577637, + "learning_rate": 8.098569375042783e-05, + "loss": 0.6835, + "step": 14640 + }, + { + "epoch": 0.992004878379294, + "grad_norm": 6.048708438873291, + "learning_rate": 8.098432473133001e-05, + "loss": 0.8163, + "step": 14641 + }, + { + "epoch": 0.9920726336472661, + "grad_norm": 7.4688215255737305, + "learning_rate": 8.098295571223219e-05, + "loss": 0.5373, + "step": 14642 + }, + { + "epoch": 0.9921403889152381, + "grad_norm": 5.477198600769043, + "learning_rate": 8.098158669313437e-05, + "loss": 0.8911, + "step": 14643 + }, + { + "epoch": 0.9922081441832102, + "grad_norm": 5.453397274017334, + "learning_rate": 8.098021767403655e-05, + "loss": 0.7153, + "step": 14644 + }, + { + "epoch": 0.9922758994511823, + "grad_norm": 7.451064586639404, + "learning_rate": 8.097884865493875e-05, + "loss": 0.7384, + "step": 14645 + }, + { + "epoch": 0.9923436547191544, + "grad_norm": 6.131191253662109, + "learning_rate": 8.097747963584093e-05, + "loss": 0.7987, + "step": 14646 + }, + { + "epoch": 0.9924114099871265, + "grad_norm": 6.377108573913574, + "learning_rate": 8.09761106167431e-05, + "loss": 0.7305, + "step": 14647 + }, + { + "epoch": 0.9924791652550986, + "grad_norm": 7.368172645568848, + "learning_rate": 8.097474159764529e-05, + "loss": 0.7361, + "step": 14648 + }, + { + "epoch": 0.9925469205230707, + "grad_norm": 5.075170993804932, + "learning_rate": 8.097337257854748e-05, + "loss": 0.8124, + "step": 14649 + }, + { + "epoch": 0.9926146757910428, + "grad_norm": 4.833024501800537, + "learning_rate": 8.097200355944966e-05, + "loss": 0.5609, + "step": 14650 + }, + { + "epoch": 0.9926824310590149, + "grad_norm": 5.358253002166748, + "learning_rate": 8.097063454035184e-05, + "loss": 0.6095, + "step": 14651 + }, + { + "epoch": 0.992750186326987, + "grad_norm": 5.228174209594727, + "learning_rate": 8.096926552125402e-05, + "loss": 0.8636, + "step": 14652 + }, + { + "epoch": 0.992817941594959, + "grad_norm": 5.842618465423584, + "learning_rate": 8.09678965021562e-05, + "loss": 0.6013, + "step": 14653 + }, + { + "epoch": 0.992885696862931, + "grad_norm": 6.454665184020996, + "learning_rate": 8.09665274830584e-05, + "loss": 0.7059, + "step": 14654 + }, + { + "epoch": 0.9929534521309031, + "grad_norm": 4.836513042449951, + "learning_rate": 8.096515846396058e-05, + "loss": 0.7923, + "step": 14655 + }, + { + "epoch": 0.9930212073988752, + "grad_norm": 5.523789405822754, + "learning_rate": 8.096378944486276e-05, + "loss": 0.733, + "step": 14656 + }, + { + "epoch": 0.9930889626668473, + "grad_norm": 6.529998779296875, + "learning_rate": 8.096242042576494e-05, + "loss": 0.7117, + "step": 14657 + }, + { + "epoch": 0.9931567179348194, + "grad_norm": 4.7176408767700195, + "learning_rate": 8.096105140666713e-05, + "loss": 0.603, + "step": 14658 + }, + { + "epoch": 0.9932244732027915, + "grad_norm": 5.4773993492126465, + "learning_rate": 8.095968238756931e-05, + "loss": 0.6101, + "step": 14659 + }, + { + "epoch": 0.9932922284707636, + "grad_norm": 5.661228179931641, + "learning_rate": 8.095831336847149e-05, + "loss": 0.7999, + "step": 14660 + }, + { + "epoch": 0.9933599837387357, + "grad_norm": 6.280917644500732, + "learning_rate": 8.095694434937367e-05, + "loss": 0.6691, + "step": 14661 + }, + { + "epoch": 0.9934277390067078, + "grad_norm": 7.072371959686279, + "learning_rate": 8.095557533027585e-05, + "loss": 0.8523, + "step": 14662 + }, + { + "epoch": 0.9934954942746799, + "grad_norm": 5.228603363037109, + "learning_rate": 8.095420631117805e-05, + "loss": 0.6841, + "step": 14663 + }, + { + "epoch": 0.993563249542652, + "grad_norm": 6.489750862121582, + "learning_rate": 8.095283729208023e-05, + "loss": 0.9927, + "step": 14664 + }, + { + "epoch": 0.993631004810624, + "grad_norm": 6.142980575561523, + "learning_rate": 8.09514682729824e-05, + "loss": 0.62, + "step": 14665 + }, + { + "epoch": 0.9936987600785961, + "grad_norm": 6.388856887817383, + "learning_rate": 8.095009925388459e-05, + "loss": 0.6953, + "step": 14666 + }, + { + "epoch": 0.9937665153465682, + "grad_norm": 6.71579647064209, + "learning_rate": 8.094873023478677e-05, + "loss": 0.6856, + "step": 14667 + }, + { + "epoch": 0.9938342706145403, + "grad_norm": 4.24629545211792, + "learning_rate": 8.094736121568896e-05, + "loss": 0.9145, + "step": 14668 + }, + { + "epoch": 0.9939020258825124, + "grad_norm": 5.748496055603027, + "learning_rate": 8.094599219659114e-05, + "loss": 0.8127, + "step": 14669 + }, + { + "epoch": 0.9939697811504844, + "grad_norm": 6.51283597946167, + "learning_rate": 8.094462317749332e-05, + "loss": 0.9989, + "step": 14670 + }, + { + "epoch": 0.9940375364184565, + "grad_norm": 5.742320537567139, + "learning_rate": 8.094325415839552e-05, + "loss": 0.7456, + "step": 14671 + }, + { + "epoch": 0.9941052916864286, + "grad_norm": 5.277899265289307, + "learning_rate": 8.09418851392977e-05, + "loss": 0.7498, + "step": 14672 + }, + { + "epoch": 0.9941730469544007, + "grad_norm": 5.872328281402588, + "learning_rate": 8.094051612019988e-05, + "loss": 0.7343, + "step": 14673 + }, + { + "epoch": 0.9942408022223728, + "grad_norm": 5.946358680725098, + "learning_rate": 8.093914710110207e-05, + "loss": 0.7802, + "step": 14674 + }, + { + "epoch": 0.9943085574903449, + "grad_norm": 6.713497161865234, + "learning_rate": 8.093777808200425e-05, + "loss": 0.9639, + "step": 14675 + }, + { + "epoch": 0.994376312758317, + "grad_norm": 5.236899375915527, + "learning_rate": 8.093640906290643e-05, + "loss": 0.7111, + "step": 14676 + }, + { + "epoch": 0.994444068026289, + "grad_norm": 6.121020317077637, + "learning_rate": 8.093504004380862e-05, + "loss": 0.8627, + "step": 14677 + }, + { + "epoch": 0.9945118232942611, + "grad_norm": 6.684620380401611, + "learning_rate": 8.09336710247108e-05, + "loss": 0.8863, + "step": 14678 + }, + { + "epoch": 0.9945795785622332, + "grad_norm": 5.249447345733643, + "learning_rate": 8.093230200561299e-05, + "loss": 0.6532, + "step": 14679 + }, + { + "epoch": 0.9946473338302053, + "grad_norm": 5.633591651916504, + "learning_rate": 8.093093298651517e-05, + "loss": 0.8192, + "step": 14680 + }, + { + "epoch": 0.9947150890981774, + "grad_norm": 5.300892353057861, + "learning_rate": 8.092956396741736e-05, + "loss": 0.6628, + "step": 14681 + }, + { + "epoch": 0.9947828443661495, + "grad_norm": 6.91768741607666, + "learning_rate": 8.092819494831954e-05, + "loss": 0.9256, + "step": 14682 + }, + { + "epoch": 0.9948505996341216, + "grad_norm": 6.193354606628418, + "learning_rate": 8.092682592922172e-05, + "loss": 0.8907, + "step": 14683 + }, + { + "epoch": 0.9949183549020937, + "grad_norm": 5.23224401473999, + "learning_rate": 8.09254569101239e-05, + "loss": 0.6606, + "step": 14684 + }, + { + "epoch": 0.9949861101700658, + "grad_norm": 5.615985870361328, + "learning_rate": 8.092408789102608e-05, + "loss": 0.7438, + "step": 14685 + }, + { + "epoch": 0.9950538654380378, + "grad_norm": 4.997095584869385, + "learning_rate": 8.092271887192828e-05, + "loss": 0.6359, + "step": 14686 + }, + { + "epoch": 0.9951216207060098, + "grad_norm": 4.995532989501953, + "learning_rate": 8.092134985283046e-05, + "loss": 0.7636, + "step": 14687 + }, + { + "epoch": 0.9951893759739819, + "grad_norm": 5.627157688140869, + "learning_rate": 8.091998083373264e-05, + "loss": 0.7524, + "step": 14688 + }, + { + "epoch": 0.995257131241954, + "grad_norm": 6.29923152923584, + "learning_rate": 8.091861181463482e-05, + "loss": 0.8923, + "step": 14689 + }, + { + "epoch": 0.9953248865099261, + "grad_norm": 5.104131698608398, + "learning_rate": 8.091724279553701e-05, + "loss": 0.6621, + "step": 14690 + }, + { + "epoch": 0.9953926417778982, + "grad_norm": 6.44821834564209, + "learning_rate": 8.091587377643919e-05, + "loss": 1.0041, + "step": 14691 + }, + { + "epoch": 0.9954603970458703, + "grad_norm": 4.547702789306641, + "learning_rate": 8.091450475734137e-05, + "loss": 0.7067, + "step": 14692 + }, + { + "epoch": 0.9955281523138424, + "grad_norm": 5.443078517913818, + "learning_rate": 8.091313573824355e-05, + "loss": 0.7664, + "step": 14693 + }, + { + "epoch": 0.9955959075818145, + "grad_norm": 5.759512901306152, + "learning_rate": 8.091176671914573e-05, + "loss": 0.7241, + "step": 14694 + }, + { + "epoch": 0.9956636628497866, + "grad_norm": 4.9956746101379395, + "learning_rate": 8.091039770004793e-05, + "loss": 0.6777, + "step": 14695 + }, + { + "epoch": 0.9957314181177587, + "grad_norm": 6.713644027709961, + "learning_rate": 8.09090286809501e-05, + "loss": 0.7727, + "step": 14696 + }, + { + "epoch": 0.9957991733857308, + "grad_norm": 5.988772869110107, + "learning_rate": 8.090765966185229e-05, + "loss": 0.6803, + "step": 14697 + }, + { + "epoch": 0.9958669286537029, + "grad_norm": 5.6772966384887695, + "learning_rate": 8.090629064275447e-05, + "loss": 0.733, + "step": 14698 + }, + { + "epoch": 0.995934683921675, + "grad_norm": 5.93316125869751, + "learning_rate": 8.090492162365665e-05, + "loss": 0.7119, + "step": 14699 + }, + { + "epoch": 0.996002439189647, + "grad_norm": 5.35646915435791, + "learning_rate": 8.090355260455884e-05, + "loss": 0.9498, + "step": 14700 + }, + { + "epoch": 0.9960701944576191, + "grad_norm": 6.932188510894775, + "learning_rate": 8.090218358546102e-05, + "loss": 0.7779, + "step": 14701 + }, + { + "epoch": 0.9961379497255911, + "grad_norm": 4.6098856925964355, + "learning_rate": 8.09008145663632e-05, + "loss": 0.7614, + "step": 14702 + }, + { + "epoch": 0.9962057049935632, + "grad_norm": 6.402628421783447, + "learning_rate": 8.089944554726538e-05, + "loss": 0.7212, + "step": 14703 + }, + { + "epoch": 0.9962734602615353, + "grad_norm": 5.7452473640441895, + "learning_rate": 8.089807652816758e-05, + "loss": 0.9323, + "step": 14704 + }, + { + "epoch": 0.9963412155295074, + "grad_norm": 8.710042953491211, + "learning_rate": 8.089670750906976e-05, + "loss": 0.7616, + "step": 14705 + }, + { + "epoch": 0.9964089707974795, + "grad_norm": 6.53500509262085, + "learning_rate": 8.089533848997194e-05, + "loss": 0.8054, + "step": 14706 + }, + { + "epoch": 0.9964767260654516, + "grad_norm": 6.090463161468506, + "learning_rate": 8.089396947087412e-05, + "loss": 0.8993, + "step": 14707 + }, + { + "epoch": 0.9965444813334237, + "grad_norm": 4.988658428192139, + "learning_rate": 8.08926004517763e-05, + "loss": 0.6045, + "step": 14708 + }, + { + "epoch": 0.9966122366013958, + "grad_norm": 5.254270553588867, + "learning_rate": 8.089123143267849e-05, + "loss": 0.7135, + "step": 14709 + }, + { + "epoch": 0.9966799918693678, + "grad_norm": 6.4775848388671875, + "learning_rate": 8.088986241358067e-05, + "loss": 0.8647, + "step": 14710 + }, + { + "epoch": 0.9967477471373399, + "grad_norm": 8.165122985839844, + "learning_rate": 8.088849339448285e-05, + "loss": 0.5061, + "step": 14711 + }, + { + "epoch": 0.996815502405312, + "grad_norm": 6.5195159912109375, + "learning_rate": 8.088712437538503e-05, + "loss": 1.0305, + "step": 14712 + }, + { + "epoch": 0.9968832576732841, + "grad_norm": 4.683186054229736, + "learning_rate": 8.088575535628723e-05, + "loss": 0.696, + "step": 14713 + }, + { + "epoch": 0.9969510129412562, + "grad_norm": 8.721222877502441, + "learning_rate": 8.08843863371894e-05, + "loss": 0.7871, + "step": 14714 + }, + { + "epoch": 0.9970187682092283, + "grad_norm": 4.208126544952393, + "learning_rate": 8.088301731809159e-05, + "loss": 0.4982, + "step": 14715 + }, + { + "epoch": 0.9970865234772004, + "grad_norm": 8.587434768676758, + "learning_rate": 8.088164829899377e-05, + "loss": 0.7851, + "step": 14716 + }, + { + "epoch": 0.9971542787451725, + "grad_norm": 6.511172294616699, + "learning_rate": 8.088027927989596e-05, + "loss": 0.6705, + "step": 14717 + }, + { + "epoch": 0.9972220340131446, + "grad_norm": 9.866682052612305, + "learning_rate": 8.087891026079814e-05, + "loss": 0.7538, + "step": 14718 + }, + { + "epoch": 0.9972897892811166, + "grad_norm": 5.130730628967285, + "learning_rate": 8.087754124170032e-05, + "loss": 0.5205, + "step": 14719 + }, + { + "epoch": 0.9973575445490886, + "grad_norm": 6.190724849700928, + "learning_rate": 8.087617222260252e-05, + "loss": 0.8204, + "step": 14720 + }, + { + "epoch": 0.9974252998170607, + "grad_norm": 6.667228698730469, + "learning_rate": 8.08748032035047e-05, + "loss": 0.8681, + "step": 14721 + }, + { + "epoch": 0.9974930550850328, + "grad_norm": 5.230597019195557, + "learning_rate": 8.087343418440688e-05, + "loss": 0.6311, + "step": 14722 + }, + { + "epoch": 0.9975608103530049, + "grad_norm": 4.286562919616699, + "learning_rate": 8.087206516530907e-05, + "loss": 0.5128, + "step": 14723 + }, + { + "epoch": 0.997628565620977, + "grad_norm": 8.196592330932617, + "learning_rate": 8.087069614621125e-05, + "loss": 1.0953, + "step": 14724 + }, + { + "epoch": 0.9976963208889491, + "grad_norm": 5.942284107208252, + "learning_rate": 8.086932712711343e-05, + "loss": 0.8896, + "step": 14725 + }, + { + "epoch": 0.9977640761569212, + "grad_norm": 7.276721477508545, + "learning_rate": 8.086795810801561e-05, + "loss": 0.8503, + "step": 14726 + }, + { + "epoch": 0.9978318314248933, + "grad_norm": 6.109722137451172, + "learning_rate": 8.08665890889178e-05, + "loss": 0.563, + "step": 14727 + }, + { + "epoch": 0.9978995866928654, + "grad_norm": 7.156121253967285, + "learning_rate": 8.086522006981998e-05, + "loss": 0.6586, + "step": 14728 + }, + { + "epoch": 0.9979673419608375, + "grad_norm": 5.004053592681885, + "learning_rate": 8.086385105072217e-05, + "loss": 0.7652, + "step": 14729 + }, + { + "epoch": 0.9980350972288096, + "grad_norm": 6.767992973327637, + "learning_rate": 8.086248203162435e-05, + "loss": 0.9194, + "step": 14730 + }, + { + "epoch": 0.9981028524967817, + "grad_norm": 4.350346565246582, + "learning_rate": 8.086111301252653e-05, + "loss": 0.5671, + "step": 14731 + }, + { + "epoch": 0.9981706077647537, + "grad_norm": 4.5186052322387695, + "learning_rate": 8.085974399342872e-05, + "loss": 0.7229, + "step": 14732 + }, + { + "epoch": 0.9982383630327258, + "grad_norm": 4.803741931915283, + "learning_rate": 8.08583749743309e-05, + "loss": 0.5626, + "step": 14733 + }, + { + "epoch": 0.9983061183006979, + "grad_norm": 5.849275588989258, + "learning_rate": 8.085700595523308e-05, + "loss": 0.7053, + "step": 14734 + }, + { + "epoch": 0.9983738735686699, + "grad_norm": 6.240585803985596, + "learning_rate": 8.085563693613526e-05, + "loss": 1.0339, + "step": 14735 + }, + { + "epoch": 0.998441628836642, + "grad_norm": 5.102994441986084, + "learning_rate": 8.085426791703745e-05, + "loss": 0.6594, + "step": 14736 + }, + { + "epoch": 0.9985093841046141, + "grad_norm": 7.48859977722168, + "learning_rate": 8.085289889793964e-05, + "loss": 0.7227, + "step": 14737 + }, + { + "epoch": 0.9985771393725862, + "grad_norm": 4.829802513122559, + "learning_rate": 8.085152987884182e-05, + "loss": 0.6739, + "step": 14738 + }, + { + "epoch": 0.9986448946405583, + "grad_norm": 5.353672504425049, + "learning_rate": 8.0850160859744e-05, + "loss": 0.8112, + "step": 14739 + }, + { + "epoch": 0.9987126499085304, + "grad_norm": 5.165256023406982, + "learning_rate": 8.084879184064618e-05, + "loss": 0.7894, + "step": 14740 + }, + { + "epoch": 0.9987804051765025, + "grad_norm": 6.365817070007324, + "learning_rate": 8.084742282154837e-05, + "loss": 0.6763, + "step": 14741 + }, + { + "epoch": 0.9988481604444746, + "grad_norm": 5.996483325958252, + "learning_rate": 8.084605380245055e-05, + "loss": 0.6717, + "step": 14742 + }, + { + "epoch": 0.9989159157124466, + "grad_norm": 5.232832431793213, + "learning_rate": 8.084468478335273e-05, + "loss": 0.779, + "step": 14743 + }, + { + "epoch": 0.9989836709804187, + "grad_norm": 7.0810089111328125, + "learning_rate": 8.084331576425491e-05, + "loss": 1.0872, + "step": 14744 + }, + { + "epoch": 0.9990514262483908, + "grad_norm": 6.616766452789307, + "learning_rate": 8.084194674515709e-05, + "loss": 0.8411, + "step": 14745 + }, + { + "epoch": 0.9991191815163629, + "grad_norm": 6.06799840927124, + "learning_rate": 8.084057772605929e-05, + "loss": 0.6083, + "step": 14746 + }, + { + "epoch": 0.999186936784335, + "grad_norm": 6.634158134460449, + "learning_rate": 8.083920870696147e-05, + "loss": 0.627, + "step": 14747 + }, + { + "epoch": 0.9992546920523071, + "grad_norm": 6.214893817901611, + "learning_rate": 8.083783968786365e-05, + "loss": 1.025, + "step": 14748 + }, + { + "epoch": 0.9993224473202792, + "grad_norm": 7.438349723815918, + "learning_rate": 8.083647066876583e-05, + "loss": 0.8148, + "step": 14749 + }, + { + "epoch": 0.9993902025882513, + "grad_norm": 5.228566646575928, + "learning_rate": 8.083510164966802e-05, + "loss": 0.741, + "step": 14750 + }, + { + "epoch": 0.9994579578562233, + "grad_norm": 6.1171064376831055, + "learning_rate": 8.08337326305702e-05, + "loss": 0.6582, + "step": 14751 + }, + { + "epoch": 0.9995257131241954, + "grad_norm": 5.6771368980407715, + "learning_rate": 8.083236361147238e-05, + "loss": 0.7143, + "step": 14752 + }, + { + "epoch": 0.9995934683921675, + "grad_norm": 6.686731815338135, + "learning_rate": 8.083099459237456e-05, + "loss": 0.9172, + "step": 14753 + }, + { + "epoch": 0.9996612236601395, + "grad_norm": 5.874754905700684, + "learning_rate": 8.082962557327674e-05, + "loss": 0.8786, + "step": 14754 + }, + { + "epoch": 0.9997289789281116, + "grad_norm": 7.658037185668945, + "learning_rate": 8.082825655417894e-05, + "loss": 0.4877, + "step": 14755 + }, + { + "epoch": 0.9997967341960837, + "grad_norm": 7.869183540344238, + "learning_rate": 8.082688753508112e-05, + "loss": 0.811, + "step": 14756 + }, + { + "epoch": 0.9997967341960837, + "eval_loss": 0.7405052185058594, + "eval_noise_accuracy": 0.0, + "eval_runtime": 1547.7438, + "eval_samples_per_second": 3.32, + "eval_steps_per_second": 0.208, + "eval_wer": 78.51462535428304, + "step": 14756 + }, + { + "epoch": 0.9998644894640558, + "grad_norm": 6.051870822906494, + "learning_rate": 8.08255185159833e-05, + "loss": 0.9022, + "step": 14757 + }, + { + "epoch": 0.9999322447320279, + "grad_norm": 5.921606063842773, + "learning_rate": 8.082414949688548e-05, + "loss": 0.8192, + "step": 14758 + }, + { + "epoch": 1.0, + "grad_norm": 8.148659706115723, + "learning_rate": 8.082278047778767e-05, + "loss": 0.7852, + "step": 14759 + }, + { + "epoch": 1.000067755267972, + "grad_norm": 5.229124069213867, + "learning_rate": 8.082141145868985e-05, + "loss": 0.8277, + "step": 14760 + }, + { + "epoch": 1.0001355105359442, + "grad_norm": 5.082587242126465, + "learning_rate": 8.082004243959203e-05, + "loss": 0.7213, + "step": 14761 + }, + { + "epoch": 1.0002032658039162, + "grad_norm": 6.1493024826049805, + "learning_rate": 8.081867342049421e-05, + "loss": 0.6404, + "step": 14762 + }, + { + "epoch": 1.0002710210718884, + "grad_norm": 6.247502326965332, + "learning_rate": 8.08173044013964e-05, + "loss": 1.1216, + "step": 14763 + }, + { + "epoch": 1.0003387763398603, + "grad_norm": 6.559756755828857, + "learning_rate": 8.081593538229859e-05, + "loss": 0.5577, + "step": 14764 + }, + { + "epoch": 1.0004065316078325, + "grad_norm": 4.563822269439697, + "learning_rate": 8.081456636320077e-05, + "loss": 0.6457, + "step": 14765 + }, + { + "epoch": 1.0004742868758045, + "grad_norm": 4.815116882324219, + "learning_rate": 8.081319734410296e-05, + "loss": 0.8247, + "step": 14766 + }, + { + "epoch": 1.0005420421437767, + "grad_norm": 4.924731731414795, + "learning_rate": 8.081182832500514e-05, + "loss": 0.8842, + "step": 14767 + }, + { + "epoch": 1.0006097974117487, + "grad_norm": 6.610191822052002, + "learning_rate": 8.081045930590732e-05, + "loss": 0.7282, + "step": 14768 + }, + { + "epoch": 1.000677552679721, + "grad_norm": 6.175731658935547, + "learning_rate": 8.080909028680951e-05, + "loss": 0.7621, + "step": 14769 + }, + { + "epoch": 1.000745307947693, + "grad_norm": 5.9794111251831055, + "learning_rate": 8.08077212677117e-05, + "loss": 0.7258, + "step": 14770 + }, + { + "epoch": 1.000813063215665, + "grad_norm": 13.052425384521484, + "learning_rate": 8.080635224861388e-05, + "loss": 0.8328, + "step": 14771 + }, + { + "epoch": 1.000880818483637, + "grad_norm": 6.011143684387207, + "learning_rate": 8.080498322951606e-05, + "loss": 0.62, + "step": 14772 + }, + { + "epoch": 1.0009485737516093, + "grad_norm": 7.789141654968262, + "learning_rate": 8.080361421041825e-05, + "loss": 0.7544, + "step": 14773 + }, + { + "epoch": 1.0010163290195813, + "grad_norm": 5.623871326446533, + "learning_rate": 8.080224519132043e-05, + "loss": 0.5816, + "step": 14774 + }, + { + "epoch": 1.0010840842875535, + "grad_norm": 6.538607597351074, + "learning_rate": 8.080087617222261e-05, + "loss": 0.8612, + "step": 14775 + }, + { + "epoch": 1.0011518395555254, + "grad_norm": 5.990743160247803, + "learning_rate": 8.079950715312479e-05, + "loss": 0.7108, + "step": 14776 + }, + { + "epoch": 1.0012195948234974, + "grad_norm": 5.186371326446533, + "learning_rate": 8.079813813402697e-05, + "loss": 0.5226, + "step": 14777 + }, + { + "epoch": 1.0012873500914696, + "grad_norm": 8.262914657592773, + "learning_rate": 8.079676911492916e-05, + "loss": 0.5101, + "step": 14778 + }, + { + "epoch": 1.0013551053594416, + "grad_norm": 7.4032769203186035, + "learning_rate": 8.079540009583134e-05, + "loss": 0.579, + "step": 14779 + }, + { + "epoch": 1.0014228606274138, + "grad_norm": 5.568051815032959, + "learning_rate": 8.079403107673353e-05, + "loss": 0.7007, + "step": 14780 + }, + { + "epoch": 1.0014906158953858, + "grad_norm": 4.613144397735596, + "learning_rate": 8.07926620576357e-05, + "loss": 0.6206, + "step": 14781 + }, + { + "epoch": 1.001558371163358, + "grad_norm": 5.073683261871338, + "learning_rate": 8.07912930385379e-05, + "loss": 0.4866, + "step": 14782 + }, + { + "epoch": 1.00162612643133, + "grad_norm": 4.251949310302734, + "learning_rate": 8.078992401944008e-05, + "loss": 0.5746, + "step": 14783 + }, + { + "epoch": 1.0016938816993022, + "grad_norm": 6.774303436279297, + "learning_rate": 8.078855500034226e-05, + "loss": 0.81, + "step": 14784 + }, + { + "epoch": 1.0017616369672742, + "grad_norm": 7.1045918464660645, + "learning_rate": 8.078718598124444e-05, + "loss": 0.7654, + "step": 14785 + }, + { + "epoch": 1.0018293922352464, + "grad_norm": 4.904873371124268, + "learning_rate": 8.078581696214662e-05, + "loss": 0.8256, + "step": 14786 + }, + { + "epoch": 1.0018971475032183, + "grad_norm": 6.918723106384277, + "learning_rate": 8.078444794304881e-05, + "loss": 0.7021, + "step": 14787 + }, + { + "epoch": 1.0019649027711905, + "grad_norm": 6.581482887268066, + "learning_rate": 8.0783078923951e-05, + "loss": 0.6402, + "step": 14788 + }, + { + "epoch": 1.0020326580391625, + "grad_norm": 5.610997200012207, + "learning_rate": 8.078170990485318e-05, + "loss": 0.7777, + "step": 14789 + }, + { + "epoch": 1.0021004133071347, + "grad_norm": 6.357514381408691, + "learning_rate": 8.078034088575536e-05, + "loss": 0.7679, + "step": 14790 + }, + { + "epoch": 1.0021681685751067, + "grad_norm": 6.965141296386719, + "learning_rate": 8.077897186665755e-05, + "loss": 0.4878, + "step": 14791 + }, + { + "epoch": 1.0022359238430787, + "grad_norm": 6.122716426849365, + "learning_rate": 8.077760284755973e-05, + "loss": 0.6921, + "step": 14792 + }, + { + "epoch": 1.002303679111051, + "grad_norm": 4.804755210876465, + "learning_rate": 8.077623382846191e-05, + "loss": 0.6617, + "step": 14793 + }, + { + "epoch": 1.0023714343790229, + "grad_norm": 5.5113844871521, + "learning_rate": 8.077486480936409e-05, + "loss": 0.7798, + "step": 14794 + }, + { + "epoch": 1.002439189646995, + "grad_norm": 6.461663246154785, + "learning_rate": 8.077349579026627e-05, + "loss": 0.7959, + "step": 14795 + }, + { + "epoch": 1.002506944914967, + "grad_norm": 6.894604206085205, + "learning_rate": 8.077212677116846e-05, + "loss": 0.7367, + "step": 14796 + }, + { + "epoch": 1.0025747001829393, + "grad_norm": 5.337347984313965, + "learning_rate": 8.077075775207065e-05, + "loss": 0.5788, + "step": 14797 + }, + { + "epoch": 1.0026424554509112, + "grad_norm": 6.755460262298584, + "learning_rate": 8.076938873297283e-05, + "loss": 0.8379, + "step": 14798 + }, + { + "epoch": 1.0027102107188834, + "grad_norm": 5.582238674163818, + "learning_rate": 8.0768019713875e-05, + "loss": 0.6805, + "step": 14799 + }, + { + "epoch": 1.0027779659868554, + "grad_norm": 5.460515975952148, + "learning_rate": 8.076665069477719e-05, + "loss": 0.7466, + "step": 14800 + }, + { + "epoch": 1.0028457212548276, + "grad_norm": 7.810371398925781, + "learning_rate": 8.076528167567938e-05, + "loss": 0.743, + "step": 14801 + }, + { + "epoch": 1.0029134765227996, + "grad_norm": 5.568098545074463, + "learning_rate": 8.076391265658156e-05, + "loss": 0.7604, + "step": 14802 + }, + { + "epoch": 1.0029812317907718, + "grad_norm": 4.993529319763184, + "learning_rate": 8.076254363748374e-05, + "loss": 0.5721, + "step": 14803 + }, + { + "epoch": 1.0030489870587438, + "grad_norm": 4.659602642059326, + "learning_rate": 8.076117461838592e-05, + "loss": 0.5619, + "step": 14804 + }, + { + "epoch": 1.003116742326716, + "grad_norm": 4.9584221839904785, + "learning_rate": 8.075980559928812e-05, + "loss": 0.6407, + "step": 14805 + }, + { + "epoch": 1.003184497594688, + "grad_norm": 6.771777629852295, + "learning_rate": 8.07584365801903e-05, + "loss": 0.6165, + "step": 14806 + }, + { + "epoch": 1.0032522528626602, + "grad_norm": 6.287032604217529, + "learning_rate": 8.075706756109248e-05, + "loss": 0.8346, + "step": 14807 + }, + { + "epoch": 1.0033200081306322, + "grad_norm": 5.438133239746094, + "learning_rate": 8.075569854199466e-05, + "loss": 0.6621, + "step": 14808 + }, + { + "epoch": 1.0033877633986041, + "grad_norm": 5.8191633224487305, + "learning_rate": 8.075432952289685e-05, + "loss": 0.6837, + "step": 14809 + }, + { + "epoch": 1.0034555186665763, + "grad_norm": 5.39967679977417, + "learning_rate": 8.075296050379903e-05, + "loss": 0.6687, + "step": 14810 + }, + { + "epoch": 1.0035232739345483, + "grad_norm": 6.197573184967041, + "learning_rate": 8.075159148470121e-05, + "loss": 0.6243, + "step": 14811 + }, + { + "epoch": 1.0035910292025205, + "grad_norm": 5.200948715209961, + "learning_rate": 8.07502224656034e-05, + "loss": 0.6812, + "step": 14812 + }, + { + "epoch": 1.0036587844704925, + "grad_norm": 4.868316650390625, + "learning_rate": 8.074885344650558e-05, + "loss": 0.6131, + "step": 14813 + }, + { + "epoch": 1.0037265397384647, + "grad_norm": 4.966255187988281, + "learning_rate": 8.074748442740777e-05, + "loss": 0.7787, + "step": 14814 + }, + { + "epoch": 1.0037942950064367, + "grad_norm": 6.747678756713867, + "learning_rate": 8.074611540830996e-05, + "loss": 0.6185, + "step": 14815 + }, + { + "epoch": 1.003862050274409, + "grad_norm": 6.168325424194336, + "learning_rate": 8.074474638921214e-05, + "loss": 0.7458, + "step": 14816 + }, + { + "epoch": 1.0039298055423809, + "grad_norm": 4.884849548339844, + "learning_rate": 8.074337737011432e-05, + "loss": 0.604, + "step": 14817 + }, + { + "epoch": 1.003997560810353, + "grad_norm": 4.729106903076172, + "learning_rate": 8.07420083510165e-05, + "loss": 0.6351, + "step": 14818 + }, + { + "epoch": 1.004065316078325, + "grad_norm": 6.679864406585693, + "learning_rate": 8.07406393319187e-05, + "loss": 0.8324, + "step": 14819 + }, + { + "epoch": 1.0041330713462973, + "grad_norm": 5.62283182144165, + "learning_rate": 8.073927031282087e-05, + "loss": 0.8227, + "step": 14820 + }, + { + "epoch": 1.0042008266142692, + "grad_norm": 8.426139831542969, + "learning_rate": 8.073790129372305e-05, + "loss": 0.6578, + "step": 14821 + }, + { + "epoch": 1.0042685818822414, + "grad_norm": 6.8322038650512695, + "learning_rate": 8.073653227462524e-05, + "loss": 0.7437, + "step": 14822 + }, + { + "epoch": 1.0043363371502134, + "grad_norm": 8.170849800109863, + "learning_rate": 8.073516325552743e-05, + "loss": 0.7791, + "step": 14823 + }, + { + "epoch": 1.0044040924181856, + "grad_norm": 6.2695441246032715, + "learning_rate": 8.073379423642961e-05, + "loss": 0.7124, + "step": 14824 + }, + { + "epoch": 1.0044718476861576, + "grad_norm": 6.771942615509033, + "learning_rate": 8.073242521733179e-05, + "loss": 0.7781, + "step": 14825 + }, + { + "epoch": 1.0045396029541296, + "grad_norm": 5.956495761871338, + "learning_rate": 8.073105619823397e-05, + "loss": 0.6476, + "step": 14826 + }, + { + "epoch": 1.0046073582221018, + "grad_norm": 7.166341304779053, + "learning_rate": 8.072968717913615e-05, + "loss": 0.8113, + "step": 14827 + }, + { + "epoch": 1.0046751134900738, + "grad_norm": 5.417853832244873, + "learning_rate": 8.072831816003834e-05, + "loss": 0.5846, + "step": 14828 + }, + { + "epoch": 1.004742868758046, + "grad_norm": 5.144546031951904, + "learning_rate": 8.072694914094052e-05, + "loss": 0.693, + "step": 14829 + }, + { + "epoch": 1.004810624026018, + "grad_norm": 4.3079094886779785, + "learning_rate": 8.07255801218427e-05, + "loss": 0.7312, + "step": 14830 + }, + { + "epoch": 1.0048783792939902, + "grad_norm": 5.390209674835205, + "learning_rate": 8.072421110274489e-05, + "loss": 0.6347, + "step": 14831 + }, + { + "epoch": 1.0049461345619621, + "grad_norm": 5.938960552215576, + "learning_rate": 8.072284208364707e-05, + "loss": 0.76, + "step": 14832 + }, + { + "epoch": 1.0050138898299343, + "grad_norm": 5.8623247146606445, + "learning_rate": 8.072147306454926e-05, + "loss": 0.8752, + "step": 14833 + }, + { + "epoch": 1.0050816450979063, + "grad_norm": 6.3055219650268555, + "learning_rate": 8.072010404545144e-05, + "loss": 0.783, + "step": 14834 + }, + { + "epoch": 1.0051494003658785, + "grad_norm": 6.132151126861572, + "learning_rate": 8.071873502635362e-05, + "loss": 0.6859, + "step": 14835 + }, + { + "epoch": 1.0052171556338505, + "grad_norm": 4.891525745391846, + "learning_rate": 8.07173660072558e-05, + "loss": 0.5293, + "step": 14836 + }, + { + "epoch": 1.0052849109018227, + "grad_norm": 6.941277980804443, + "learning_rate": 8.0715996988158e-05, + "loss": 0.5511, + "step": 14837 + }, + { + "epoch": 1.0053526661697947, + "grad_norm": 7.667896270751953, + "learning_rate": 8.071462796906017e-05, + "loss": 0.6837, + "step": 14838 + }, + { + "epoch": 1.0054204214377669, + "grad_norm": 4.408740043640137, + "learning_rate": 8.071325894996236e-05, + "loss": 0.5733, + "step": 14839 + }, + { + "epoch": 1.0054881767057389, + "grad_norm": 7.213569164276123, + "learning_rate": 8.071188993086454e-05, + "loss": 0.6838, + "step": 14840 + }, + { + "epoch": 1.0055559319737108, + "grad_norm": 6.651516914367676, + "learning_rate": 8.071052091176672e-05, + "loss": 0.8254, + "step": 14841 + }, + { + "epoch": 1.005623687241683, + "grad_norm": 5.002716541290283, + "learning_rate": 8.070915189266891e-05, + "loss": 0.5535, + "step": 14842 + }, + { + "epoch": 1.005691442509655, + "grad_norm": 7.2078728675842285, + "learning_rate": 8.070778287357109e-05, + "loss": 0.7439, + "step": 14843 + }, + { + "epoch": 1.0057591977776272, + "grad_norm": 4.879020690917969, + "learning_rate": 8.070641385447327e-05, + "loss": 0.7649, + "step": 14844 + }, + { + "epoch": 1.0058269530455992, + "grad_norm": 7.177512168884277, + "learning_rate": 8.070504483537545e-05, + "loss": 0.599, + "step": 14845 + }, + { + "epoch": 1.0058947083135714, + "grad_norm": 7.1817708015441895, + "learning_rate": 8.070367581627764e-05, + "loss": 0.6034, + "step": 14846 + }, + { + "epoch": 1.0059624635815434, + "grad_norm": 11.819538116455078, + "learning_rate": 8.070230679717982e-05, + "loss": 0.6347, + "step": 14847 + }, + { + "epoch": 1.0060302188495156, + "grad_norm": 5.775310516357422, + "learning_rate": 8.0700937778082e-05, + "loss": 0.577, + "step": 14848 + }, + { + "epoch": 1.0060979741174876, + "grad_norm": 6.324820518493652, + "learning_rate": 8.069956875898419e-05, + "loss": 0.7638, + "step": 14849 + }, + { + "epoch": 1.0061657293854598, + "grad_norm": 5.748277187347412, + "learning_rate": 8.069819973988637e-05, + "loss": 0.6587, + "step": 14850 + }, + { + "epoch": 1.0062334846534318, + "grad_norm": 5.290812015533447, + "learning_rate": 8.069683072078856e-05, + "loss": 0.5606, + "step": 14851 + }, + { + "epoch": 1.006301239921404, + "grad_norm": 5.5858941078186035, + "learning_rate": 8.069546170169074e-05, + "loss": 0.4992, + "step": 14852 + }, + { + "epoch": 1.006368995189376, + "grad_norm": 5.581445693969727, + "learning_rate": 8.069409268259292e-05, + "loss": 0.5722, + "step": 14853 + }, + { + "epoch": 1.0064367504573482, + "grad_norm": 6.363956451416016, + "learning_rate": 8.06927236634951e-05, + "loss": 0.5024, + "step": 14854 + }, + { + "epoch": 1.0065045057253201, + "grad_norm": 6.256494998931885, + "learning_rate": 8.069135464439728e-05, + "loss": 0.4683, + "step": 14855 + }, + { + "epoch": 1.0065722609932923, + "grad_norm": 5.651278972625732, + "learning_rate": 8.068998562529948e-05, + "loss": 0.6379, + "step": 14856 + }, + { + "epoch": 1.0066400162612643, + "grad_norm": 7.8025712966918945, + "learning_rate": 8.068861660620166e-05, + "loss": 0.7948, + "step": 14857 + }, + { + "epoch": 1.0067077715292363, + "grad_norm": 6.639724254608154, + "learning_rate": 8.068724758710384e-05, + "loss": 0.8023, + "step": 14858 + }, + { + "epoch": 1.0067755267972085, + "grad_norm": 3.8018884658813477, + "learning_rate": 8.068587856800603e-05, + "loss": 0.5517, + "step": 14859 + }, + { + "epoch": 1.0068432820651805, + "grad_norm": 5.326318740844727, + "learning_rate": 8.068450954890821e-05, + "loss": 0.611, + "step": 14860 + }, + { + "epoch": 1.0069110373331527, + "grad_norm": 6.377422332763672, + "learning_rate": 8.068314052981039e-05, + "loss": 0.6394, + "step": 14861 + }, + { + "epoch": 1.0069787926011247, + "grad_norm": 5.33223295211792, + "learning_rate": 8.068177151071258e-05, + "loss": 0.6427, + "step": 14862 + }, + { + "epoch": 1.0070465478690969, + "grad_norm": 6.365980625152588, + "learning_rate": 8.068040249161476e-05, + "loss": 0.7221, + "step": 14863 + }, + { + "epoch": 1.0071143031370688, + "grad_norm": 6.96981143951416, + "learning_rate": 8.067903347251694e-05, + "loss": 0.9021, + "step": 14864 + }, + { + "epoch": 1.007182058405041, + "grad_norm": 8.809569358825684, + "learning_rate": 8.067766445341914e-05, + "loss": 0.8451, + "step": 14865 + }, + { + "epoch": 1.007249813673013, + "grad_norm": 6.544775485992432, + "learning_rate": 8.067629543432132e-05, + "loss": 0.6729, + "step": 14866 + }, + { + "epoch": 1.0073175689409852, + "grad_norm": 4.71762228012085, + "learning_rate": 8.06749264152235e-05, + "loss": 0.6192, + "step": 14867 + }, + { + "epoch": 1.0073853242089572, + "grad_norm": 5.027548313140869, + "learning_rate": 8.067355739612568e-05, + "loss": 0.6514, + "step": 14868 + }, + { + "epoch": 1.0074530794769294, + "grad_norm": 6.541376113891602, + "learning_rate": 8.067218837702787e-05, + "loss": 0.7091, + "step": 14869 + }, + { + "epoch": 1.0075208347449014, + "grad_norm": 4.369992733001709, + "learning_rate": 8.067081935793005e-05, + "loss": 0.5207, + "step": 14870 + }, + { + "epoch": 1.0075885900128736, + "grad_norm": 6.614040851593018, + "learning_rate": 8.066945033883223e-05, + "loss": 0.4269, + "step": 14871 + }, + { + "epoch": 1.0076563452808456, + "grad_norm": 5.504267692565918, + "learning_rate": 8.066808131973441e-05, + "loss": 0.867, + "step": 14872 + }, + { + "epoch": 1.0077241005488178, + "grad_norm": 5.660549640655518, + "learning_rate": 8.06667123006366e-05, + "loss": 0.7685, + "step": 14873 + }, + { + "epoch": 1.0077918558167898, + "grad_norm": 6.468850612640381, + "learning_rate": 8.066534328153879e-05, + "loss": 0.7089, + "step": 14874 + }, + { + "epoch": 1.0078596110847617, + "grad_norm": 5.692544460296631, + "learning_rate": 8.066397426244097e-05, + "loss": 0.6021, + "step": 14875 + }, + { + "epoch": 1.007927366352734, + "grad_norm": 9.963805198669434, + "learning_rate": 8.066260524334315e-05, + "loss": 0.5066, + "step": 14876 + }, + { + "epoch": 1.007995121620706, + "grad_norm": 6.6660308837890625, + "learning_rate": 8.066123622424533e-05, + "loss": 0.5434, + "step": 14877 + }, + { + "epoch": 1.0080628768886781, + "grad_norm": 5.235628604888916, + "learning_rate": 8.065986720514751e-05, + "loss": 0.7677, + "step": 14878 + }, + { + "epoch": 1.00813063215665, + "grad_norm": 3.8803293704986572, + "learning_rate": 8.06584981860497e-05, + "loss": 0.5942, + "step": 14879 + }, + { + "epoch": 1.0081983874246223, + "grad_norm": 6.2981390953063965, + "learning_rate": 8.065712916695188e-05, + "loss": 0.7267, + "step": 14880 + }, + { + "epoch": 1.0082661426925943, + "grad_norm": 5.843595504760742, + "learning_rate": 8.065576014785406e-05, + "loss": 0.6444, + "step": 14881 + }, + { + "epoch": 1.0083338979605665, + "grad_norm": 4.541314601898193, + "learning_rate": 8.065439112875625e-05, + "loss": 0.5712, + "step": 14882 + }, + { + "epoch": 1.0084016532285385, + "grad_norm": 6.081702709197998, + "learning_rate": 8.065302210965844e-05, + "loss": 0.5991, + "step": 14883 + }, + { + "epoch": 1.0084694084965107, + "grad_norm": 6.3184943199157715, + "learning_rate": 8.065165309056062e-05, + "loss": 0.6225, + "step": 14884 + }, + { + "epoch": 1.0085371637644827, + "grad_norm": 4.152682304382324, + "learning_rate": 8.06502840714628e-05, + "loss": 0.6958, + "step": 14885 + }, + { + "epoch": 1.0086049190324549, + "grad_norm": 5.545871257781982, + "learning_rate": 8.064891505236498e-05, + "loss": 0.6238, + "step": 14886 + }, + { + "epoch": 1.0086726743004268, + "grad_norm": 5.534969329833984, + "learning_rate": 8.064754603326716e-05, + "loss": 0.6741, + "step": 14887 + }, + { + "epoch": 1.008740429568399, + "grad_norm": 5.907772064208984, + "learning_rate": 8.064617701416935e-05, + "loss": 0.6827, + "step": 14888 + }, + { + "epoch": 1.008808184836371, + "grad_norm": 6.224740028381348, + "learning_rate": 8.064480799507153e-05, + "loss": 0.7227, + "step": 14889 + }, + { + "epoch": 1.008875940104343, + "grad_norm": 5.671234607696533, + "learning_rate": 8.064343897597372e-05, + "loss": 0.6511, + "step": 14890 + }, + { + "epoch": 1.0089436953723152, + "grad_norm": 4.597881317138672, + "learning_rate": 8.06420699568759e-05, + "loss": 0.5303, + "step": 14891 + }, + { + "epoch": 1.0090114506402872, + "grad_norm": 7.3619561195373535, + "learning_rate": 8.064070093777809e-05, + "loss": 0.6853, + "step": 14892 + }, + { + "epoch": 1.0090792059082594, + "grad_norm": 5.377906322479248, + "learning_rate": 8.063933191868027e-05, + "loss": 0.6067, + "step": 14893 + }, + { + "epoch": 1.0091469611762314, + "grad_norm": 4.887160778045654, + "learning_rate": 8.063796289958245e-05, + "loss": 0.7488, + "step": 14894 + }, + { + "epoch": 1.0092147164442036, + "grad_norm": 6.19053316116333, + "learning_rate": 8.063659388048463e-05, + "loss": 0.6629, + "step": 14895 + }, + { + "epoch": 1.0092824717121756, + "grad_norm": 5.102216720581055, + "learning_rate": 8.063522486138681e-05, + "loss": 0.765, + "step": 14896 + }, + { + "epoch": 1.0093502269801478, + "grad_norm": 8.829859733581543, + "learning_rate": 8.0633855842289e-05, + "loss": 0.5257, + "step": 14897 + }, + { + "epoch": 1.0094179822481197, + "grad_norm": 11.013011932373047, + "learning_rate": 8.063248682319118e-05, + "loss": 0.7178, + "step": 14898 + }, + { + "epoch": 1.009485737516092, + "grad_norm": 6.39233922958374, + "learning_rate": 8.063111780409337e-05, + "loss": 0.7438, + "step": 14899 + }, + { + "epoch": 1.009553492784064, + "grad_norm": 5.155503749847412, + "learning_rate": 8.062974878499555e-05, + "loss": 0.5149, + "step": 14900 + }, + { + "epoch": 1.0096212480520361, + "grad_norm": 5.446846961975098, + "learning_rate": 8.062837976589774e-05, + "loss": 0.5956, + "step": 14901 + }, + { + "epoch": 1.009689003320008, + "grad_norm": 4.781370639801025, + "learning_rate": 8.062701074679992e-05, + "loss": 0.7076, + "step": 14902 + }, + { + "epoch": 1.0097567585879803, + "grad_norm": 5.34435510635376, + "learning_rate": 8.06256417277021e-05, + "loss": 0.6906, + "step": 14903 + }, + { + "epoch": 1.0098245138559523, + "grad_norm": 9.561019897460938, + "learning_rate": 8.062427270860428e-05, + "loss": 0.6931, + "step": 14904 + }, + { + "epoch": 1.0098922691239245, + "grad_norm": 5.806305408477783, + "learning_rate": 8.062290368950647e-05, + "loss": 0.9629, + "step": 14905 + }, + { + "epoch": 1.0099600243918965, + "grad_norm": 6.2731032371521, + "learning_rate": 8.062153467040865e-05, + "loss": 0.8255, + "step": 14906 + }, + { + "epoch": 1.0100277796598685, + "grad_norm": 5.084347248077393, + "learning_rate": 8.062016565131084e-05, + "loss": 0.756, + "step": 14907 + }, + { + "epoch": 1.0100955349278407, + "grad_norm": 5.779279708862305, + "learning_rate": 8.061879663221303e-05, + "loss": 0.8397, + "step": 14908 + }, + { + "epoch": 1.0101632901958126, + "grad_norm": 8.087071418762207, + "learning_rate": 8.061742761311521e-05, + "loss": 0.7148, + "step": 14909 + }, + { + "epoch": 1.0102310454637848, + "grad_norm": 7.416251182556152, + "learning_rate": 8.061605859401739e-05, + "loss": 0.5286, + "step": 14910 + }, + { + "epoch": 1.0102988007317568, + "grad_norm": 7.369309425354004, + "learning_rate": 8.061468957491958e-05, + "loss": 0.6396, + "step": 14911 + }, + { + "epoch": 1.010366555999729, + "grad_norm": 5.095036029815674, + "learning_rate": 8.061332055582176e-05, + "loss": 0.5745, + "step": 14912 + }, + { + "epoch": 1.010434311267701, + "grad_norm": 5.032590866088867, + "learning_rate": 8.061195153672394e-05, + "loss": 0.8218, + "step": 14913 + }, + { + "epoch": 1.0105020665356732, + "grad_norm": 5.7722039222717285, + "learning_rate": 8.061058251762612e-05, + "loss": 0.5213, + "step": 14914 + }, + { + "epoch": 1.0105698218036452, + "grad_norm": 5.260948181152344, + "learning_rate": 8.060921349852832e-05, + "loss": 0.755, + "step": 14915 + }, + { + "epoch": 1.0106375770716174, + "grad_norm": 6.892032623291016, + "learning_rate": 8.06078444794305e-05, + "loss": 0.7952, + "step": 14916 + }, + { + "epoch": 1.0107053323395894, + "grad_norm": 6.230368137359619, + "learning_rate": 8.060647546033268e-05, + "loss": 0.5725, + "step": 14917 + }, + { + "epoch": 1.0107730876075616, + "grad_norm": 6.269092559814453, + "learning_rate": 8.060510644123486e-05, + "loss": 0.4972, + "step": 14918 + }, + { + "epoch": 1.0108408428755336, + "grad_norm": 6.031118869781494, + "learning_rate": 8.060373742213704e-05, + "loss": 0.8361, + "step": 14919 + }, + { + "epoch": 1.0109085981435058, + "grad_norm": 8.682488441467285, + "learning_rate": 8.060236840303923e-05, + "loss": 0.6005, + "step": 14920 + }, + { + "epoch": 1.0109763534114777, + "grad_norm": 6.82516622543335, + "learning_rate": 8.060099938394141e-05, + "loss": 0.7075, + "step": 14921 + }, + { + "epoch": 1.01104410867945, + "grad_norm": 6.251338958740234, + "learning_rate": 8.05996303648436e-05, + "loss": 0.7116, + "step": 14922 + }, + { + "epoch": 1.011111863947422, + "grad_norm": 7.739006519317627, + "learning_rate": 8.059826134574577e-05, + "loss": 0.9093, + "step": 14923 + }, + { + "epoch": 1.011179619215394, + "grad_norm": 6.702469825744629, + "learning_rate": 8.059689232664797e-05, + "loss": 0.5837, + "step": 14924 + }, + { + "epoch": 1.011247374483366, + "grad_norm": 6.131649494171143, + "learning_rate": 8.059552330755015e-05, + "loss": 0.5026, + "step": 14925 + }, + { + "epoch": 1.011315129751338, + "grad_norm": 4.714535713195801, + "learning_rate": 8.059415428845233e-05, + "loss": 0.5705, + "step": 14926 + }, + { + "epoch": 1.0113828850193103, + "grad_norm": 5.460876941680908, + "learning_rate": 8.059278526935451e-05, + "loss": 0.9523, + "step": 14927 + }, + { + "epoch": 1.0114506402872823, + "grad_norm": 5.706765174865723, + "learning_rate": 8.059141625025669e-05, + "loss": 0.7936, + "step": 14928 + }, + { + "epoch": 1.0115183955552545, + "grad_norm": 6.572486877441406, + "learning_rate": 8.059004723115888e-05, + "loss": 0.8149, + "step": 14929 + }, + { + "epoch": 1.0115861508232264, + "grad_norm": 9.406861305236816, + "learning_rate": 8.058867821206106e-05, + "loss": 0.5752, + "step": 14930 + }, + { + "epoch": 1.0116539060911987, + "grad_norm": 5.10825252532959, + "learning_rate": 8.058730919296324e-05, + "loss": 0.4359, + "step": 14931 + }, + { + "epoch": 1.0117216613591706, + "grad_norm": 6.5198516845703125, + "learning_rate": 8.058594017386542e-05, + "loss": 0.7252, + "step": 14932 + }, + { + "epoch": 1.0117894166271428, + "grad_norm": 5.7710981369018555, + "learning_rate": 8.05845711547676e-05, + "loss": 0.6666, + "step": 14933 + }, + { + "epoch": 1.0118571718951148, + "grad_norm": 5.927985191345215, + "learning_rate": 8.05832021356698e-05, + "loss": 0.7755, + "step": 14934 + }, + { + "epoch": 1.011924927163087, + "grad_norm": 7.585999488830566, + "learning_rate": 8.058183311657198e-05, + "loss": 0.7536, + "step": 14935 + }, + { + "epoch": 1.011992682431059, + "grad_norm": 4.760716915130615, + "learning_rate": 8.058046409747416e-05, + "loss": 0.6324, + "step": 14936 + }, + { + "epoch": 1.0120604376990312, + "grad_norm": 5.466569423675537, + "learning_rate": 8.057909507837634e-05, + "loss": 0.7267, + "step": 14937 + }, + { + "epoch": 1.0121281929670032, + "grad_norm": 4.7150797843933105, + "learning_rate": 8.057772605927853e-05, + "loss": 0.6393, + "step": 14938 + }, + { + "epoch": 1.0121959482349752, + "grad_norm": 4.741540908813477, + "learning_rate": 8.057635704018071e-05, + "loss": 0.4859, + "step": 14939 + }, + { + "epoch": 1.0122637035029474, + "grad_norm": 5.607858657836914, + "learning_rate": 8.05749880210829e-05, + "loss": 0.738, + "step": 14940 + }, + { + "epoch": 1.0123314587709193, + "grad_norm": 6.989736080169678, + "learning_rate": 8.057361900198508e-05, + "loss": 0.6129, + "step": 14941 + }, + { + "epoch": 1.0123992140388915, + "grad_norm": 9.055098533630371, + "learning_rate": 8.057224998288726e-05, + "loss": 0.7514, + "step": 14942 + }, + { + "epoch": 1.0124669693068635, + "grad_norm": 5.395411014556885, + "learning_rate": 8.057088096378945e-05, + "loss": 0.817, + "step": 14943 + }, + { + "epoch": 1.0125347245748357, + "grad_norm": 5.7741851806640625, + "learning_rate": 8.056951194469163e-05, + "loss": 0.5735, + "step": 14944 + }, + { + "epoch": 1.0126024798428077, + "grad_norm": 6.427124500274658, + "learning_rate": 8.056814292559381e-05, + "loss": 0.613, + "step": 14945 + }, + { + "epoch": 1.01267023511078, + "grad_norm": 4.106490135192871, + "learning_rate": 8.056677390649599e-05, + "loss": 0.6788, + "step": 14946 + }, + { + "epoch": 1.012737990378752, + "grad_norm": 5.7934370040893555, + "learning_rate": 8.056540488739818e-05, + "loss": 0.7784, + "step": 14947 + }, + { + "epoch": 1.012805745646724, + "grad_norm": 7.501285076141357, + "learning_rate": 8.056403586830036e-05, + "loss": 0.77, + "step": 14948 + }, + { + "epoch": 1.012873500914696, + "grad_norm": 4.6254472732543945, + "learning_rate": 8.056266684920254e-05, + "loss": 0.5918, + "step": 14949 + }, + { + "epoch": 1.0129412561826683, + "grad_norm": 5.470935821533203, + "learning_rate": 8.056129783010473e-05, + "loss": 0.6471, + "step": 14950 + }, + { + "epoch": 1.0130090114506403, + "grad_norm": 4.878664970397949, + "learning_rate": 8.055992881100692e-05, + "loss": 0.5186, + "step": 14951 + }, + { + "epoch": 1.0130767667186125, + "grad_norm": 5.364741802215576, + "learning_rate": 8.05585597919091e-05, + "loss": 0.7577, + "step": 14952 + }, + { + "epoch": 1.0131445219865844, + "grad_norm": 5.699579238891602, + "learning_rate": 8.055719077281128e-05, + "loss": 0.6527, + "step": 14953 + }, + { + "epoch": 1.0132122772545566, + "grad_norm": 5.406833648681641, + "learning_rate": 8.055582175371347e-05, + "loss": 0.7961, + "step": 14954 + }, + { + "epoch": 1.0132800325225286, + "grad_norm": 7.075387954711914, + "learning_rate": 8.055445273461565e-05, + "loss": 0.5801, + "step": 14955 + }, + { + "epoch": 1.0133477877905006, + "grad_norm": 7.106254577636719, + "learning_rate": 8.055308371551783e-05, + "loss": 0.7384, + "step": 14956 + }, + { + "epoch": 1.0134155430584728, + "grad_norm": 5.849175453186035, + "learning_rate": 8.055171469642003e-05, + "loss": 0.5413, + "step": 14957 + }, + { + "epoch": 1.0134832983264448, + "grad_norm": 10.159880638122559, + "learning_rate": 8.055034567732221e-05, + "loss": 0.6702, + "step": 14958 + }, + { + "epoch": 1.013551053594417, + "grad_norm": 5.166466236114502, + "learning_rate": 8.054897665822439e-05, + "loss": 0.8039, + "step": 14959 + }, + { + "epoch": 1.013618808862389, + "grad_norm": 5.084167957305908, + "learning_rate": 8.054760763912657e-05, + "loss": 0.6388, + "step": 14960 + }, + { + "epoch": 1.0136865641303612, + "grad_norm": 7.328620433807373, + "learning_rate": 8.054623862002876e-05, + "loss": 0.7152, + "step": 14961 + }, + { + "epoch": 1.0137543193983332, + "grad_norm": 6.941154479980469, + "learning_rate": 8.054486960093094e-05, + "loss": 0.744, + "step": 14962 + }, + { + "epoch": 1.0138220746663054, + "grad_norm": 5.957236289978027, + "learning_rate": 8.054350058183312e-05, + "loss": 0.5999, + "step": 14963 + }, + { + "epoch": 1.0138898299342773, + "grad_norm": 5.381267547607422, + "learning_rate": 8.05421315627353e-05, + "loss": 0.8635, + "step": 14964 + }, + { + "epoch": 1.0139575852022495, + "grad_norm": 5.371838569641113, + "learning_rate": 8.054076254363748e-05, + "loss": 0.6552, + "step": 14965 + }, + { + "epoch": 1.0140253404702215, + "grad_norm": 5.056934356689453, + "learning_rate": 8.053939352453968e-05, + "loss": 0.5843, + "step": 14966 + }, + { + "epoch": 1.0140930957381937, + "grad_norm": 5.23961877822876, + "learning_rate": 8.053802450544186e-05, + "loss": 0.8306, + "step": 14967 + }, + { + "epoch": 1.0141608510061657, + "grad_norm": 6.583441734313965, + "learning_rate": 8.053665548634404e-05, + "loss": 0.686, + "step": 14968 + }, + { + "epoch": 1.014228606274138, + "grad_norm": 7.94297981262207, + "learning_rate": 8.053528646724622e-05, + "loss": 0.7668, + "step": 14969 + }, + { + "epoch": 1.01429636154211, + "grad_norm": 5.7727885246276855, + "learning_rate": 8.053391744814841e-05, + "loss": 0.7958, + "step": 14970 + }, + { + "epoch": 1.014364116810082, + "grad_norm": 4.770255088806152, + "learning_rate": 8.05325484290506e-05, + "loss": 0.8039, + "step": 14971 + }, + { + "epoch": 1.014431872078054, + "grad_norm": 6.65435266494751, + "learning_rate": 8.053117940995277e-05, + "loss": 0.7278, + "step": 14972 + }, + { + "epoch": 1.014499627346026, + "grad_norm": 5.573418140411377, + "learning_rate": 8.052981039085495e-05, + "loss": 0.8147, + "step": 14973 + }, + { + "epoch": 1.0145673826139983, + "grad_norm": 6.491858005523682, + "learning_rate": 8.052844137175713e-05, + "loss": 0.6853, + "step": 14974 + }, + { + "epoch": 1.0146351378819702, + "grad_norm": 5.823346138000488, + "learning_rate": 8.052707235265933e-05, + "loss": 0.6468, + "step": 14975 + }, + { + "epoch": 1.0147028931499424, + "grad_norm": 7.527129650115967, + "learning_rate": 8.052570333356151e-05, + "loss": 0.7114, + "step": 14976 + }, + { + "epoch": 1.0147706484179144, + "grad_norm": 5.738404273986816, + "learning_rate": 8.052433431446369e-05, + "loss": 0.584, + "step": 14977 + }, + { + "epoch": 1.0148384036858866, + "grad_norm": 5.364207744598389, + "learning_rate": 8.052296529536587e-05, + "loss": 0.7407, + "step": 14978 + }, + { + "epoch": 1.0149061589538586, + "grad_norm": 5.664304256439209, + "learning_rate": 8.052159627626806e-05, + "loss": 0.5753, + "step": 14979 + }, + { + "epoch": 1.0149739142218308, + "grad_norm": 8.1001558303833, + "learning_rate": 8.052022725717024e-05, + "loss": 0.6435, + "step": 14980 + }, + { + "epoch": 1.0150416694898028, + "grad_norm": 6.447721481323242, + "learning_rate": 8.051885823807242e-05, + "loss": 0.7063, + "step": 14981 + }, + { + "epoch": 1.015109424757775, + "grad_norm": 5.5510969161987305, + "learning_rate": 8.05174892189746e-05, + "loss": 0.5911, + "step": 14982 + }, + { + "epoch": 1.015177180025747, + "grad_norm": 5.534966468811035, + "learning_rate": 8.051612019987678e-05, + "loss": 0.5952, + "step": 14983 + }, + { + "epoch": 1.0152449352937192, + "grad_norm": 5.641912460327148, + "learning_rate": 8.051475118077898e-05, + "loss": 0.8201, + "step": 14984 + }, + { + "epoch": 1.0153126905616912, + "grad_norm": 5.643457412719727, + "learning_rate": 8.051338216168116e-05, + "loss": 0.5825, + "step": 14985 + }, + { + "epoch": 1.0153804458296634, + "grad_norm": 5.161952972412109, + "learning_rate": 8.051201314258334e-05, + "loss": 0.5534, + "step": 14986 + }, + { + "epoch": 1.0154482010976353, + "grad_norm": 6.023798942565918, + "learning_rate": 8.051064412348552e-05, + "loss": 0.7081, + "step": 14987 + }, + { + "epoch": 1.0155159563656073, + "grad_norm": 8.365532875061035, + "learning_rate": 8.05092751043877e-05, + "loss": 0.6779, + "step": 14988 + }, + { + "epoch": 1.0155837116335795, + "grad_norm": 6.577813625335693, + "learning_rate": 8.05079060852899e-05, + "loss": 0.6511, + "step": 14989 + }, + { + "epoch": 1.0156514669015515, + "grad_norm": 6.61271858215332, + "learning_rate": 8.050653706619207e-05, + "loss": 0.6625, + "step": 14990 + }, + { + "epoch": 1.0157192221695237, + "grad_norm": 5.8450517654418945, + "learning_rate": 8.050516804709425e-05, + "loss": 0.6154, + "step": 14991 + }, + { + "epoch": 1.0157869774374957, + "grad_norm": 6.10211181640625, + "learning_rate": 8.050379902799644e-05, + "loss": 0.7207, + "step": 14992 + }, + { + "epoch": 1.0158547327054679, + "grad_norm": 6.336976528167725, + "learning_rate": 8.050243000889863e-05, + "loss": 0.7093, + "step": 14993 + }, + { + "epoch": 1.0159224879734399, + "grad_norm": 5.03262996673584, + "learning_rate": 8.050106098980081e-05, + "loss": 0.5328, + "step": 14994 + }, + { + "epoch": 1.015990243241412, + "grad_norm": 6.129831314086914, + "learning_rate": 8.049969197070299e-05, + "loss": 1.0295, + "step": 14995 + }, + { + "epoch": 1.016057998509384, + "grad_norm": 5.919708728790283, + "learning_rate": 8.049832295160517e-05, + "loss": 0.8243, + "step": 14996 + }, + { + "epoch": 1.0161257537773563, + "grad_norm": 11.239713668823242, + "learning_rate": 8.049695393250736e-05, + "loss": 0.6003, + "step": 14997 + }, + { + "epoch": 1.0161935090453282, + "grad_norm": 4.68793249130249, + "learning_rate": 8.049558491340954e-05, + "loss": 0.64, + "step": 14998 + }, + { + "epoch": 1.0162612643133004, + "grad_norm": 6.901069164276123, + "learning_rate": 8.049421589431172e-05, + "loss": 0.7363, + "step": 14999 + }, + { + "epoch": 1.0163290195812724, + "grad_norm": 5.286013603210449, + "learning_rate": 8.049284687521392e-05, + "loss": 0.5766, + "step": 15000 + }, + { + "epoch": 1.0163967748492446, + "grad_norm": 9.495491027832031, + "learning_rate": 8.04914778561161e-05, + "loss": 0.5364, + "step": 15001 + }, + { + "epoch": 1.0164645301172166, + "grad_norm": 5.1024932861328125, + "learning_rate": 8.049010883701828e-05, + "loss": 0.7218, + "step": 15002 + }, + { + "epoch": 1.0165322853851888, + "grad_norm": 6.650432586669922, + "learning_rate": 8.048873981792047e-05, + "loss": 0.8232, + "step": 15003 + }, + { + "epoch": 1.0166000406531608, + "grad_norm": 6.3598313331604, + "learning_rate": 8.048737079882265e-05, + "loss": 0.5487, + "step": 15004 + }, + { + "epoch": 1.0166677959211328, + "grad_norm": 7.906015872955322, + "learning_rate": 8.048600177972483e-05, + "loss": 0.7754, + "step": 15005 + }, + { + "epoch": 1.016735551189105, + "grad_norm": 8.92300033569336, + "learning_rate": 8.048463276062701e-05, + "loss": 0.8194, + "step": 15006 + }, + { + "epoch": 1.016803306457077, + "grad_norm": 5.348727703094482, + "learning_rate": 8.048326374152921e-05, + "loss": 0.7689, + "step": 15007 + }, + { + "epoch": 1.0168710617250492, + "grad_norm": 4.177212715148926, + "learning_rate": 8.048189472243139e-05, + "loss": 0.5612, + "step": 15008 + }, + { + "epoch": 1.0169388169930211, + "grad_norm": 5.465108871459961, + "learning_rate": 8.048052570333357e-05, + "loss": 0.7477, + "step": 15009 + }, + { + "epoch": 1.0170065722609933, + "grad_norm": 6.203986644744873, + "learning_rate": 8.047915668423575e-05, + "loss": 0.6927, + "step": 15010 + }, + { + "epoch": 1.0170743275289653, + "grad_norm": 5.149818420410156, + "learning_rate": 8.047778766513793e-05, + "loss": 0.6977, + "step": 15011 + }, + { + "epoch": 1.0171420827969375, + "grad_norm": 8.325011253356934, + "learning_rate": 8.047641864604012e-05, + "loss": 0.6829, + "step": 15012 + }, + { + "epoch": 1.0172098380649095, + "grad_norm": 5.652045726776123, + "learning_rate": 8.04750496269423e-05, + "loss": 0.7819, + "step": 15013 + }, + { + "epoch": 1.0172775933328817, + "grad_norm": 5.549155235290527, + "learning_rate": 8.047368060784448e-05, + "loss": 0.6268, + "step": 15014 + }, + { + "epoch": 1.0173453486008537, + "grad_norm": 6.955049991607666, + "learning_rate": 8.047231158874666e-05, + "loss": 0.8359, + "step": 15015 + }, + { + "epoch": 1.0174131038688259, + "grad_norm": 5.063580513000488, + "learning_rate": 8.047094256964886e-05, + "loss": 0.6845, + "step": 15016 + }, + { + "epoch": 1.0174808591367979, + "grad_norm": 4.619150638580322, + "learning_rate": 8.046957355055104e-05, + "loss": 0.5725, + "step": 15017 + }, + { + "epoch": 1.01754861440477, + "grad_norm": 5.2023820877075195, + "learning_rate": 8.046820453145322e-05, + "loss": 0.8112, + "step": 15018 + }, + { + "epoch": 1.017616369672742, + "grad_norm": 6.25310754776001, + "learning_rate": 8.04668355123554e-05, + "loss": 0.772, + "step": 15019 + }, + { + "epoch": 1.0176841249407143, + "grad_norm": 6.057315349578857, + "learning_rate": 8.046546649325758e-05, + "loss": 0.726, + "step": 15020 + }, + { + "epoch": 1.0177518802086862, + "grad_norm": 6.200272560119629, + "learning_rate": 8.046409747415977e-05, + "loss": 0.8038, + "step": 15021 + }, + { + "epoch": 1.0178196354766582, + "grad_norm": 6.192685604095459, + "learning_rate": 8.046272845506195e-05, + "loss": 0.8221, + "step": 15022 + }, + { + "epoch": 1.0178873907446304, + "grad_norm": 6.599914073944092, + "learning_rate": 8.046135943596413e-05, + "loss": 0.8947, + "step": 15023 + }, + { + "epoch": 1.0179551460126024, + "grad_norm": 5.081368923187256, + "learning_rate": 8.045999041686631e-05, + "loss": 0.5658, + "step": 15024 + }, + { + "epoch": 1.0180229012805746, + "grad_norm": 4.629168510437012, + "learning_rate": 8.045862139776851e-05, + "loss": 0.4918, + "step": 15025 + }, + { + "epoch": 1.0180906565485466, + "grad_norm": 5.531469345092773, + "learning_rate": 8.045725237867069e-05, + "loss": 0.9338, + "step": 15026 + }, + { + "epoch": 1.0181584118165188, + "grad_norm": 5.594359874725342, + "learning_rate": 8.045588335957287e-05, + "loss": 0.8965, + "step": 15027 + }, + { + "epoch": 1.0182261670844908, + "grad_norm": 5.746640205383301, + "learning_rate": 8.045451434047505e-05, + "loss": 0.8001, + "step": 15028 + }, + { + "epoch": 1.018293922352463, + "grad_norm": 6.410648822784424, + "learning_rate": 8.045314532137723e-05, + "loss": 0.6035, + "step": 15029 + }, + { + "epoch": 1.018361677620435, + "grad_norm": 5.184842109680176, + "learning_rate": 8.045177630227942e-05, + "loss": 0.6867, + "step": 15030 + }, + { + "epoch": 1.0184294328884071, + "grad_norm": 4.650674343109131, + "learning_rate": 8.04504072831816e-05, + "loss": 0.5872, + "step": 15031 + }, + { + "epoch": 1.0184971881563791, + "grad_norm": 5.6024346351623535, + "learning_rate": 8.044903826408378e-05, + "loss": 0.7804, + "step": 15032 + }, + { + "epoch": 1.0185649434243513, + "grad_norm": 4.8978047370910645, + "learning_rate": 8.044766924498596e-05, + "loss": 0.6067, + "step": 15033 + }, + { + "epoch": 1.0186326986923233, + "grad_norm": 5.678053379058838, + "learning_rate": 8.044630022588816e-05, + "loss": 0.7362, + "step": 15034 + }, + { + "epoch": 1.0187004539602955, + "grad_norm": 5.044686317443848, + "learning_rate": 8.044493120679034e-05, + "loss": 0.9308, + "step": 15035 + }, + { + "epoch": 1.0187682092282675, + "grad_norm": 6.241668701171875, + "learning_rate": 8.044356218769252e-05, + "loss": 0.7732, + "step": 15036 + }, + { + "epoch": 1.0188359644962395, + "grad_norm": 5.17287540435791, + "learning_rate": 8.04421931685947e-05, + "loss": 0.6823, + "step": 15037 + }, + { + "epoch": 1.0189037197642117, + "grad_norm": 5.231973648071289, + "learning_rate": 8.044082414949688e-05, + "loss": 0.5864, + "step": 15038 + }, + { + "epoch": 1.0189714750321837, + "grad_norm": 5.848363876342773, + "learning_rate": 8.043945513039907e-05, + "loss": 0.6496, + "step": 15039 + }, + { + "epoch": 1.0190392303001559, + "grad_norm": 5.491754531860352, + "learning_rate": 8.043808611130125e-05, + "loss": 0.6945, + "step": 15040 + }, + { + "epoch": 1.0191069855681278, + "grad_norm": 5.215052604675293, + "learning_rate": 8.043671709220343e-05, + "loss": 0.7234, + "step": 15041 + }, + { + "epoch": 1.0191747408361, + "grad_norm": 4.157303810119629, + "learning_rate": 8.043534807310561e-05, + "loss": 0.5688, + "step": 15042 + }, + { + "epoch": 1.019242496104072, + "grad_norm": 5.955345630645752, + "learning_rate": 8.043397905400781e-05, + "loss": 0.7193, + "step": 15043 + }, + { + "epoch": 1.0193102513720442, + "grad_norm": 5.868755340576172, + "learning_rate": 8.043261003490999e-05, + "loss": 0.7041, + "step": 15044 + }, + { + "epoch": 1.0193780066400162, + "grad_norm": 5.522411823272705, + "learning_rate": 8.043124101581217e-05, + "loss": 0.6228, + "step": 15045 + }, + { + "epoch": 1.0194457619079884, + "grad_norm": 5.676867961883545, + "learning_rate": 8.042987199671436e-05, + "loss": 0.6802, + "step": 15046 + }, + { + "epoch": 1.0195135171759604, + "grad_norm": 5.678533554077148, + "learning_rate": 8.042850297761654e-05, + "loss": 0.8044, + "step": 15047 + }, + { + "epoch": 1.0195812724439326, + "grad_norm": 6.804957866668701, + "learning_rate": 8.042713395851872e-05, + "loss": 0.8231, + "step": 15048 + }, + { + "epoch": 1.0196490277119046, + "grad_norm": 4.685696601867676, + "learning_rate": 8.042576493942092e-05, + "loss": 0.5464, + "step": 15049 + }, + { + "epoch": 1.0197167829798768, + "grad_norm": 5.141258239746094, + "learning_rate": 8.04243959203231e-05, + "loss": 0.5144, + "step": 15050 + }, + { + "epoch": 1.0197845382478488, + "grad_norm": 5.105460166931152, + "learning_rate": 8.042302690122528e-05, + "loss": 0.6002, + "step": 15051 + }, + { + "epoch": 1.019852293515821, + "grad_norm": 6.820575714111328, + "learning_rate": 8.042165788212746e-05, + "loss": 0.5949, + "step": 15052 + }, + { + "epoch": 1.019920048783793, + "grad_norm": 6.264902114868164, + "learning_rate": 8.042028886302965e-05, + "loss": 0.8404, + "step": 15053 + }, + { + "epoch": 1.019987804051765, + "grad_norm": 6.1790385246276855, + "learning_rate": 8.041891984393183e-05, + "loss": 0.5916, + "step": 15054 + }, + { + "epoch": 1.0200555593197371, + "grad_norm": 5.779554843902588, + "learning_rate": 8.041755082483401e-05, + "loss": 0.6559, + "step": 15055 + }, + { + "epoch": 1.020123314587709, + "grad_norm": 5.888326168060303, + "learning_rate": 8.04161818057362e-05, + "loss": 0.5994, + "step": 15056 + }, + { + "epoch": 1.0201910698556813, + "grad_norm": 5.410882949829102, + "learning_rate": 8.041481278663839e-05, + "loss": 0.7267, + "step": 15057 + }, + { + "epoch": 1.0202588251236533, + "grad_norm": 5.042235374450684, + "learning_rate": 8.041344376754057e-05, + "loss": 0.543, + "step": 15058 + }, + { + "epoch": 1.0203265803916255, + "grad_norm": 5.355964183807373, + "learning_rate": 8.041207474844275e-05, + "loss": 0.7188, + "step": 15059 + }, + { + "epoch": 1.0203943356595975, + "grad_norm": 4.4993510246276855, + "learning_rate": 8.041070572934493e-05, + "loss": 0.5713, + "step": 15060 + }, + { + "epoch": 1.0204620909275697, + "grad_norm": 6.155606746673584, + "learning_rate": 8.040933671024711e-05, + "loss": 0.7806, + "step": 15061 + }, + { + "epoch": 1.0205298461955417, + "grad_norm": 6.213912010192871, + "learning_rate": 8.04079676911493e-05, + "loss": 0.662, + "step": 15062 + }, + { + "epoch": 1.0205976014635139, + "grad_norm": 4.450127124786377, + "learning_rate": 8.040659867205148e-05, + "loss": 0.7145, + "step": 15063 + }, + { + "epoch": 1.0206653567314858, + "grad_norm": 5.589287757873535, + "learning_rate": 8.040522965295366e-05, + "loss": 0.7071, + "step": 15064 + }, + { + "epoch": 1.020733111999458, + "grad_norm": 6.013401985168457, + "learning_rate": 8.040386063385584e-05, + "loss": 0.5325, + "step": 15065 + }, + { + "epoch": 1.02080086726743, + "grad_norm": 6.579004287719727, + "learning_rate": 8.040249161475802e-05, + "loss": 0.6771, + "step": 15066 + }, + { + "epoch": 1.0208686225354022, + "grad_norm": 6.377874374389648, + "learning_rate": 8.040112259566022e-05, + "loss": 0.9557, + "step": 15067 + }, + { + "epoch": 1.0209363778033742, + "grad_norm": 6.904638767242432, + "learning_rate": 8.03997535765624e-05, + "loss": 0.8783, + "step": 15068 + }, + { + "epoch": 1.0210041330713464, + "grad_norm": 6.453176975250244, + "learning_rate": 8.039838455746458e-05, + "loss": 0.675, + "step": 15069 + }, + { + "epoch": 1.0210718883393184, + "grad_norm": 4.14583158493042, + "learning_rate": 8.039701553836676e-05, + "loss": 0.6133, + "step": 15070 + }, + { + "epoch": 1.0211396436072904, + "grad_norm": 9.864669799804688, + "learning_rate": 8.039564651926895e-05, + "loss": 0.515, + "step": 15071 + }, + { + "epoch": 1.0212073988752626, + "grad_norm": 13.798471450805664, + "learning_rate": 8.039427750017113e-05, + "loss": 0.7913, + "step": 15072 + }, + { + "epoch": 1.0212751541432346, + "grad_norm": 6.997447967529297, + "learning_rate": 8.039290848107331e-05, + "loss": 0.7798, + "step": 15073 + }, + { + "epoch": 1.0213429094112068, + "grad_norm": 7.579859256744385, + "learning_rate": 8.03915394619755e-05, + "loss": 0.4848, + "step": 15074 + }, + { + "epoch": 1.0214106646791787, + "grad_norm": 6.31156063079834, + "learning_rate": 8.039017044287767e-05, + "loss": 0.8056, + "step": 15075 + }, + { + "epoch": 1.021478419947151, + "grad_norm": 10.114006042480469, + "learning_rate": 8.038880142377987e-05, + "loss": 0.7082, + "step": 15076 + }, + { + "epoch": 1.021546175215123, + "grad_norm": 4.31792688369751, + "learning_rate": 8.038743240468205e-05, + "loss": 0.4963, + "step": 15077 + }, + { + "epoch": 1.0216139304830951, + "grad_norm": 8.745311737060547, + "learning_rate": 8.038606338558423e-05, + "loss": 0.6986, + "step": 15078 + }, + { + "epoch": 1.021681685751067, + "grad_norm": 6.955810070037842, + "learning_rate": 8.038469436648641e-05, + "loss": 0.6558, + "step": 15079 + }, + { + "epoch": 1.0217494410190393, + "grad_norm": 5.911043167114258, + "learning_rate": 8.03833253473886e-05, + "loss": 0.8858, + "step": 15080 + }, + { + "epoch": 1.0218171962870113, + "grad_norm": 5.10263204574585, + "learning_rate": 8.038195632829078e-05, + "loss": 0.5157, + "step": 15081 + }, + { + "epoch": 1.0218849515549835, + "grad_norm": 5.808590888977051, + "learning_rate": 8.038058730919296e-05, + "loss": 0.7309, + "step": 15082 + }, + { + "epoch": 1.0219527068229555, + "grad_norm": 5.918943405151367, + "learning_rate": 8.037921829009514e-05, + "loss": 0.6521, + "step": 15083 + }, + { + "epoch": 1.0220204620909277, + "grad_norm": 10.792163848876953, + "learning_rate": 8.037784927099732e-05, + "loss": 0.8682, + "step": 15084 + }, + { + "epoch": 1.0220882173588997, + "grad_norm": 6.594424724578857, + "learning_rate": 8.037648025189952e-05, + "loss": 0.567, + "step": 15085 + }, + { + "epoch": 1.0221559726268716, + "grad_norm": 7.634060382843018, + "learning_rate": 8.03751112328017e-05, + "loss": 0.6299, + "step": 15086 + }, + { + "epoch": 1.0222237278948438, + "grad_norm": 13.075284004211426, + "learning_rate": 8.037374221370388e-05, + "loss": 0.6774, + "step": 15087 + }, + { + "epoch": 1.0222914831628158, + "grad_norm": 5.656576156616211, + "learning_rate": 8.037237319460606e-05, + "loss": 0.7564, + "step": 15088 + }, + { + "epoch": 1.022359238430788, + "grad_norm": 4.356131553649902, + "learning_rate": 8.037100417550825e-05, + "loss": 0.6635, + "step": 15089 + }, + { + "epoch": 1.02242699369876, + "grad_norm": 8.44076919555664, + "learning_rate": 8.036963515641043e-05, + "loss": 0.8179, + "step": 15090 + }, + { + "epoch": 1.0224947489667322, + "grad_norm": 6.754337310791016, + "learning_rate": 8.036826613731261e-05, + "loss": 0.6055, + "step": 15091 + }, + { + "epoch": 1.0225625042347042, + "grad_norm": 4.849721908569336, + "learning_rate": 8.036689711821481e-05, + "loss": 0.5342, + "step": 15092 + }, + { + "epoch": 1.0226302595026764, + "grad_norm": 6.975347518920898, + "learning_rate": 8.036552809911699e-05, + "loss": 0.7692, + "step": 15093 + }, + { + "epoch": 1.0226980147706484, + "grad_norm": 5.742364406585693, + "learning_rate": 8.036415908001917e-05, + "loss": 0.605, + "step": 15094 + }, + { + "epoch": 1.0227657700386206, + "grad_norm": 7.3798909187316895, + "learning_rate": 8.036279006092136e-05, + "loss": 0.7643, + "step": 15095 + }, + { + "epoch": 1.0228335253065926, + "grad_norm": 6.8255510330200195, + "learning_rate": 8.036142104182354e-05, + "loss": 0.6628, + "step": 15096 + }, + { + "epoch": 1.0229012805745648, + "grad_norm": 6.594735145568848, + "learning_rate": 8.036005202272572e-05, + "loss": 0.7144, + "step": 15097 + }, + { + "epoch": 1.0229690358425367, + "grad_norm": 5.225235462188721, + "learning_rate": 8.03586830036279e-05, + "loss": 0.8352, + "step": 15098 + }, + { + "epoch": 1.023036791110509, + "grad_norm": 6.080996036529541, + "learning_rate": 8.03573139845301e-05, + "loss": 0.8509, + "step": 15099 + }, + { + "epoch": 1.023104546378481, + "grad_norm": 9.194795608520508, + "learning_rate": 8.035594496543228e-05, + "loss": 0.7135, + "step": 15100 + }, + { + "epoch": 1.0231723016464531, + "grad_norm": 5.869052410125732, + "learning_rate": 8.035457594633446e-05, + "loss": 0.5557, + "step": 15101 + }, + { + "epoch": 1.023240056914425, + "grad_norm": 5.882430076599121, + "learning_rate": 8.035320692723664e-05, + "loss": 0.6686, + "step": 15102 + }, + { + "epoch": 1.023307812182397, + "grad_norm": 5.315283298492432, + "learning_rate": 8.035183790813883e-05, + "loss": 0.5841, + "step": 15103 + }, + { + "epoch": 1.0233755674503693, + "grad_norm": 4.697854042053223, + "learning_rate": 8.035046888904101e-05, + "loss": 0.6619, + "step": 15104 + }, + { + "epoch": 1.0234433227183413, + "grad_norm": 5.584891319274902, + "learning_rate": 8.034909986994319e-05, + "loss": 0.7641, + "step": 15105 + }, + { + "epoch": 1.0235110779863135, + "grad_norm": 6.978733539581299, + "learning_rate": 8.034773085084537e-05, + "loss": 0.7094, + "step": 15106 + }, + { + "epoch": 1.0235788332542854, + "grad_norm": 8.549322128295898, + "learning_rate": 8.034636183174755e-05, + "loss": 0.6725, + "step": 15107 + }, + { + "epoch": 1.0236465885222576, + "grad_norm": 5.663527488708496, + "learning_rate": 8.034499281264975e-05, + "loss": 0.609, + "step": 15108 + }, + { + "epoch": 1.0237143437902296, + "grad_norm": 5.843603610992432, + "learning_rate": 8.034362379355193e-05, + "loss": 0.7198, + "step": 15109 + }, + { + "epoch": 1.0237820990582018, + "grad_norm": 6.912360668182373, + "learning_rate": 8.034225477445411e-05, + "loss": 0.728, + "step": 15110 + }, + { + "epoch": 1.0238498543261738, + "grad_norm": 4.6928205490112305, + "learning_rate": 8.034088575535629e-05, + "loss": 0.8542, + "step": 15111 + }, + { + "epoch": 1.023917609594146, + "grad_norm": 5.714449405670166, + "learning_rate": 8.033951673625848e-05, + "loss": 0.8885, + "step": 15112 + }, + { + "epoch": 1.023985364862118, + "grad_norm": 9.74325180053711, + "learning_rate": 8.033814771716066e-05, + "loss": 0.7175, + "step": 15113 + }, + { + "epoch": 1.0240531201300902, + "grad_norm": 6.803297519683838, + "learning_rate": 8.033677869806284e-05, + "loss": 0.8037, + "step": 15114 + }, + { + "epoch": 1.0241208753980622, + "grad_norm": 5.692788600921631, + "learning_rate": 8.033540967896502e-05, + "loss": 0.6073, + "step": 15115 + }, + { + "epoch": 1.0241886306660344, + "grad_norm": 4.766846179962158, + "learning_rate": 8.03340406598672e-05, + "loss": 0.687, + "step": 15116 + }, + { + "epoch": 1.0242563859340064, + "grad_norm": 7.894632816314697, + "learning_rate": 8.03326716407694e-05, + "loss": 0.6866, + "step": 15117 + }, + { + "epoch": 1.0243241412019786, + "grad_norm": 8.924421310424805, + "learning_rate": 8.033130262167158e-05, + "loss": 0.6033, + "step": 15118 + }, + { + "epoch": 1.0243918964699505, + "grad_norm": 4.983919620513916, + "learning_rate": 8.032993360257376e-05, + "loss": 0.7635, + "step": 15119 + }, + { + "epoch": 1.0244596517379225, + "grad_norm": 6.151333808898926, + "learning_rate": 8.032856458347594e-05, + "loss": 0.5241, + "step": 15120 + }, + { + "epoch": 1.0245274070058947, + "grad_norm": 5.959475517272949, + "learning_rate": 8.032719556437812e-05, + "loss": 0.801, + "step": 15121 + }, + { + "epoch": 1.0245951622738667, + "grad_norm": 6.7095255851745605, + "learning_rate": 8.032582654528031e-05, + "loss": 0.5638, + "step": 15122 + }, + { + "epoch": 1.024662917541839, + "grad_norm": 6.7581939697265625, + "learning_rate": 8.03244575261825e-05, + "loss": 0.7744, + "step": 15123 + }, + { + "epoch": 1.024730672809811, + "grad_norm": 4.929287433624268, + "learning_rate": 8.032308850708467e-05, + "loss": 0.5911, + "step": 15124 + }, + { + "epoch": 1.024798428077783, + "grad_norm": 7.830074310302734, + "learning_rate": 8.032171948798685e-05, + "loss": 0.5202, + "step": 15125 + }, + { + "epoch": 1.024866183345755, + "grad_norm": 5.351446628570557, + "learning_rate": 8.032035046888905e-05, + "loss": 0.5243, + "step": 15126 + }, + { + "epoch": 1.0249339386137273, + "grad_norm": 6.004148960113525, + "learning_rate": 8.031898144979123e-05, + "loss": 0.7672, + "step": 15127 + }, + { + "epoch": 1.0250016938816993, + "grad_norm": 5.840296745300293, + "learning_rate": 8.031761243069341e-05, + "loss": 0.6149, + "step": 15128 + }, + { + "epoch": 1.0250694491496715, + "grad_norm": 5.449799537658691, + "learning_rate": 8.031624341159559e-05, + "loss": 0.9104, + "step": 15129 + }, + { + "epoch": 1.0251372044176434, + "grad_norm": 4.739322662353516, + "learning_rate": 8.031487439249777e-05, + "loss": 0.586, + "step": 15130 + }, + { + "epoch": 1.0252049596856156, + "grad_norm": 6.538787841796875, + "learning_rate": 8.031350537339996e-05, + "loss": 0.8415, + "step": 15131 + }, + { + "epoch": 1.0252727149535876, + "grad_norm": 6.406269550323486, + "learning_rate": 8.031213635430214e-05, + "loss": 0.8221, + "step": 15132 + }, + { + "epoch": 1.0253404702215598, + "grad_norm": 5.359891891479492, + "learning_rate": 8.031076733520432e-05, + "loss": 0.7246, + "step": 15133 + }, + { + "epoch": 1.0254082254895318, + "grad_norm": 6.871723175048828, + "learning_rate": 8.03093983161065e-05, + "loss": 0.8127, + "step": 15134 + }, + { + "epoch": 1.0254759807575038, + "grad_norm": 7.51058292388916, + "learning_rate": 8.03080292970087e-05, + "loss": 0.8611, + "step": 15135 + }, + { + "epoch": 1.025543736025476, + "grad_norm": 6.180769920349121, + "learning_rate": 8.030666027791088e-05, + "loss": 0.6036, + "step": 15136 + }, + { + "epoch": 1.025611491293448, + "grad_norm": 4.803264141082764, + "learning_rate": 8.030529125881306e-05, + "loss": 0.6907, + "step": 15137 + }, + { + "epoch": 1.0256792465614202, + "grad_norm": 4.692986965179443, + "learning_rate": 8.030392223971524e-05, + "loss": 0.5427, + "step": 15138 + }, + { + "epoch": 1.0257470018293922, + "grad_norm": 4.468636512756348, + "learning_rate": 8.030255322061743e-05, + "loss": 0.5688, + "step": 15139 + }, + { + "epoch": 1.0258147570973644, + "grad_norm": 5.465978622436523, + "learning_rate": 8.030118420151961e-05, + "loss": 0.6728, + "step": 15140 + }, + { + "epoch": 1.0258825123653363, + "grad_norm": 7.363076686859131, + "learning_rate": 8.02998151824218e-05, + "loss": 0.608, + "step": 15141 + }, + { + "epoch": 1.0259502676333085, + "grad_norm": 6.24710750579834, + "learning_rate": 8.029844616332399e-05, + "loss": 0.7095, + "step": 15142 + }, + { + "epoch": 1.0260180229012805, + "grad_norm": 5.708677291870117, + "learning_rate": 8.029707714422617e-05, + "loss": 0.7028, + "step": 15143 + }, + { + "epoch": 1.0260857781692527, + "grad_norm": 6.651005744934082, + "learning_rate": 8.029570812512835e-05, + "loss": 0.8753, + "step": 15144 + }, + { + "epoch": 1.0261535334372247, + "grad_norm": 7.2024688720703125, + "learning_rate": 8.029433910603054e-05, + "loss": 0.6307, + "step": 15145 + }, + { + "epoch": 1.026221288705197, + "grad_norm": 8.108603477478027, + "learning_rate": 8.029297008693272e-05, + "loss": 0.7454, + "step": 15146 + }, + { + "epoch": 1.026289043973169, + "grad_norm": 8.093022346496582, + "learning_rate": 8.02916010678349e-05, + "loss": 0.6492, + "step": 15147 + }, + { + "epoch": 1.026356799241141, + "grad_norm": 6.092846393585205, + "learning_rate": 8.029023204873708e-05, + "loss": 0.8324, + "step": 15148 + }, + { + "epoch": 1.026424554509113, + "grad_norm": 7.56104040145874, + "learning_rate": 8.028886302963928e-05, + "loss": 0.7282, + "step": 15149 + }, + { + "epoch": 1.0264923097770853, + "grad_norm": 5.667774677276611, + "learning_rate": 8.028749401054146e-05, + "loss": 0.9169, + "step": 15150 + }, + { + "epoch": 1.0265600650450573, + "grad_norm": 6.644709587097168, + "learning_rate": 8.028612499144364e-05, + "loss": 0.6861, + "step": 15151 + }, + { + "epoch": 1.0266278203130292, + "grad_norm": 5.4562578201293945, + "learning_rate": 8.028475597234582e-05, + "loss": 0.8854, + "step": 15152 + }, + { + "epoch": 1.0266955755810014, + "grad_norm": 4.384959697723389, + "learning_rate": 8.0283386953248e-05, + "loss": 0.6736, + "step": 15153 + }, + { + "epoch": 1.0267633308489734, + "grad_norm": 5.349459648132324, + "learning_rate": 8.028201793415019e-05, + "loss": 0.704, + "step": 15154 + }, + { + "epoch": 1.0268310861169456, + "grad_norm": 4.929141521453857, + "learning_rate": 8.028064891505237e-05, + "loss": 0.6705, + "step": 15155 + }, + { + "epoch": 1.0268988413849176, + "grad_norm": 5.738053798675537, + "learning_rate": 8.027927989595455e-05, + "loss": 0.7195, + "step": 15156 + }, + { + "epoch": 1.0269665966528898, + "grad_norm": 5.528050899505615, + "learning_rate": 8.027791087685673e-05, + "loss": 0.707, + "step": 15157 + }, + { + "epoch": 1.0270343519208618, + "grad_norm": 5.940127372741699, + "learning_rate": 8.027654185775893e-05, + "loss": 0.7448, + "step": 15158 + }, + { + "epoch": 1.027102107188834, + "grad_norm": 5.716156482696533, + "learning_rate": 8.027517283866111e-05, + "loss": 0.5385, + "step": 15159 + }, + { + "epoch": 1.027169862456806, + "grad_norm": 6.133981227874756, + "learning_rate": 8.027380381956329e-05, + "loss": 0.459, + "step": 15160 + }, + { + "epoch": 1.0272376177247782, + "grad_norm": 4.139076232910156, + "learning_rate": 8.027243480046547e-05, + "loss": 0.5117, + "step": 15161 + }, + { + "epoch": 1.0273053729927502, + "grad_norm": 5.8663482666015625, + "learning_rate": 8.027106578136765e-05, + "loss": 0.6677, + "step": 15162 + }, + { + "epoch": 1.0273731282607224, + "grad_norm": 6.346996784210205, + "learning_rate": 8.026969676226984e-05, + "loss": 0.794, + "step": 15163 + }, + { + "epoch": 1.0274408835286943, + "grad_norm": 6.667228698730469, + "learning_rate": 8.026832774317202e-05, + "loss": 0.6352, + "step": 15164 + }, + { + "epoch": 1.0275086387966665, + "grad_norm": 4.740386962890625, + "learning_rate": 8.02669587240742e-05, + "loss": 0.5717, + "step": 15165 + }, + { + "epoch": 1.0275763940646385, + "grad_norm": 7.014326095581055, + "learning_rate": 8.026558970497638e-05, + "loss": 0.831, + "step": 15166 + }, + { + "epoch": 1.0276441493326105, + "grad_norm": 5.337719440460205, + "learning_rate": 8.026422068587858e-05, + "loss": 0.7297, + "step": 15167 + }, + { + "epoch": 1.0277119046005827, + "grad_norm": 5.5287957191467285, + "learning_rate": 8.026285166678076e-05, + "loss": 0.8606, + "step": 15168 + }, + { + "epoch": 1.0277796598685547, + "grad_norm": 3.9618964195251465, + "learning_rate": 8.026148264768294e-05, + "loss": 0.4601, + "step": 15169 + }, + { + "epoch": 1.0278474151365269, + "grad_norm": 6.047200679779053, + "learning_rate": 8.026011362858512e-05, + "loss": 0.7246, + "step": 15170 + }, + { + "epoch": 1.0279151704044989, + "grad_norm": 6.743053913116455, + "learning_rate": 8.02587446094873e-05, + "loss": 0.7362, + "step": 15171 + }, + { + "epoch": 1.027982925672471, + "grad_norm": 6.415504455566406, + "learning_rate": 8.025737559038949e-05, + "loss": 0.8653, + "step": 15172 + }, + { + "epoch": 1.028050680940443, + "grad_norm": 5.755765438079834, + "learning_rate": 8.025600657129167e-05, + "loss": 0.7041, + "step": 15173 + }, + { + "epoch": 1.0281184362084153, + "grad_norm": 6.556794166564941, + "learning_rate": 8.025463755219385e-05, + "loss": 0.8511, + "step": 15174 + }, + { + "epoch": 1.0281861914763872, + "grad_norm": 5.8607025146484375, + "learning_rate": 8.025326853309603e-05, + "loss": 0.6352, + "step": 15175 + }, + { + "epoch": 1.0282539467443594, + "grad_norm": 5.547766208648682, + "learning_rate": 8.025189951399821e-05, + "loss": 0.7237, + "step": 15176 + }, + { + "epoch": 1.0283217020123314, + "grad_norm": 6.3242363929748535, + "learning_rate": 8.025053049490041e-05, + "loss": 0.7579, + "step": 15177 + }, + { + "epoch": 1.0283894572803036, + "grad_norm": 5.0050764083862305, + "learning_rate": 8.024916147580259e-05, + "loss": 0.765, + "step": 15178 + }, + { + "epoch": 1.0284572125482756, + "grad_norm": 6.55527400970459, + "learning_rate": 8.024779245670477e-05, + "loss": 0.7241, + "step": 15179 + }, + { + "epoch": 1.0285249678162478, + "grad_norm": 6.457221984863281, + "learning_rate": 8.024642343760695e-05, + "loss": 0.6609, + "step": 15180 + }, + { + "epoch": 1.0285927230842198, + "grad_norm": 6.385996341705322, + "learning_rate": 8.024505441850914e-05, + "loss": 0.8659, + "step": 15181 + }, + { + "epoch": 1.028660478352192, + "grad_norm": 6.647210121154785, + "learning_rate": 8.024368539941132e-05, + "loss": 0.7662, + "step": 15182 + }, + { + "epoch": 1.028728233620164, + "grad_norm": 5.438072204589844, + "learning_rate": 8.02423163803135e-05, + "loss": 0.5482, + "step": 15183 + }, + { + "epoch": 1.028795988888136, + "grad_norm": 7.4278998374938965, + "learning_rate": 8.024094736121568e-05, + "loss": 0.8363, + "step": 15184 + }, + { + "epoch": 1.0288637441561082, + "grad_norm": 5.1344451904296875, + "learning_rate": 8.023957834211788e-05, + "loss": 0.7989, + "step": 15185 + }, + { + "epoch": 1.0289314994240801, + "grad_norm": 4.938210487365723, + "learning_rate": 8.023820932302006e-05, + "loss": 0.6469, + "step": 15186 + }, + { + "epoch": 1.0289992546920523, + "grad_norm": 5.672119617462158, + "learning_rate": 8.023684030392224e-05, + "loss": 0.9169, + "step": 15187 + }, + { + "epoch": 1.0290670099600243, + "grad_norm": 6.35474157333374, + "learning_rate": 8.023547128482443e-05, + "loss": 0.6798, + "step": 15188 + }, + { + "epoch": 1.0291347652279965, + "grad_norm": 10.651616096496582, + "learning_rate": 8.023410226572661e-05, + "loss": 0.8043, + "step": 15189 + }, + { + "epoch": 1.0292025204959685, + "grad_norm": 5.502874851226807, + "learning_rate": 8.023273324662879e-05, + "loss": 0.8463, + "step": 15190 + }, + { + "epoch": 1.0292702757639407, + "grad_norm": 4.313845634460449, + "learning_rate": 8.023136422753099e-05, + "loss": 0.6796, + "step": 15191 + }, + { + "epoch": 1.0293380310319127, + "grad_norm": 6.374643802642822, + "learning_rate": 8.022999520843317e-05, + "loss": 0.7316, + "step": 15192 + }, + { + "epoch": 1.0294057862998849, + "grad_norm": 6.878230094909668, + "learning_rate": 8.022862618933535e-05, + "loss": 0.9329, + "step": 15193 + }, + { + "epoch": 1.0294735415678569, + "grad_norm": 5.371943950653076, + "learning_rate": 8.022725717023753e-05, + "loss": 0.5674, + "step": 15194 + }, + { + "epoch": 1.029541296835829, + "grad_norm": 5.623109340667725, + "learning_rate": 8.022588815113972e-05, + "loss": 0.7014, + "step": 15195 + }, + { + "epoch": 1.029609052103801, + "grad_norm": 6.517233848571777, + "learning_rate": 8.02245191320419e-05, + "loss": 0.8067, + "step": 15196 + }, + { + "epoch": 1.0296768073717733, + "grad_norm": 6.380861759185791, + "learning_rate": 8.022315011294408e-05, + "loss": 0.7762, + "step": 15197 + }, + { + "epoch": 1.0297445626397452, + "grad_norm": 5.307709693908691, + "learning_rate": 8.022178109384626e-05, + "loss": 0.8181, + "step": 15198 + }, + { + "epoch": 1.0298123179077174, + "grad_norm": 5.426553249359131, + "learning_rate": 8.022041207474844e-05, + "loss": 0.8538, + "step": 15199 + }, + { + "epoch": 1.0298800731756894, + "grad_norm": 5.571786880493164, + "learning_rate": 8.021904305565064e-05, + "loss": 0.6279, + "step": 15200 + }, + { + "epoch": 1.0299478284436614, + "grad_norm": 5.296565055847168, + "learning_rate": 8.021767403655282e-05, + "loss": 0.7286, + "step": 15201 + }, + { + "epoch": 1.0300155837116336, + "grad_norm": 5.196515083312988, + "learning_rate": 8.0216305017455e-05, + "loss": 0.5467, + "step": 15202 + }, + { + "epoch": 1.0300833389796056, + "grad_norm": 5.717817783355713, + "learning_rate": 8.021493599835718e-05, + "loss": 0.5466, + "step": 15203 + }, + { + "epoch": 1.0301510942475778, + "grad_norm": 4.812506675720215, + "learning_rate": 8.021356697925937e-05, + "loss": 0.4824, + "step": 15204 + }, + { + "epoch": 1.0302188495155498, + "grad_norm": 5.952337741851807, + "learning_rate": 8.021219796016155e-05, + "loss": 0.6964, + "step": 15205 + }, + { + "epoch": 1.030286604783522, + "grad_norm": 6.307968616485596, + "learning_rate": 8.021082894106373e-05, + "loss": 0.804, + "step": 15206 + }, + { + "epoch": 1.030354360051494, + "grad_norm": 6.3860626220703125, + "learning_rate": 8.020945992196591e-05, + "loss": 0.5327, + "step": 15207 + }, + { + "epoch": 1.0304221153194661, + "grad_norm": 6.184783935546875, + "learning_rate": 8.02080909028681e-05, + "loss": 0.754, + "step": 15208 + }, + { + "epoch": 1.0304898705874381, + "grad_norm": 5.628595352172852, + "learning_rate": 8.020672188377029e-05, + "loss": 0.6584, + "step": 15209 + }, + { + "epoch": 1.0305576258554103, + "grad_norm": 6.341923713684082, + "learning_rate": 8.020535286467247e-05, + "loss": 0.7252, + "step": 15210 + }, + { + "epoch": 1.0306253811233823, + "grad_norm": 7.7430853843688965, + "learning_rate": 8.020398384557465e-05, + "loss": 0.7014, + "step": 15211 + }, + { + "epoch": 1.0306931363913545, + "grad_norm": 5.762584686279297, + "learning_rate": 8.020261482647683e-05, + "loss": 0.7568, + "step": 15212 + }, + { + "epoch": 1.0307608916593265, + "grad_norm": 6.523187637329102, + "learning_rate": 8.020124580737902e-05, + "loss": 0.7841, + "step": 15213 + }, + { + "epoch": 1.0308286469272987, + "grad_norm": 6.489431858062744, + "learning_rate": 8.01998767882812e-05, + "loss": 0.6339, + "step": 15214 + }, + { + "epoch": 1.0308964021952707, + "grad_norm": 6.271862983703613, + "learning_rate": 8.019850776918338e-05, + "loss": 0.85, + "step": 15215 + }, + { + "epoch": 1.0309641574632427, + "grad_norm": 5.647708415985107, + "learning_rate": 8.019713875008556e-05, + "loss": 0.7023, + "step": 15216 + }, + { + "epoch": 1.0310319127312149, + "grad_norm": 4.607950210571289, + "learning_rate": 8.019576973098774e-05, + "loss": 0.7162, + "step": 15217 + }, + { + "epoch": 1.0310996679991868, + "grad_norm": 6.540196895599365, + "learning_rate": 8.019440071188994e-05, + "loss": 0.7398, + "step": 15218 + }, + { + "epoch": 1.031167423267159, + "grad_norm": 6.469182014465332, + "learning_rate": 8.019303169279212e-05, + "loss": 0.6353, + "step": 15219 + }, + { + "epoch": 1.031235178535131, + "grad_norm": 6.716250896453857, + "learning_rate": 8.01916626736943e-05, + "loss": 0.8495, + "step": 15220 + }, + { + "epoch": 1.0313029338031032, + "grad_norm": 5.285309791564941, + "learning_rate": 8.019029365459648e-05, + "loss": 0.5978, + "step": 15221 + }, + { + "epoch": 1.0313706890710752, + "grad_norm": 5.688589572906494, + "learning_rate": 8.018892463549867e-05, + "loss": 0.8367, + "step": 15222 + }, + { + "epoch": 1.0314384443390474, + "grad_norm": 6.630375385284424, + "learning_rate": 8.018755561640085e-05, + "loss": 0.772, + "step": 15223 + }, + { + "epoch": 1.0315061996070194, + "grad_norm": 5.191856861114502, + "learning_rate": 8.018618659730303e-05, + "loss": 0.6048, + "step": 15224 + }, + { + "epoch": 1.0315739548749916, + "grad_norm": 5.3768839836120605, + "learning_rate": 8.018481757820521e-05, + "loss": 0.5963, + "step": 15225 + }, + { + "epoch": 1.0316417101429636, + "grad_norm": 6.222872734069824, + "learning_rate": 8.01834485591074e-05, + "loss": 0.6568, + "step": 15226 + }, + { + "epoch": 1.0317094654109358, + "grad_norm": 4.519933700561523, + "learning_rate": 8.018207954000959e-05, + "loss": 0.5297, + "step": 15227 + }, + { + "epoch": 1.0317772206789078, + "grad_norm": 5.497208595275879, + "learning_rate": 8.018071052091177e-05, + "loss": 0.723, + "step": 15228 + }, + { + "epoch": 1.03184497594688, + "grad_norm": 5.850059509277344, + "learning_rate": 8.017934150181395e-05, + "loss": 0.5526, + "step": 15229 + }, + { + "epoch": 1.031912731214852, + "grad_norm": 5.479531288146973, + "learning_rate": 8.017797248271613e-05, + "loss": 0.6473, + "step": 15230 + }, + { + "epoch": 1.0319804864828241, + "grad_norm": 6.393281936645508, + "learning_rate": 8.017660346361832e-05, + "loss": 0.8833, + "step": 15231 + }, + { + "epoch": 1.0320482417507961, + "grad_norm": 5.936949253082275, + "learning_rate": 8.01752344445205e-05, + "loss": 0.96, + "step": 15232 + }, + { + "epoch": 1.032115997018768, + "grad_norm": 6.389046669006348, + "learning_rate": 8.017386542542268e-05, + "loss": 0.8469, + "step": 15233 + }, + { + "epoch": 1.0321837522867403, + "grad_norm": 5.648378849029541, + "learning_rate": 8.017249640632488e-05, + "loss": 0.5647, + "step": 15234 + }, + { + "epoch": 1.0322515075547123, + "grad_norm": 6.237347602844238, + "learning_rate": 8.017112738722706e-05, + "loss": 0.6116, + "step": 15235 + }, + { + "epoch": 1.0323192628226845, + "grad_norm": 6.752171993255615, + "learning_rate": 8.016975836812924e-05, + "loss": 0.7117, + "step": 15236 + }, + { + "epoch": 1.0323870180906565, + "grad_norm": 7.579265594482422, + "learning_rate": 8.016838934903143e-05, + "loss": 0.7308, + "step": 15237 + }, + { + "epoch": 1.0324547733586287, + "grad_norm": 5.256562232971191, + "learning_rate": 8.016702032993361e-05, + "loss": 0.6299, + "step": 15238 + }, + { + "epoch": 1.0325225286266007, + "grad_norm": 4.109334468841553, + "learning_rate": 8.016565131083579e-05, + "loss": 0.3859, + "step": 15239 + }, + { + "epoch": 1.0325902838945729, + "grad_norm": 4.160473823547363, + "learning_rate": 8.016428229173797e-05, + "loss": 0.5872, + "step": 15240 + }, + { + "epoch": 1.0326580391625448, + "grad_norm": 5.3488898277282715, + "learning_rate": 8.016291327264017e-05, + "loss": 0.6311, + "step": 15241 + }, + { + "epoch": 1.032725794430517, + "grad_norm": 10.214828491210938, + "learning_rate": 8.016154425354235e-05, + "loss": 0.8249, + "step": 15242 + }, + { + "epoch": 1.032793549698489, + "grad_norm": 6.101986885070801, + "learning_rate": 8.016017523444453e-05, + "loss": 0.7061, + "step": 15243 + }, + { + "epoch": 1.0328613049664612, + "grad_norm": 4.178615570068359, + "learning_rate": 8.015880621534671e-05, + "loss": 0.5615, + "step": 15244 + }, + { + "epoch": 1.0329290602344332, + "grad_norm": 5.862407207489014, + "learning_rate": 8.01574371962489e-05, + "loss": 0.6751, + "step": 15245 + }, + { + "epoch": 1.0329968155024054, + "grad_norm": 9.34377670288086, + "learning_rate": 8.015606817715108e-05, + "loss": 0.9418, + "step": 15246 + }, + { + "epoch": 1.0330645707703774, + "grad_norm": 5.592286109924316, + "learning_rate": 8.015469915805326e-05, + "loss": 0.8633, + "step": 15247 + }, + { + "epoch": 1.0331323260383494, + "grad_norm": 5.773770332336426, + "learning_rate": 8.015333013895544e-05, + "loss": 0.6302, + "step": 15248 + }, + { + "epoch": 1.0332000813063216, + "grad_norm": 5.281434535980225, + "learning_rate": 8.015196111985762e-05, + "loss": 0.6259, + "step": 15249 + }, + { + "epoch": 1.0332678365742936, + "grad_norm": 5.689375877380371, + "learning_rate": 8.015059210075982e-05, + "loss": 0.6735, + "step": 15250 + }, + { + "epoch": 1.0333355918422658, + "grad_norm": 6.111639976501465, + "learning_rate": 8.0149223081662e-05, + "loss": 0.8211, + "step": 15251 + }, + { + "epoch": 1.0334033471102377, + "grad_norm": 5.1067328453063965, + "learning_rate": 8.014785406256418e-05, + "loss": 0.5156, + "step": 15252 + }, + { + "epoch": 1.03347110237821, + "grad_norm": 6.3971638679504395, + "learning_rate": 8.014648504346636e-05, + "loss": 0.7773, + "step": 15253 + }, + { + "epoch": 1.033538857646182, + "grad_norm": 7.105903148651123, + "learning_rate": 8.014511602436854e-05, + "loss": 0.8546, + "step": 15254 + }, + { + "epoch": 1.0336066129141541, + "grad_norm": 5.161197185516357, + "learning_rate": 8.014374700527073e-05, + "loss": 0.7245, + "step": 15255 + }, + { + "epoch": 1.033674368182126, + "grad_norm": 5.202775001525879, + "learning_rate": 8.014237798617291e-05, + "loss": 0.7735, + "step": 15256 + }, + { + "epoch": 1.0337421234500983, + "grad_norm": 5.7077131271362305, + "learning_rate": 8.014100896707509e-05, + "loss": 0.6378, + "step": 15257 + }, + { + "epoch": 1.0338098787180703, + "grad_norm": 6.216683864593506, + "learning_rate": 8.013963994797727e-05, + "loss": 0.8665, + "step": 15258 + }, + { + "epoch": 1.0338776339860425, + "grad_norm": 6.291334629058838, + "learning_rate": 8.013827092887947e-05, + "loss": 0.6189, + "step": 15259 + }, + { + "epoch": 1.0339453892540145, + "grad_norm": 6.946064472198486, + "learning_rate": 8.013690190978165e-05, + "loss": 0.7349, + "step": 15260 + }, + { + "epoch": 1.0340131445219867, + "grad_norm": 5.636563301086426, + "learning_rate": 8.013553289068383e-05, + "loss": 0.5853, + "step": 15261 + }, + { + "epoch": 1.0340808997899587, + "grad_norm": 6.100147724151611, + "learning_rate": 8.013416387158601e-05, + "loss": 0.8702, + "step": 15262 + }, + { + "epoch": 1.0341486550579309, + "grad_norm": 6.185873985290527, + "learning_rate": 8.013279485248819e-05, + "loss": 0.6916, + "step": 15263 + }, + { + "epoch": 1.0342164103259028, + "grad_norm": 7.08696174621582, + "learning_rate": 8.013142583339038e-05, + "loss": 0.6231, + "step": 15264 + }, + { + "epoch": 1.034284165593875, + "grad_norm": 5.164997100830078, + "learning_rate": 8.013005681429256e-05, + "loss": 0.5656, + "step": 15265 + }, + { + "epoch": 1.034351920861847, + "grad_norm": 5.154972553253174, + "learning_rate": 8.012868779519474e-05, + "loss": 0.5931, + "step": 15266 + }, + { + "epoch": 1.034419676129819, + "grad_norm": 5.627068996429443, + "learning_rate": 8.012731877609692e-05, + "loss": 0.7211, + "step": 15267 + }, + { + "epoch": 1.0344874313977912, + "grad_norm": 4.955535888671875, + "learning_rate": 8.012594975699912e-05, + "loss": 0.5718, + "step": 15268 + }, + { + "epoch": 1.0345551866657632, + "grad_norm": 5.209494113922119, + "learning_rate": 8.01245807379013e-05, + "loss": 0.5789, + "step": 15269 + }, + { + "epoch": 1.0346229419337354, + "grad_norm": 6.1501312255859375, + "learning_rate": 8.012321171880348e-05, + "loss": 0.7794, + "step": 15270 + }, + { + "epoch": 1.0346906972017074, + "grad_norm": 5.943906307220459, + "learning_rate": 8.012184269970566e-05, + "loss": 0.6906, + "step": 15271 + }, + { + "epoch": 1.0347584524696796, + "grad_norm": 6.266148567199707, + "learning_rate": 8.012047368060784e-05, + "loss": 0.6289, + "step": 15272 + }, + { + "epoch": 1.0348262077376515, + "grad_norm": 5.710838794708252, + "learning_rate": 8.011910466151003e-05, + "loss": 0.8653, + "step": 15273 + }, + { + "epoch": 1.0348939630056238, + "grad_norm": 4.5793585777282715, + "learning_rate": 8.011773564241221e-05, + "loss": 0.6731, + "step": 15274 + }, + { + "epoch": 1.0349617182735957, + "grad_norm": 5.686688423156738, + "learning_rate": 8.011636662331439e-05, + "loss": 0.6466, + "step": 15275 + }, + { + "epoch": 1.035029473541568, + "grad_norm": 6.409125804901123, + "learning_rate": 8.011499760421657e-05, + "loss": 0.8089, + "step": 15276 + }, + { + "epoch": 1.03509722880954, + "grad_norm": 9.235058784484863, + "learning_rate": 8.011362858511877e-05, + "loss": 0.6794, + "step": 15277 + }, + { + "epoch": 1.0351649840775121, + "grad_norm": 5.486425876617432, + "learning_rate": 8.011225956602095e-05, + "loss": 0.6879, + "step": 15278 + }, + { + "epoch": 1.035232739345484, + "grad_norm": 5.220149040222168, + "learning_rate": 8.011089054692313e-05, + "loss": 0.5114, + "step": 15279 + }, + { + "epoch": 1.0353004946134563, + "grad_norm": 4.368312835693359, + "learning_rate": 8.010952152782532e-05, + "loss": 0.455, + "step": 15280 + }, + { + "epoch": 1.0353682498814283, + "grad_norm": 7.039102554321289, + "learning_rate": 8.01081525087275e-05, + "loss": 0.9742, + "step": 15281 + }, + { + "epoch": 1.0354360051494003, + "grad_norm": 5.032625675201416, + "learning_rate": 8.010678348962968e-05, + "loss": 0.5802, + "step": 15282 + }, + { + "epoch": 1.0355037604173725, + "grad_norm": 5.188567638397217, + "learning_rate": 8.010541447053188e-05, + "loss": 0.5383, + "step": 15283 + }, + { + "epoch": 1.0355715156853444, + "grad_norm": 6.444438934326172, + "learning_rate": 8.010404545143406e-05, + "loss": 0.6963, + "step": 15284 + }, + { + "epoch": 1.0356392709533166, + "grad_norm": 4.651839733123779, + "learning_rate": 8.010267643233624e-05, + "loss": 0.5967, + "step": 15285 + }, + { + "epoch": 1.0357070262212886, + "grad_norm": 5.508388996124268, + "learning_rate": 8.010130741323842e-05, + "loss": 0.7505, + "step": 15286 + }, + { + "epoch": 1.0357747814892608, + "grad_norm": 6.142048358917236, + "learning_rate": 8.009993839414061e-05, + "loss": 0.7186, + "step": 15287 + }, + { + "epoch": 1.0358425367572328, + "grad_norm": 5.3503899574279785, + "learning_rate": 8.009856937504279e-05, + "loss": 0.6446, + "step": 15288 + }, + { + "epoch": 1.035910292025205, + "grad_norm": 5.894726753234863, + "learning_rate": 8.009720035594497e-05, + "loss": 0.6653, + "step": 15289 + }, + { + "epoch": 1.035978047293177, + "grad_norm": 6.199258804321289, + "learning_rate": 8.009583133684715e-05, + "loss": 0.5282, + "step": 15290 + }, + { + "epoch": 1.0360458025611492, + "grad_norm": 7.289583683013916, + "learning_rate": 8.009446231774935e-05, + "loss": 0.6126, + "step": 15291 + }, + { + "epoch": 1.0361135578291212, + "grad_norm": 5.015061378479004, + "learning_rate": 8.009309329865153e-05, + "loss": 0.6272, + "step": 15292 + }, + { + "epoch": 1.0361813130970934, + "grad_norm": 8.497147560119629, + "learning_rate": 8.009172427955371e-05, + "loss": 0.7675, + "step": 15293 + }, + { + "epoch": 1.0362490683650654, + "grad_norm": 7.492831230163574, + "learning_rate": 8.009035526045589e-05, + "loss": 0.8589, + "step": 15294 + }, + { + "epoch": 1.0363168236330376, + "grad_norm": 8.964046478271484, + "learning_rate": 8.008898624135807e-05, + "loss": 0.8056, + "step": 15295 + }, + { + "epoch": 1.0363845789010095, + "grad_norm": 6.669633388519287, + "learning_rate": 8.008761722226026e-05, + "loss": 0.7112, + "step": 15296 + }, + { + "epoch": 1.0364523341689815, + "grad_norm": 6.454007148742676, + "learning_rate": 8.008624820316244e-05, + "loss": 0.6977, + "step": 15297 + }, + { + "epoch": 1.0365200894369537, + "grad_norm": 5.101663112640381, + "learning_rate": 8.008487918406462e-05, + "loss": 0.6409, + "step": 15298 + }, + { + "epoch": 1.0365878447049257, + "grad_norm": 6.301394462585449, + "learning_rate": 8.00835101649668e-05, + "loss": 0.7558, + "step": 15299 + }, + { + "epoch": 1.036655599972898, + "grad_norm": 4.543341636657715, + "learning_rate": 8.0082141145869e-05, + "loss": 0.6226, + "step": 15300 + }, + { + "epoch": 1.03672335524087, + "grad_norm": 6.267812728881836, + "learning_rate": 8.008077212677118e-05, + "loss": 0.7714, + "step": 15301 + }, + { + "epoch": 1.036791110508842, + "grad_norm": 5.134308338165283, + "learning_rate": 8.007940310767336e-05, + "loss": 0.5897, + "step": 15302 + }, + { + "epoch": 1.036858865776814, + "grad_norm": 6.004744052886963, + "learning_rate": 8.007803408857554e-05, + "loss": 0.6461, + "step": 15303 + }, + { + "epoch": 1.0369266210447863, + "grad_norm": 6.2730631828308105, + "learning_rate": 8.007666506947772e-05, + "loss": 0.7272, + "step": 15304 + }, + { + "epoch": 1.0369943763127583, + "grad_norm": 7.032886981964111, + "learning_rate": 8.007529605037991e-05, + "loss": 0.708, + "step": 15305 + }, + { + "epoch": 1.0370621315807305, + "grad_norm": 7.366089344024658, + "learning_rate": 8.007392703128209e-05, + "loss": 0.5729, + "step": 15306 + }, + { + "epoch": 1.0371298868487024, + "grad_norm": 5.599564075469971, + "learning_rate": 8.007255801218427e-05, + "loss": 0.7446, + "step": 15307 + }, + { + "epoch": 1.0371976421166746, + "grad_norm": 4.722618103027344, + "learning_rate": 8.007118899308645e-05, + "loss": 0.571, + "step": 15308 + }, + { + "epoch": 1.0372653973846466, + "grad_norm": 5.094542026519775, + "learning_rate": 8.006981997398863e-05, + "loss": 0.8405, + "step": 15309 + }, + { + "epoch": 1.0373331526526188, + "grad_norm": 6.558629989624023, + "learning_rate": 8.006845095489083e-05, + "loss": 0.6621, + "step": 15310 + }, + { + "epoch": 1.0374009079205908, + "grad_norm": 7.169748306274414, + "learning_rate": 8.006708193579301e-05, + "loss": 0.7468, + "step": 15311 + }, + { + "epoch": 1.037468663188563, + "grad_norm": 7.064498424530029, + "learning_rate": 8.006571291669519e-05, + "loss": 0.9875, + "step": 15312 + }, + { + "epoch": 1.037536418456535, + "grad_norm": 8.31985092163086, + "learning_rate": 8.006434389759737e-05, + "loss": 0.7591, + "step": 15313 + }, + { + "epoch": 1.037604173724507, + "grad_norm": 8.058642387390137, + "learning_rate": 8.006297487849956e-05, + "loss": 0.7805, + "step": 15314 + }, + { + "epoch": 1.0376719289924792, + "grad_norm": 7.940720081329346, + "learning_rate": 8.006160585940174e-05, + "loss": 0.5895, + "step": 15315 + }, + { + "epoch": 1.0377396842604512, + "grad_norm": 4.879898548126221, + "learning_rate": 8.006023684030392e-05, + "loss": 0.6924, + "step": 15316 + }, + { + "epoch": 1.0378074395284234, + "grad_norm": 4.655125617980957, + "learning_rate": 8.00588678212061e-05, + "loss": 0.6833, + "step": 15317 + }, + { + "epoch": 1.0378751947963953, + "grad_norm": 6.936232566833496, + "learning_rate": 8.005749880210828e-05, + "loss": 0.7441, + "step": 15318 + }, + { + "epoch": 1.0379429500643675, + "grad_norm": 9.707219123840332, + "learning_rate": 8.005612978301048e-05, + "loss": 0.8677, + "step": 15319 + }, + { + "epoch": 1.0380107053323395, + "grad_norm": 6.091515064239502, + "learning_rate": 8.005476076391266e-05, + "loss": 0.8092, + "step": 15320 + }, + { + "epoch": 1.0380784606003117, + "grad_norm": 8.209754943847656, + "learning_rate": 8.005339174481484e-05, + "loss": 0.5553, + "step": 15321 + }, + { + "epoch": 1.0381462158682837, + "grad_norm": 6.475118160247803, + "learning_rate": 8.005202272571702e-05, + "loss": 0.6681, + "step": 15322 + }, + { + "epoch": 1.038213971136256, + "grad_norm": 8.269380569458008, + "learning_rate": 8.005065370661921e-05, + "loss": 0.7209, + "step": 15323 + }, + { + "epoch": 1.038281726404228, + "grad_norm": 6.947020053863525, + "learning_rate": 8.004928468752139e-05, + "loss": 0.8772, + "step": 15324 + }, + { + "epoch": 1.0383494816722, + "grad_norm": 4.593833923339844, + "learning_rate": 8.004791566842357e-05, + "loss": 0.5989, + "step": 15325 + }, + { + "epoch": 1.038417236940172, + "grad_norm": 5.82660436630249, + "learning_rate": 8.004654664932577e-05, + "loss": 0.7262, + "step": 15326 + }, + { + "epoch": 1.0384849922081443, + "grad_norm": 4.537695407867432, + "learning_rate": 8.004517763022795e-05, + "loss": 0.4765, + "step": 15327 + }, + { + "epoch": 1.0385527474761163, + "grad_norm": 7.33888053894043, + "learning_rate": 8.004380861113013e-05, + "loss": 0.6138, + "step": 15328 + }, + { + "epoch": 1.0386205027440885, + "grad_norm": 4.84438943862915, + "learning_rate": 8.004243959203232e-05, + "loss": 0.7464, + "step": 15329 + }, + { + "epoch": 1.0386882580120604, + "grad_norm": 8.118237495422363, + "learning_rate": 8.00410705729345e-05, + "loss": 0.4538, + "step": 15330 + }, + { + "epoch": 1.0387560132800324, + "grad_norm": 6.666751384735107, + "learning_rate": 8.003970155383668e-05, + "loss": 0.6187, + "step": 15331 + }, + { + "epoch": 1.0388237685480046, + "grad_norm": 6.705092906951904, + "learning_rate": 8.003833253473886e-05, + "loss": 0.8744, + "step": 15332 + }, + { + "epoch": 1.0388915238159766, + "grad_norm": 6.252575874328613, + "learning_rate": 8.003696351564106e-05, + "loss": 0.654, + "step": 15333 + }, + { + "epoch": 1.0389592790839488, + "grad_norm": 4.896333694458008, + "learning_rate": 8.003559449654324e-05, + "loss": 0.6385, + "step": 15334 + }, + { + "epoch": 1.0390270343519208, + "grad_norm": 6.1699957847595215, + "learning_rate": 8.003422547744542e-05, + "loss": 0.7625, + "step": 15335 + }, + { + "epoch": 1.039094789619893, + "grad_norm": 5.307643413543701, + "learning_rate": 8.00328564583476e-05, + "loss": 0.7154, + "step": 15336 + }, + { + "epoch": 1.039162544887865, + "grad_norm": 5.892039775848389, + "learning_rate": 8.003148743924979e-05, + "loss": 0.8284, + "step": 15337 + }, + { + "epoch": 1.0392303001558372, + "grad_norm": 5.1963372230529785, + "learning_rate": 8.003011842015197e-05, + "loss": 0.6057, + "step": 15338 + }, + { + "epoch": 1.0392980554238092, + "grad_norm": 5.593263626098633, + "learning_rate": 8.002874940105415e-05, + "loss": 0.7043, + "step": 15339 + }, + { + "epoch": 1.0393658106917814, + "grad_norm": 4.905797481536865, + "learning_rate": 8.002738038195633e-05, + "loss": 0.6327, + "step": 15340 + }, + { + "epoch": 1.0394335659597533, + "grad_norm": 7.234451770782471, + "learning_rate": 8.002601136285851e-05, + "loss": 0.5896, + "step": 15341 + }, + { + "epoch": 1.0395013212277255, + "grad_norm": 6.471969127655029, + "learning_rate": 8.00246423437607e-05, + "loss": 0.8219, + "step": 15342 + }, + { + "epoch": 1.0395690764956975, + "grad_norm": 6.145739555358887, + "learning_rate": 8.002327332466289e-05, + "loss": 0.6992, + "step": 15343 + }, + { + "epoch": 1.0396368317636697, + "grad_norm": 4.9458842277526855, + "learning_rate": 8.002190430556507e-05, + "loss": 0.5296, + "step": 15344 + }, + { + "epoch": 1.0397045870316417, + "grad_norm": 6.5056562423706055, + "learning_rate": 8.002053528646725e-05, + "loss": 0.7327, + "step": 15345 + }, + { + "epoch": 1.0397723422996137, + "grad_norm": 5.6960320472717285, + "learning_rate": 8.001916626736944e-05, + "loss": 0.5978, + "step": 15346 + }, + { + "epoch": 1.0398400975675859, + "grad_norm": 5.816061496734619, + "learning_rate": 8.001779724827162e-05, + "loss": 0.677, + "step": 15347 + }, + { + "epoch": 1.0399078528355579, + "grad_norm": 5.3048577308654785, + "learning_rate": 8.00164282291738e-05, + "loss": 0.5438, + "step": 15348 + }, + { + "epoch": 1.03997560810353, + "grad_norm": 4.549619197845459, + "learning_rate": 8.001505921007598e-05, + "loss": 0.3974, + "step": 15349 + }, + { + "epoch": 1.040043363371502, + "grad_norm": 7.864236831665039, + "learning_rate": 8.001369019097816e-05, + "loss": 0.7221, + "step": 15350 + }, + { + "epoch": 1.0401111186394743, + "grad_norm": 6.662132740020752, + "learning_rate": 8.001232117188036e-05, + "loss": 0.7352, + "step": 15351 + }, + { + "epoch": 1.0401788739074462, + "grad_norm": 5.497745037078857, + "learning_rate": 8.001095215278254e-05, + "loss": 0.6255, + "step": 15352 + }, + { + "epoch": 1.0402466291754184, + "grad_norm": 10.039993286132812, + "learning_rate": 8.000958313368472e-05, + "loss": 0.5771, + "step": 15353 + }, + { + "epoch": 1.0403143844433904, + "grad_norm": 5.307958126068115, + "learning_rate": 8.00082141145869e-05, + "loss": 0.5976, + "step": 15354 + }, + { + "epoch": 1.0403821397113626, + "grad_norm": 5.4257917404174805, + "learning_rate": 8.000684509548909e-05, + "loss": 0.7848, + "step": 15355 + }, + { + "epoch": 1.0404498949793346, + "grad_norm": 7.313519477844238, + "learning_rate": 8.000547607639127e-05, + "loss": 0.9212, + "step": 15356 + }, + { + "epoch": 1.0405176502473068, + "grad_norm": 6.327539443969727, + "learning_rate": 8.000410705729345e-05, + "loss": 0.6187, + "step": 15357 + }, + { + "epoch": 1.0405854055152788, + "grad_norm": 5.164039134979248, + "learning_rate": 8.000273803819563e-05, + "loss": 0.5013, + "step": 15358 + }, + { + "epoch": 1.040653160783251, + "grad_norm": 5.942699909210205, + "learning_rate": 8.000136901909781e-05, + "loss": 0.6973, + "step": 15359 + }, + { + "epoch": 1.040720916051223, + "grad_norm": 4.6758527755737305, + "learning_rate": 8e-05, + "loss": 0.6294, + "step": 15360 + }, + { + "epoch": 1.0407886713191952, + "grad_norm": 7.673499584197998, + "learning_rate": 7.999863098090219e-05, + "loss": 0.4294, + "step": 15361 + }, + { + "epoch": 1.0408564265871671, + "grad_norm": 6.975398063659668, + "learning_rate": 7.999726196180437e-05, + "loss": 1.0246, + "step": 15362 + }, + { + "epoch": 1.0409241818551391, + "grad_norm": 5.054098606109619, + "learning_rate": 7.999589294270655e-05, + "loss": 0.6756, + "step": 15363 + }, + { + "epoch": 1.0409919371231113, + "grad_norm": 5.625120639801025, + "learning_rate": 7.999452392360873e-05, + "loss": 0.6515, + "step": 15364 + }, + { + "epoch": 1.0410596923910833, + "grad_norm": 5.1921820640563965, + "learning_rate": 7.999315490451092e-05, + "loss": 0.707, + "step": 15365 + }, + { + "epoch": 1.0411274476590555, + "grad_norm": 11.847484588623047, + "learning_rate": 7.99917858854131e-05, + "loss": 0.641, + "step": 15366 + }, + { + "epoch": 1.0411952029270275, + "grad_norm": 5.949629783630371, + "learning_rate": 7.999041686631528e-05, + "loss": 0.6544, + "step": 15367 + }, + { + "epoch": 1.0412629581949997, + "grad_norm": 5.2862138748168945, + "learning_rate": 7.998904784721746e-05, + "loss": 0.5595, + "step": 15368 + }, + { + "epoch": 1.0413307134629717, + "grad_norm": 6.709194660186768, + "learning_rate": 7.998767882811966e-05, + "loss": 0.6076, + "step": 15369 + }, + { + "epoch": 1.0413984687309439, + "grad_norm": 5.2147088050842285, + "learning_rate": 7.998630980902184e-05, + "loss": 0.3128, + "step": 15370 + }, + { + "epoch": 1.0414662239989159, + "grad_norm": 6.64967155456543, + "learning_rate": 7.998494078992402e-05, + "loss": 0.6488, + "step": 15371 + }, + { + "epoch": 1.041533979266888, + "grad_norm": 6.93261194229126, + "learning_rate": 7.99835717708262e-05, + "loss": 0.802, + "step": 15372 + }, + { + "epoch": 1.04160173453486, + "grad_norm": 4.657343864440918, + "learning_rate": 7.998220275172839e-05, + "loss": 0.6395, + "step": 15373 + }, + { + "epoch": 1.0416694898028322, + "grad_norm": 7.187932968139648, + "learning_rate": 7.998083373263057e-05, + "loss": 0.6507, + "step": 15374 + }, + { + "epoch": 1.0417372450708042, + "grad_norm": 7.809886455535889, + "learning_rate": 7.997946471353275e-05, + "loss": 0.72, + "step": 15375 + }, + { + "epoch": 1.0418050003387764, + "grad_norm": 7.339730262756348, + "learning_rate": 7.997809569443495e-05, + "loss": 0.7263, + "step": 15376 + }, + { + "epoch": 1.0418727556067484, + "grad_norm": 6.153295516967773, + "learning_rate": 7.997672667533713e-05, + "loss": 0.6731, + "step": 15377 + }, + { + "epoch": 1.0419405108747206, + "grad_norm": 5.918948650360107, + "learning_rate": 7.997535765623931e-05, + "loss": 0.6788, + "step": 15378 + }, + { + "epoch": 1.0420082661426926, + "grad_norm": 7.921314716339111, + "learning_rate": 7.99739886371415e-05, + "loss": 0.7971, + "step": 15379 + }, + { + "epoch": 1.0420760214106646, + "grad_norm": 5.347783088684082, + "learning_rate": 7.997261961804368e-05, + "loss": 0.5056, + "step": 15380 + }, + { + "epoch": 1.0421437766786368, + "grad_norm": 4.612505912780762, + "learning_rate": 7.997125059894586e-05, + "loss": 0.6825, + "step": 15381 + }, + { + "epoch": 1.0422115319466088, + "grad_norm": 8.001241683959961, + "learning_rate": 7.996988157984804e-05, + "loss": 0.5893, + "step": 15382 + }, + { + "epoch": 1.042279287214581, + "grad_norm": 5.214502811431885, + "learning_rate": 7.996851256075024e-05, + "loss": 0.6628, + "step": 15383 + }, + { + "epoch": 1.042347042482553, + "grad_norm": 8.609155654907227, + "learning_rate": 7.996714354165242e-05, + "loss": 0.9572, + "step": 15384 + }, + { + "epoch": 1.0424147977505251, + "grad_norm": 4.416464328765869, + "learning_rate": 7.99657745225546e-05, + "loss": 0.6881, + "step": 15385 + }, + { + "epoch": 1.0424825530184971, + "grad_norm": 6.996601104736328, + "learning_rate": 7.996440550345678e-05, + "loss": 0.8354, + "step": 15386 + }, + { + "epoch": 1.0425503082864693, + "grad_norm": 7.7891035079956055, + "learning_rate": 7.996303648435896e-05, + "loss": 0.5579, + "step": 15387 + }, + { + "epoch": 1.0426180635544413, + "grad_norm": 5.543262004852295, + "learning_rate": 7.996166746526115e-05, + "loss": 0.7516, + "step": 15388 + }, + { + "epoch": 1.0426858188224135, + "grad_norm": 5.837127685546875, + "learning_rate": 7.996029844616333e-05, + "loss": 0.7111, + "step": 15389 + }, + { + "epoch": 1.0427535740903855, + "grad_norm": 7.789334774017334, + "learning_rate": 7.995892942706551e-05, + "loss": 0.7419, + "step": 15390 + }, + { + "epoch": 1.0428213293583577, + "grad_norm": 5.3756022453308105, + "learning_rate": 7.995756040796769e-05, + "loss": 0.7475, + "step": 15391 + }, + { + "epoch": 1.0428890846263297, + "grad_norm": 4.391982555389404, + "learning_rate": 7.995619138886989e-05, + "loss": 0.6616, + "step": 15392 + }, + { + "epoch": 1.0429568398943019, + "grad_norm": 5.609358787536621, + "learning_rate": 7.995482236977207e-05, + "loss": 0.817, + "step": 15393 + }, + { + "epoch": 1.0430245951622739, + "grad_norm": 4.2053046226501465, + "learning_rate": 7.995345335067425e-05, + "loss": 0.5884, + "step": 15394 + }, + { + "epoch": 1.0430923504302458, + "grad_norm": 7.365318775177002, + "learning_rate": 7.995208433157643e-05, + "loss": 0.7841, + "step": 15395 + }, + { + "epoch": 1.043160105698218, + "grad_norm": 5.268418312072754, + "learning_rate": 7.995071531247861e-05, + "loss": 0.7165, + "step": 15396 + }, + { + "epoch": 1.04322786096619, + "grad_norm": 4.734410762786865, + "learning_rate": 7.99493462933808e-05, + "loss": 0.5459, + "step": 15397 + }, + { + "epoch": 1.0432956162341622, + "grad_norm": 6.027469635009766, + "learning_rate": 7.994797727428298e-05, + "loss": 0.7033, + "step": 15398 + }, + { + "epoch": 1.0433633715021342, + "grad_norm": 5.068356037139893, + "learning_rate": 7.994660825518516e-05, + "loss": 0.7333, + "step": 15399 + }, + { + "epoch": 1.0434311267701064, + "grad_norm": 5.465489387512207, + "learning_rate": 7.994523923608734e-05, + "loss": 0.6387, + "step": 15400 + }, + { + "epoch": 1.0434988820380784, + "grad_norm": 6.087520122528076, + "learning_rate": 7.994387021698954e-05, + "loss": 0.6957, + "step": 15401 + }, + { + "epoch": 1.0435666373060506, + "grad_norm": 6.731315612792969, + "learning_rate": 7.994250119789172e-05, + "loss": 0.6397, + "step": 15402 + }, + { + "epoch": 1.0436343925740226, + "grad_norm": 5.57260274887085, + "learning_rate": 7.99411321787939e-05, + "loss": 0.6853, + "step": 15403 + }, + { + "epoch": 1.0437021478419948, + "grad_norm": 5.95422887802124, + "learning_rate": 7.993976315969608e-05, + "loss": 0.6726, + "step": 15404 + }, + { + "epoch": 1.0437699031099668, + "grad_norm": 5.570609092712402, + "learning_rate": 7.993839414059826e-05, + "loss": 0.7738, + "step": 15405 + }, + { + "epoch": 1.043837658377939, + "grad_norm": 5.522849082946777, + "learning_rate": 7.993702512150045e-05, + "loss": 0.6653, + "step": 15406 + }, + { + "epoch": 1.043905413645911, + "grad_norm": 5.089807987213135, + "learning_rate": 7.993565610240263e-05, + "loss": 0.5422, + "step": 15407 + }, + { + "epoch": 1.0439731689138831, + "grad_norm": 5.952882766723633, + "learning_rate": 7.993428708330481e-05, + "loss": 0.714, + "step": 15408 + }, + { + "epoch": 1.0440409241818551, + "grad_norm": 5.655484199523926, + "learning_rate": 7.993291806420699e-05, + "loss": 0.461, + "step": 15409 + }, + { + "epoch": 1.0441086794498273, + "grad_norm": 6.936872959136963, + "learning_rate": 7.993154904510919e-05, + "loss": 0.6569, + "step": 15410 + }, + { + "epoch": 1.0441764347177993, + "grad_norm": 6.010931968688965, + "learning_rate": 7.993018002601137e-05, + "loss": 0.7323, + "step": 15411 + }, + { + "epoch": 1.0442441899857713, + "grad_norm": 7.187067985534668, + "learning_rate": 7.992881100691355e-05, + "loss": 0.9217, + "step": 15412 + }, + { + "epoch": 1.0443119452537435, + "grad_norm": 5.070352554321289, + "learning_rate": 7.992744198781573e-05, + "loss": 0.6433, + "step": 15413 + }, + { + "epoch": 1.0443797005217155, + "grad_norm": 6.546175479888916, + "learning_rate": 7.992607296871791e-05, + "loss": 0.671, + "step": 15414 + }, + { + "epoch": 1.0444474557896877, + "grad_norm": 6.2877349853515625, + "learning_rate": 7.99247039496201e-05, + "loss": 0.8915, + "step": 15415 + }, + { + "epoch": 1.0445152110576597, + "grad_norm": 5.39522647857666, + "learning_rate": 7.992333493052228e-05, + "loss": 0.7388, + "step": 15416 + }, + { + "epoch": 1.0445829663256319, + "grad_norm": 6.453690052032471, + "learning_rate": 7.992196591142446e-05, + "loss": 0.7892, + "step": 15417 + }, + { + "epoch": 1.0446507215936038, + "grad_norm": 6.763200759887695, + "learning_rate": 7.992059689232664e-05, + "loss": 0.6219, + "step": 15418 + }, + { + "epoch": 1.044718476861576, + "grad_norm": 5.610644817352295, + "learning_rate": 7.991922787322884e-05, + "loss": 0.7382, + "step": 15419 + }, + { + "epoch": 1.044786232129548, + "grad_norm": 6.314178466796875, + "learning_rate": 7.991785885413102e-05, + "loss": 0.7147, + "step": 15420 + }, + { + "epoch": 1.0448539873975202, + "grad_norm": 5.987586498260498, + "learning_rate": 7.99164898350332e-05, + "loss": 0.537, + "step": 15421 + }, + { + "epoch": 1.0449217426654922, + "grad_norm": 7.661067008972168, + "learning_rate": 7.991512081593539e-05, + "loss": 0.9005, + "step": 15422 + }, + { + "epoch": 1.0449894979334644, + "grad_norm": 5.166024684906006, + "learning_rate": 7.991375179683757e-05, + "loss": 0.755, + "step": 15423 + }, + { + "epoch": 1.0450572532014364, + "grad_norm": 6.560528755187988, + "learning_rate": 7.991238277773975e-05, + "loss": 0.7315, + "step": 15424 + }, + { + "epoch": 1.0451250084694086, + "grad_norm": 6.162966728210449, + "learning_rate": 7.991101375864195e-05, + "loss": 0.5871, + "step": 15425 + }, + { + "epoch": 1.0451927637373806, + "grad_norm": 6.7889628410339355, + "learning_rate": 7.990964473954413e-05, + "loss": 0.7381, + "step": 15426 + }, + { + "epoch": 1.0452605190053528, + "grad_norm": 5.342012405395508, + "learning_rate": 7.99082757204463e-05, + "loss": 0.7028, + "step": 15427 + }, + { + "epoch": 1.0453282742733248, + "grad_norm": 5.235086441040039, + "learning_rate": 7.990690670134849e-05, + "loss": 0.5145, + "step": 15428 + }, + { + "epoch": 1.0453960295412967, + "grad_norm": 5.529267311096191, + "learning_rate": 7.990553768225068e-05, + "loss": 0.5921, + "step": 15429 + }, + { + "epoch": 1.045463784809269, + "grad_norm": 5.2520880699157715, + "learning_rate": 7.990416866315286e-05, + "loss": 0.7905, + "step": 15430 + }, + { + "epoch": 1.045531540077241, + "grad_norm": 5.065273761749268, + "learning_rate": 7.990279964405504e-05, + "loss": 0.6968, + "step": 15431 + }, + { + "epoch": 1.0455992953452131, + "grad_norm": 5.409516334533691, + "learning_rate": 7.990143062495722e-05, + "loss": 0.6831, + "step": 15432 + }, + { + "epoch": 1.045667050613185, + "grad_norm": 5.545697212219238, + "learning_rate": 7.990006160585942e-05, + "loss": 0.6522, + "step": 15433 + }, + { + "epoch": 1.0457348058811573, + "grad_norm": 5.714067459106445, + "learning_rate": 7.98986925867616e-05, + "loss": 0.5569, + "step": 15434 + }, + { + "epoch": 1.0458025611491293, + "grad_norm": 7.723472595214844, + "learning_rate": 7.989732356766378e-05, + "loss": 0.6271, + "step": 15435 + }, + { + "epoch": 1.0458703164171015, + "grad_norm": 7.13986349105835, + "learning_rate": 7.989595454856596e-05, + "loss": 0.8158, + "step": 15436 + }, + { + "epoch": 1.0459380716850735, + "grad_norm": 6.393859386444092, + "learning_rate": 7.989458552946814e-05, + "loss": 0.575, + "step": 15437 + }, + { + "epoch": 1.0460058269530457, + "grad_norm": 8.466290473937988, + "learning_rate": 7.989321651037033e-05, + "loss": 0.6657, + "step": 15438 + }, + { + "epoch": 1.0460735822210177, + "grad_norm": 4.97887659072876, + "learning_rate": 7.989184749127251e-05, + "loss": 0.6869, + "step": 15439 + }, + { + "epoch": 1.0461413374889899, + "grad_norm": 6.120355129241943, + "learning_rate": 7.989047847217469e-05, + "loss": 0.7158, + "step": 15440 + }, + { + "epoch": 1.0462090927569618, + "grad_norm": 5.353936672210693, + "learning_rate": 7.988910945307687e-05, + "loss": 0.6081, + "step": 15441 + }, + { + "epoch": 1.046276848024934, + "grad_norm": 5.116786956787109, + "learning_rate": 7.988774043397905e-05, + "loss": 0.725, + "step": 15442 + }, + { + "epoch": 1.046344603292906, + "grad_norm": 5.482798099517822, + "learning_rate": 7.988637141488125e-05, + "loss": 0.7154, + "step": 15443 + }, + { + "epoch": 1.046412358560878, + "grad_norm": 6.663843631744385, + "learning_rate": 7.988500239578343e-05, + "loss": 0.7235, + "step": 15444 + }, + { + "epoch": 1.0464801138288502, + "grad_norm": 5.938141822814941, + "learning_rate": 7.98836333766856e-05, + "loss": 0.5438, + "step": 15445 + }, + { + "epoch": 1.0465478690968222, + "grad_norm": 4.406172752380371, + "learning_rate": 7.988226435758779e-05, + "loss": 0.6383, + "step": 15446 + }, + { + "epoch": 1.0466156243647944, + "grad_norm": 4.991530895233154, + "learning_rate": 7.988089533848998e-05, + "loss": 0.6158, + "step": 15447 + }, + { + "epoch": 1.0466833796327664, + "grad_norm": 5.490728378295898, + "learning_rate": 7.987952631939216e-05, + "loss": 0.6724, + "step": 15448 + }, + { + "epoch": 1.0467511349007386, + "grad_norm": 6.0448384284973145, + "learning_rate": 7.987815730029434e-05, + "loss": 0.5423, + "step": 15449 + }, + { + "epoch": 1.0468188901687105, + "grad_norm": 5.0079731941223145, + "learning_rate": 7.987678828119652e-05, + "loss": 0.7416, + "step": 15450 + }, + { + "epoch": 1.0468866454366828, + "grad_norm": 6.357697486877441, + "learning_rate": 7.98754192620987e-05, + "loss": 0.5595, + "step": 15451 + }, + { + "epoch": 1.0469544007046547, + "grad_norm": 5.9440226554870605, + "learning_rate": 7.98740502430009e-05, + "loss": 0.6615, + "step": 15452 + }, + { + "epoch": 1.047022155972627, + "grad_norm": 5.571059226989746, + "learning_rate": 7.987268122390308e-05, + "loss": 0.6681, + "step": 15453 + }, + { + "epoch": 1.047089911240599, + "grad_norm": 5.801420211791992, + "learning_rate": 7.987131220480526e-05, + "loss": 0.6817, + "step": 15454 + }, + { + "epoch": 1.0471576665085711, + "grad_norm": 5.256460666656494, + "learning_rate": 7.986994318570744e-05, + "loss": 0.5682, + "step": 15455 + }, + { + "epoch": 1.047225421776543, + "grad_norm": 5.437204360961914, + "learning_rate": 7.986857416660963e-05, + "loss": 0.8709, + "step": 15456 + }, + { + "epoch": 1.0472931770445153, + "grad_norm": 4.7969651222229, + "learning_rate": 7.986720514751181e-05, + "loss": 0.5018, + "step": 15457 + }, + { + "epoch": 1.0473609323124873, + "grad_norm": 8.146381378173828, + "learning_rate": 7.986583612841399e-05, + "loss": 0.6968, + "step": 15458 + }, + { + "epoch": 1.0474286875804595, + "grad_norm": 7.133126258850098, + "learning_rate": 7.986446710931617e-05, + "loss": 0.7164, + "step": 15459 + }, + { + "epoch": 1.0474964428484315, + "grad_norm": 8.4725980758667, + "learning_rate": 7.986309809021835e-05, + "loss": 0.543, + "step": 15460 + }, + { + "epoch": 1.0475641981164034, + "grad_norm": 6.91729211807251, + "learning_rate": 7.986172907112055e-05, + "loss": 0.4047, + "step": 15461 + }, + { + "epoch": 1.0476319533843756, + "grad_norm": 5.977921009063721, + "learning_rate": 7.986036005202273e-05, + "loss": 0.4638, + "step": 15462 + }, + { + "epoch": 1.0476997086523476, + "grad_norm": 5.780356407165527, + "learning_rate": 7.985899103292491e-05, + "loss": 0.6565, + "step": 15463 + }, + { + "epoch": 1.0477674639203198, + "grad_norm": 5.508984088897705, + "learning_rate": 7.985762201382709e-05, + "loss": 0.6388, + "step": 15464 + }, + { + "epoch": 1.0478352191882918, + "grad_norm": 7.178645610809326, + "learning_rate": 7.985625299472928e-05, + "loss": 0.6638, + "step": 15465 + }, + { + "epoch": 1.047902974456264, + "grad_norm": 5.548744201660156, + "learning_rate": 7.985488397563146e-05, + "loss": 0.5982, + "step": 15466 + }, + { + "epoch": 1.047970729724236, + "grad_norm": 6.19132661819458, + "learning_rate": 7.985351495653364e-05, + "loss": 0.6313, + "step": 15467 + }, + { + "epoch": 1.0480384849922082, + "grad_norm": 5.953742504119873, + "learning_rate": 7.985214593743584e-05, + "loss": 0.9055, + "step": 15468 + }, + { + "epoch": 1.0481062402601802, + "grad_norm": 6.040510177612305, + "learning_rate": 7.985077691833802e-05, + "loss": 0.8007, + "step": 15469 + }, + { + "epoch": 1.0481739955281524, + "grad_norm": 6.583937168121338, + "learning_rate": 7.98494078992402e-05, + "loss": 0.5396, + "step": 15470 + }, + { + "epoch": 1.0482417507961244, + "grad_norm": 6.282310485839844, + "learning_rate": 7.984803888014239e-05, + "loss": 0.7954, + "step": 15471 + }, + { + "epoch": 1.0483095060640966, + "grad_norm": 5.081226348876953, + "learning_rate": 7.984666986104457e-05, + "loss": 0.753, + "step": 15472 + }, + { + "epoch": 1.0483772613320685, + "grad_norm": 7.40177583694458, + "learning_rate": 7.984530084194675e-05, + "loss": 0.8124, + "step": 15473 + }, + { + "epoch": 1.0484450166000407, + "grad_norm": 5.868223667144775, + "learning_rate": 7.984393182284893e-05, + "loss": 0.651, + "step": 15474 + }, + { + "epoch": 1.0485127718680127, + "grad_norm": 4.443064212799072, + "learning_rate": 7.984256280375113e-05, + "loss": 0.5547, + "step": 15475 + }, + { + "epoch": 1.048580527135985, + "grad_norm": 6.508821964263916, + "learning_rate": 7.98411937846533e-05, + "loss": 0.8823, + "step": 15476 + }, + { + "epoch": 1.048648282403957, + "grad_norm": 8.762590408325195, + "learning_rate": 7.983982476555549e-05, + "loss": 0.7839, + "step": 15477 + }, + { + "epoch": 1.048716037671929, + "grad_norm": 6.139303684234619, + "learning_rate": 7.983845574645767e-05, + "loss": 0.6002, + "step": 15478 + }, + { + "epoch": 1.048783792939901, + "grad_norm": 5.247832775115967, + "learning_rate": 7.983708672735986e-05, + "loss": 0.6179, + "step": 15479 + }, + { + "epoch": 1.048851548207873, + "grad_norm": 6.066807270050049, + "learning_rate": 7.983571770826204e-05, + "loss": 0.585, + "step": 15480 + }, + { + "epoch": 1.0489193034758453, + "grad_norm": 4.7573747634887695, + "learning_rate": 7.983434868916422e-05, + "loss": 0.6707, + "step": 15481 + }, + { + "epoch": 1.0489870587438173, + "grad_norm": 7.490664958953857, + "learning_rate": 7.98329796700664e-05, + "loss": 0.7973, + "step": 15482 + }, + { + "epoch": 1.0490548140117895, + "grad_norm": 4.913379669189453, + "learning_rate": 7.983161065096858e-05, + "loss": 0.6074, + "step": 15483 + }, + { + "epoch": 1.0491225692797614, + "grad_norm": 5.44326639175415, + "learning_rate": 7.983024163187078e-05, + "loss": 0.5872, + "step": 15484 + }, + { + "epoch": 1.0491903245477336, + "grad_norm": 6.307222843170166, + "learning_rate": 7.982887261277296e-05, + "loss": 0.7191, + "step": 15485 + }, + { + "epoch": 1.0492580798157056, + "grad_norm": 7.862999439239502, + "learning_rate": 7.982750359367514e-05, + "loss": 0.6033, + "step": 15486 + }, + { + "epoch": 1.0493258350836778, + "grad_norm": 7.700031757354736, + "learning_rate": 7.982613457457732e-05, + "loss": 0.8108, + "step": 15487 + }, + { + "epoch": 1.0493935903516498, + "grad_norm": 4.863072872161865, + "learning_rate": 7.982476555547951e-05, + "loss": 0.6379, + "step": 15488 + }, + { + "epoch": 1.049461345619622, + "grad_norm": 6.879638671875, + "learning_rate": 7.982339653638169e-05, + "loss": 0.6425, + "step": 15489 + }, + { + "epoch": 1.049529100887594, + "grad_norm": 5.878018379211426, + "learning_rate": 7.982202751728387e-05, + "loss": 0.62, + "step": 15490 + }, + { + "epoch": 1.0495968561555662, + "grad_norm": 5.8812055587768555, + "learning_rate": 7.982065849818605e-05, + "loss": 0.5707, + "step": 15491 + }, + { + "epoch": 1.0496646114235382, + "grad_norm": 5.584284782409668, + "learning_rate": 7.981928947908823e-05, + "loss": 0.5941, + "step": 15492 + }, + { + "epoch": 1.0497323666915102, + "grad_norm": 11.247712135314941, + "learning_rate": 7.981792045999043e-05, + "loss": 0.9765, + "step": 15493 + }, + { + "epoch": 1.0498001219594824, + "grad_norm": 5.628141403198242, + "learning_rate": 7.98165514408926e-05, + "loss": 0.5359, + "step": 15494 + }, + { + "epoch": 1.0498678772274543, + "grad_norm": 6.264639377593994, + "learning_rate": 7.981518242179479e-05, + "loss": 0.7608, + "step": 15495 + }, + { + "epoch": 1.0499356324954265, + "grad_norm": 7.300719261169434, + "learning_rate": 7.981381340269697e-05, + "loss": 0.7166, + "step": 15496 + }, + { + "epoch": 1.0500033877633985, + "grad_norm": 5.170079231262207, + "learning_rate": 7.981244438359915e-05, + "loss": 0.5827, + "step": 15497 + }, + { + "epoch": 1.0500711430313707, + "grad_norm": 6.566158294677734, + "learning_rate": 7.981107536450134e-05, + "loss": 0.6063, + "step": 15498 + }, + { + "epoch": 1.0501388982993427, + "grad_norm": 7.797521114349365, + "learning_rate": 7.980970634540352e-05, + "loss": 0.4916, + "step": 15499 + }, + { + "epoch": 1.050206653567315, + "grad_norm": 6.960193157196045, + "learning_rate": 7.98083373263057e-05, + "loss": 0.7103, + "step": 15500 + }, + { + "epoch": 1.0502744088352869, + "grad_norm": 7.210864543914795, + "learning_rate": 7.980696830720788e-05, + "loss": 0.7338, + "step": 15501 + }, + { + "epoch": 1.050342164103259, + "grad_norm": 7.08119535446167, + "learning_rate": 7.980559928811008e-05, + "loss": 0.6056, + "step": 15502 + }, + { + "epoch": 1.050409919371231, + "grad_norm": 6.606025695800781, + "learning_rate": 7.980423026901226e-05, + "loss": 0.7785, + "step": 15503 + }, + { + "epoch": 1.0504776746392033, + "grad_norm": 7.610130786895752, + "learning_rate": 7.980286124991444e-05, + "loss": 0.7014, + "step": 15504 + }, + { + "epoch": 1.0505454299071753, + "grad_norm": 6.512502670288086, + "learning_rate": 7.980149223081662e-05, + "loss": 0.606, + "step": 15505 + }, + { + "epoch": 1.0506131851751475, + "grad_norm": 6.440487861633301, + "learning_rate": 7.98001232117188e-05, + "loss": 0.6465, + "step": 15506 + }, + { + "epoch": 1.0506809404431194, + "grad_norm": 6.003171920776367, + "learning_rate": 7.979875419262099e-05, + "loss": 0.5331, + "step": 15507 + }, + { + "epoch": 1.0507486957110916, + "grad_norm": 5.904346466064453, + "learning_rate": 7.979738517352317e-05, + "loss": 0.824, + "step": 15508 + }, + { + "epoch": 1.0508164509790636, + "grad_norm": 5.533935070037842, + "learning_rate": 7.979601615442535e-05, + "loss": 0.8511, + "step": 15509 + }, + { + "epoch": 1.0508842062470356, + "grad_norm": 6.289936065673828, + "learning_rate": 7.979464713532753e-05, + "loss": 0.7926, + "step": 15510 + }, + { + "epoch": 1.0509519615150078, + "grad_norm": 5.81295919418335, + "learning_rate": 7.979327811622973e-05, + "loss": 0.9554, + "step": 15511 + }, + { + "epoch": 1.0510197167829798, + "grad_norm": 5.214400768280029, + "learning_rate": 7.97919090971319e-05, + "loss": 0.9091, + "step": 15512 + }, + { + "epoch": 1.051087472050952, + "grad_norm": 7.468234539031982, + "learning_rate": 7.979054007803409e-05, + "loss": 0.593, + "step": 15513 + }, + { + "epoch": 1.051155227318924, + "grad_norm": 6.6748366355896, + "learning_rate": 7.978917105893628e-05, + "loss": 0.9312, + "step": 15514 + }, + { + "epoch": 1.0512229825868962, + "grad_norm": 7.021498680114746, + "learning_rate": 7.978780203983846e-05, + "loss": 0.5446, + "step": 15515 + }, + { + "epoch": 1.0512907378548682, + "grad_norm": 7.151730060577393, + "learning_rate": 7.978643302074064e-05, + "loss": 0.6672, + "step": 15516 + }, + { + "epoch": 1.0513584931228404, + "grad_norm": 8.595193862915039, + "learning_rate": 7.978506400164284e-05, + "loss": 0.8893, + "step": 15517 + }, + { + "epoch": 1.0514262483908123, + "grad_norm": 6.063235282897949, + "learning_rate": 7.978369498254502e-05, + "loss": 0.5404, + "step": 15518 + }, + { + "epoch": 1.0514940036587845, + "grad_norm": 6.944025039672852, + "learning_rate": 7.97823259634472e-05, + "loss": 0.4193, + "step": 15519 + }, + { + "epoch": 1.0515617589267565, + "grad_norm": 6.7520976066589355, + "learning_rate": 7.978095694434938e-05, + "loss": 0.8591, + "step": 15520 + }, + { + "epoch": 1.0516295141947287, + "grad_norm": 5.863169193267822, + "learning_rate": 7.977958792525157e-05, + "loss": 0.6715, + "step": 15521 + }, + { + "epoch": 1.0516972694627007, + "grad_norm": 7.465962886810303, + "learning_rate": 7.977821890615375e-05, + "loss": 0.6657, + "step": 15522 + }, + { + "epoch": 1.051765024730673, + "grad_norm": 4.659458637237549, + "learning_rate": 7.977684988705593e-05, + "loss": 0.8134, + "step": 15523 + }, + { + "epoch": 1.0518327799986449, + "grad_norm": 7.612015247344971, + "learning_rate": 7.977548086795811e-05, + "loss": 0.6289, + "step": 15524 + }, + { + "epoch": 1.051900535266617, + "grad_norm": 5.157435894012451, + "learning_rate": 7.97741118488603e-05, + "loss": 0.7039, + "step": 15525 + }, + { + "epoch": 1.051968290534589, + "grad_norm": 5.263278484344482, + "learning_rate": 7.977274282976249e-05, + "loss": 0.5698, + "step": 15526 + }, + { + "epoch": 1.052036045802561, + "grad_norm": 5.186944484710693, + "learning_rate": 7.977137381066467e-05, + "loss": 0.5398, + "step": 15527 + }, + { + "epoch": 1.0521038010705333, + "grad_norm": 6.908531188964844, + "learning_rate": 7.977000479156685e-05, + "loss": 0.637, + "step": 15528 + }, + { + "epoch": 1.0521715563385052, + "grad_norm": 5.298072338104248, + "learning_rate": 7.976863577246903e-05, + "loss": 0.55, + "step": 15529 + }, + { + "epoch": 1.0522393116064774, + "grad_norm": 4.47366189956665, + "learning_rate": 7.976726675337122e-05, + "loss": 0.5943, + "step": 15530 + }, + { + "epoch": 1.0523070668744494, + "grad_norm": 5.410435676574707, + "learning_rate": 7.97658977342734e-05, + "loss": 0.9073, + "step": 15531 + }, + { + "epoch": 1.0523748221424216, + "grad_norm": 5.6297478675842285, + "learning_rate": 7.976452871517558e-05, + "loss": 0.7233, + "step": 15532 + }, + { + "epoch": 1.0524425774103936, + "grad_norm": 5.776382923126221, + "learning_rate": 7.976315969607776e-05, + "loss": 0.6116, + "step": 15533 + }, + { + "epoch": 1.0525103326783658, + "grad_norm": 7.4787774085998535, + "learning_rate": 7.976179067697996e-05, + "loss": 0.7542, + "step": 15534 + }, + { + "epoch": 1.0525780879463378, + "grad_norm": 4.613537311553955, + "learning_rate": 7.976042165788214e-05, + "loss": 0.6745, + "step": 15535 + }, + { + "epoch": 1.05264584321431, + "grad_norm": 6.3547282218933105, + "learning_rate": 7.975905263878432e-05, + "loss": 0.7607, + "step": 15536 + }, + { + "epoch": 1.052713598482282, + "grad_norm": 5.2433366775512695, + "learning_rate": 7.97576836196865e-05, + "loss": 0.679, + "step": 15537 + }, + { + "epoch": 1.0527813537502542, + "grad_norm": 6.4581522941589355, + "learning_rate": 7.975631460058868e-05, + "loss": 0.8027, + "step": 15538 + }, + { + "epoch": 1.0528491090182261, + "grad_norm": 6.2019267082214355, + "learning_rate": 7.975494558149087e-05, + "loss": 0.8168, + "step": 15539 + }, + { + "epoch": 1.0529168642861984, + "grad_norm": 5.71392297744751, + "learning_rate": 7.975357656239305e-05, + "loss": 0.6555, + "step": 15540 + }, + { + "epoch": 1.0529846195541703, + "grad_norm": 6.714015007019043, + "learning_rate": 7.975220754329523e-05, + "loss": 0.6979, + "step": 15541 + }, + { + "epoch": 1.0530523748221423, + "grad_norm": 6.717672348022461, + "learning_rate": 7.975083852419741e-05, + "loss": 0.8411, + "step": 15542 + }, + { + "epoch": 1.0531201300901145, + "grad_norm": 7.295918941497803, + "learning_rate": 7.97494695050996e-05, + "loss": 0.702, + "step": 15543 + }, + { + "epoch": 1.0531878853580865, + "grad_norm": 6.989544868469238, + "learning_rate": 7.974810048600179e-05, + "loss": 0.6153, + "step": 15544 + }, + { + "epoch": 1.0532556406260587, + "grad_norm": 6.641953945159912, + "learning_rate": 7.974673146690397e-05, + "loss": 0.4917, + "step": 15545 + }, + { + "epoch": 1.0533233958940307, + "grad_norm": 7.03810977935791, + "learning_rate": 7.974536244780615e-05, + "loss": 0.6252, + "step": 15546 + }, + { + "epoch": 1.0533911511620029, + "grad_norm": 5.024344444274902, + "learning_rate": 7.974399342870833e-05, + "loss": 0.7038, + "step": 15547 + }, + { + "epoch": 1.0534589064299749, + "grad_norm": 6.919520854949951, + "learning_rate": 7.974262440961052e-05, + "loss": 0.9027, + "step": 15548 + }, + { + "epoch": 1.053526661697947, + "grad_norm": 5.6881537437438965, + "learning_rate": 7.97412553905127e-05, + "loss": 0.6556, + "step": 15549 + }, + { + "epoch": 1.053594416965919, + "grad_norm": 6.098855495452881, + "learning_rate": 7.973988637141488e-05, + "loss": 0.6225, + "step": 15550 + }, + { + "epoch": 1.0536621722338912, + "grad_norm": 5.481220245361328, + "learning_rate": 7.973851735231706e-05, + "loss": 0.7031, + "step": 15551 + }, + { + "epoch": 1.0537299275018632, + "grad_norm": 7.0920586585998535, + "learning_rate": 7.973714833321924e-05, + "loss": 0.672, + "step": 15552 + }, + { + "epoch": 1.0537976827698354, + "grad_norm": 4.710086345672607, + "learning_rate": 7.973577931412144e-05, + "loss": 0.652, + "step": 15553 + }, + { + "epoch": 1.0538654380378074, + "grad_norm": 5.764717102050781, + "learning_rate": 7.973441029502362e-05, + "loss": 0.7777, + "step": 15554 + }, + { + "epoch": 1.0539331933057796, + "grad_norm": 6.428826332092285, + "learning_rate": 7.97330412759258e-05, + "loss": 0.7785, + "step": 15555 + }, + { + "epoch": 1.0540009485737516, + "grad_norm": 5.741217136383057, + "learning_rate": 7.973167225682798e-05, + "loss": 0.6728, + "step": 15556 + }, + { + "epoch": 1.0540687038417238, + "grad_norm": 7.573585510253906, + "learning_rate": 7.973030323773017e-05, + "loss": 0.7876, + "step": 15557 + }, + { + "epoch": 1.0541364591096958, + "grad_norm": 4.414328575134277, + "learning_rate": 7.972893421863235e-05, + "loss": 0.556, + "step": 15558 + }, + { + "epoch": 1.0542042143776678, + "grad_norm": 4.814504623413086, + "learning_rate": 7.972756519953453e-05, + "loss": 0.6419, + "step": 15559 + }, + { + "epoch": 1.05427196964564, + "grad_norm": 6.726992130279541, + "learning_rate": 7.972619618043673e-05, + "loss": 0.7071, + "step": 15560 + }, + { + "epoch": 1.054339724913612, + "grad_norm": 4.732487201690674, + "learning_rate": 7.97248271613389e-05, + "loss": 0.6135, + "step": 15561 + }, + { + "epoch": 1.0544074801815841, + "grad_norm": 5.262027740478516, + "learning_rate": 7.972345814224109e-05, + "loss": 0.657, + "step": 15562 + }, + { + "epoch": 1.0544752354495561, + "grad_norm": 7.205251693725586, + "learning_rate": 7.972208912314328e-05, + "loss": 0.6268, + "step": 15563 + }, + { + "epoch": 1.0545429907175283, + "grad_norm": 6.2166337966918945, + "learning_rate": 7.972072010404546e-05, + "loss": 0.678, + "step": 15564 + }, + { + "epoch": 1.0546107459855003, + "grad_norm": 5.0857343673706055, + "learning_rate": 7.971935108494764e-05, + "loss": 0.6984, + "step": 15565 + }, + { + "epoch": 1.0546785012534725, + "grad_norm": 5.1078782081604, + "learning_rate": 7.971798206584983e-05, + "loss": 0.7758, + "step": 15566 + }, + { + "epoch": 1.0547462565214445, + "grad_norm": 6.676407337188721, + "learning_rate": 7.971661304675201e-05, + "loss": 0.9758, + "step": 15567 + }, + { + "epoch": 1.0548140117894167, + "grad_norm": 10.701401710510254, + "learning_rate": 7.97152440276542e-05, + "loss": 0.7816, + "step": 15568 + }, + { + "epoch": 1.0548817670573887, + "grad_norm": 11.736532211303711, + "learning_rate": 7.971387500855638e-05, + "loss": 0.7215, + "step": 15569 + }, + { + "epoch": 1.0549495223253609, + "grad_norm": 5.571110248565674, + "learning_rate": 7.971250598945856e-05, + "loss": 0.683, + "step": 15570 + }, + { + "epoch": 1.0550172775933329, + "grad_norm": 6.118210315704346, + "learning_rate": 7.971113697036075e-05, + "loss": 0.6195, + "step": 15571 + }, + { + "epoch": 1.055085032861305, + "grad_norm": 6.9824748039245605, + "learning_rate": 7.970976795126293e-05, + "loss": 0.6818, + "step": 15572 + }, + { + "epoch": 1.055152788129277, + "grad_norm": 6.418920040130615, + "learning_rate": 7.970839893216511e-05, + "loss": 0.6653, + "step": 15573 + }, + { + "epoch": 1.0552205433972492, + "grad_norm": 5.251111030578613, + "learning_rate": 7.970702991306729e-05, + "loss": 0.5911, + "step": 15574 + }, + { + "epoch": 1.0552882986652212, + "grad_norm": 6.46027135848999, + "learning_rate": 7.970566089396947e-05, + "loss": 0.6771, + "step": 15575 + }, + { + "epoch": 1.0553560539331932, + "grad_norm": 5.67859411239624, + "learning_rate": 7.970429187487166e-05, + "loss": 0.6342, + "step": 15576 + }, + { + "epoch": 1.0554238092011654, + "grad_norm": 6.3809661865234375, + "learning_rate": 7.970292285577385e-05, + "loss": 0.7269, + "step": 15577 + }, + { + "epoch": 1.0554915644691374, + "grad_norm": 5.144848346710205, + "learning_rate": 7.970155383667603e-05, + "loss": 0.7396, + "step": 15578 + }, + { + "epoch": 1.0555593197371096, + "grad_norm": 6.251219749450684, + "learning_rate": 7.97001848175782e-05, + "loss": 0.6375, + "step": 15579 + }, + { + "epoch": 1.0556270750050816, + "grad_norm": 6.255589008331299, + "learning_rate": 7.96988157984804e-05, + "loss": 0.5346, + "step": 15580 + }, + { + "epoch": 1.0556948302730538, + "grad_norm": 7.6489481925964355, + "learning_rate": 7.969744677938258e-05, + "loss": 0.6735, + "step": 15581 + }, + { + "epoch": 1.0557625855410258, + "grad_norm": 6.068694114685059, + "learning_rate": 7.969607776028476e-05, + "loss": 0.6175, + "step": 15582 + }, + { + "epoch": 1.055830340808998, + "grad_norm": 6.7391815185546875, + "learning_rate": 7.969470874118694e-05, + "loss": 0.8585, + "step": 15583 + }, + { + "epoch": 1.05589809607697, + "grad_norm": 6.22243595123291, + "learning_rate": 7.969333972208912e-05, + "loss": 0.5475, + "step": 15584 + }, + { + "epoch": 1.0559658513449421, + "grad_norm": 5.03702449798584, + "learning_rate": 7.969197070299132e-05, + "loss": 0.7151, + "step": 15585 + }, + { + "epoch": 1.0560336066129141, + "grad_norm": 6.093680381774902, + "learning_rate": 7.96906016838935e-05, + "loss": 0.5883, + "step": 15586 + }, + { + "epoch": 1.0561013618808863, + "grad_norm": 7.8765034675598145, + "learning_rate": 7.968923266479568e-05, + "loss": 0.7901, + "step": 15587 + }, + { + "epoch": 1.0561691171488583, + "grad_norm": 6.4514031410217285, + "learning_rate": 7.968786364569786e-05, + "loss": 0.6763, + "step": 15588 + }, + { + "epoch": 1.0562368724168305, + "grad_norm": 5.650920867919922, + "learning_rate": 7.968649462660005e-05, + "loss": 0.74, + "step": 15589 + }, + { + "epoch": 1.0563046276848025, + "grad_norm": 5.893711090087891, + "learning_rate": 7.968512560750223e-05, + "loss": 0.696, + "step": 15590 + }, + { + "epoch": 1.0563723829527745, + "grad_norm": 5.7477240562438965, + "learning_rate": 7.968375658840441e-05, + "loss": 0.6714, + "step": 15591 + }, + { + "epoch": 1.0564401382207467, + "grad_norm": 5.326556205749512, + "learning_rate": 7.968238756930659e-05, + "loss": 0.6365, + "step": 15592 + }, + { + "epoch": 1.0565078934887187, + "grad_norm": 6.540344715118408, + "learning_rate": 7.968101855020877e-05, + "loss": 0.7155, + "step": 15593 + }, + { + "epoch": 1.0565756487566909, + "grad_norm": 6.695101261138916, + "learning_rate": 7.967964953111097e-05, + "loss": 0.7152, + "step": 15594 + }, + { + "epoch": 1.0566434040246628, + "grad_norm": 6.3795905113220215, + "learning_rate": 7.967828051201315e-05, + "loss": 0.6307, + "step": 15595 + }, + { + "epoch": 1.056711159292635, + "grad_norm": 6.77623987197876, + "learning_rate": 7.967691149291533e-05, + "loss": 0.6633, + "step": 15596 + }, + { + "epoch": 1.056778914560607, + "grad_norm": 7.296997547149658, + "learning_rate": 7.96755424738175e-05, + "loss": 0.6032, + "step": 15597 + }, + { + "epoch": 1.0568466698285792, + "grad_norm": 9.698561668395996, + "learning_rate": 7.967417345471969e-05, + "loss": 0.7025, + "step": 15598 + }, + { + "epoch": 1.0569144250965512, + "grad_norm": 6.234988212585449, + "learning_rate": 7.967280443562188e-05, + "loss": 0.7538, + "step": 15599 + }, + { + "epoch": 1.0569821803645234, + "grad_norm": 5.491406440734863, + "learning_rate": 7.967143541652406e-05, + "loss": 0.923, + "step": 15600 + }, + { + "epoch": 1.0570499356324954, + "grad_norm": 5.402745723724365, + "learning_rate": 7.967006639742624e-05, + "loss": 0.6183, + "step": 15601 + }, + { + "epoch": 1.0571176909004676, + "grad_norm": 6.674182415008545, + "learning_rate": 7.966869737832842e-05, + "loss": 0.6448, + "step": 15602 + }, + { + "epoch": 1.0571854461684396, + "grad_norm": 6.507812976837158, + "learning_rate": 7.966732835923062e-05, + "loss": 0.8753, + "step": 15603 + }, + { + "epoch": 1.0572532014364118, + "grad_norm": 5.593377590179443, + "learning_rate": 7.96659593401328e-05, + "loss": 0.7686, + "step": 15604 + }, + { + "epoch": 1.0573209567043838, + "grad_norm": 6.690634250640869, + "learning_rate": 7.966459032103498e-05, + "loss": 1.1595, + "step": 15605 + }, + { + "epoch": 1.057388711972356, + "grad_norm": 6.244167804718018, + "learning_rate": 7.966322130193717e-05, + "loss": 0.7725, + "step": 15606 + }, + { + "epoch": 1.057456467240328, + "grad_norm": 5.173239707946777, + "learning_rate": 7.966185228283935e-05, + "loss": 0.6084, + "step": 15607 + }, + { + "epoch": 1.0575242225083, + "grad_norm": 5.758760929107666, + "learning_rate": 7.966048326374153e-05, + "loss": 0.7413, + "step": 15608 + }, + { + "epoch": 1.0575919777762721, + "grad_norm": 4.311896800994873, + "learning_rate": 7.965911424464372e-05, + "loss": 0.5449, + "step": 15609 + }, + { + "epoch": 1.057659733044244, + "grad_norm": 7.174588203430176, + "learning_rate": 7.96577452255459e-05, + "loss": 0.7287, + "step": 15610 + }, + { + "epoch": 1.0577274883122163, + "grad_norm": 5.071308135986328, + "learning_rate": 7.965637620644809e-05, + "loss": 0.6074, + "step": 15611 + }, + { + "epoch": 1.0577952435801883, + "grad_norm": 6.668962001800537, + "learning_rate": 7.965500718735028e-05, + "loss": 0.6961, + "step": 15612 + }, + { + "epoch": 1.0578629988481605, + "grad_norm": 5.634909629821777, + "learning_rate": 7.965363816825246e-05, + "loss": 0.7343, + "step": 15613 + }, + { + "epoch": 1.0579307541161325, + "grad_norm": 5.5863800048828125, + "learning_rate": 7.965226914915464e-05, + "loss": 0.5128, + "step": 15614 + }, + { + "epoch": 1.0579985093841047, + "grad_norm": 7.636445045471191, + "learning_rate": 7.965090013005682e-05, + "loss": 0.6415, + "step": 15615 + }, + { + "epoch": 1.0580662646520766, + "grad_norm": 8.801673889160156, + "learning_rate": 7.9649531110959e-05, + "loss": 0.6823, + "step": 15616 + }, + { + "epoch": 1.0581340199200489, + "grad_norm": 5.3621721267700195, + "learning_rate": 7.96481620918612e-05, + "loss": 0.8068, + "step": 15617 + }, + { + "epoch": 1.0582017751880208, + "grad_norm": 6.023779392242432, + "learning_rate": 7.964679307276337e-05, + "loss": 0.8556, + "step": 15618 + }, + { + "epoch": 1.058269530455993, + "grad_norm": 5.901587963104248, + "learning_rate": 7.964542405366556e-05, + "loss": 0.6895, + "step": 15619 + }, + { + "epoch": 1.058337285723965, + "grad_norm": 5.533476829528809, + "learning_rate": 7.964405503456774e-05, + "loss": 0.6305, + "step": 15620 + }, + { + "epoch": 1.0584050409919372, + "grad_norm": 4.859772682189941, + "learning_rate": 7.964268601546993e-05, + "loss": 0.4848, + "step": 15621 + }, + { + "epoch": 1.0584727962599092, + "grad_norm": 4.763238430023193, + "learning_rate": 7.964131699637211e-05, + "loss": 0.6645, + "step": 15622 + }, + { + "epoch": 1.0585405515278814, + "grad_norm": 5.757406711578369, + "learning_rate": 7.963994797727429e-05, + "loss": 0.5606, + "step": 15623 + }, + { + "epoch": 1.0586083067958534, + "grad_norm": 5.710381984710693, + "learning_rate": 7.963857895817647e-05, + "loss": 0.6795, + "step": 15624 + }, + { + "epoch": 1.0586760620638254, + "grad_norm": 6.074893951416016, + "learning_rate": 7.963720993907865e-05, + "loss": 0.7556, + "step": 15625 + }, + { + "epoch": 1.0587438173317976, + "grad_norm": 4.727344512939453, + "learning_rate": 7.963584091998084e-05, + "loss": 0.4786, + "step": 15626 + }, + { + "epoch": 1.0588115725997695, + "grad_norm": 5.696430683135986, + "learning_rate": 7.963447190088302e-05, + "loss": 0.8202, + "step": 15627 + }, + { + "epoch": 1.0588793278677417, + "grad_norm": 6.602227210998535, + "learning_rate": 7.96331028817852e-05, + "loss": 0.5754, + "step": 15628 + }, + { + "epoch": 1.0589470831357137, + "grad_norm": 5.352349758148193, + "learning_rate": 7.963173386268739e-05, + "loss": 0.7036, + "step": 15629 + }, + { + "epoch": 1.059014838403686, + "grad_norm": 4.953490734100342, + "learning_rate": 7.963036484358957e-05, + "loss": 0.5278, + "step": 15630 + }, + { + "epoch": 1.059082593671658, + "grad_norm": 5.749557971954346, + "learning_rate": 7.962899582449176e-05, + "loss": 0.7663, + "step": 15631 + }, + { + "epoch": 1.0591503489396301, + "grad_norm": 5.01806640625, + "learning_rate": 7.962762680539394e-05, + "loss": 0.699, + "step": 15632 + }, + { + "epoch": 1.059218104207602, + "grad_norm": 5.559873104095459, + "learning_rate": 7.962625778629612e-05, + "loss": 0.8066, + "step": 15633 + }, + { + "epoch": 1.0592858594755743, + "grad_norm": 7.373801231384277, + "learning_rate": 7.96248887671983e-05, + "loss": 0.8825, + "step": 15634 + }, + { + "epoch": 1.0593536147435463, + "grad_norm": 6.330979824066162, + "learning_rate": 7.96235197481005e-05, + "loss": 0.8277, + "step": 15635 + }, + { + "epoch": 1.0594213700115185, + "grad_norm": 4.986181735992432, + "learning_rate": 7.962215072900268e-05, + "loss": 0.7141, + "step": 15636 + }, + { + "epoch": 1.0594891252794905, + "grad_norm": 6.320187568664551, + "learning_rate": 7.962078170990486e-05, + "loss": 0.7843, + "step": 15637 + }, + { + "epoch": 1.0595568805474627, + "grad_norm": 4.531287670135498, + "learning_rate": 7.961941269080704e-05, + "loss": 0.789, + "step": 15638 + }, + { + "epoch": 1.0596246358154346, + "grad_norm": 6.472817897796631, + "learning_rate": 7.961804367170922e-05, + "loss": 0.7853, + "step": 15639 + }, + { + "epoch": 1.0596923910834066, + "grad_norm": 6.977755546569824, + "learning_rate": 7.961667465261141e-05, + "loss": 0.8924, + "step": 15640 + }, + { + "epoch": 1.0597601463513788, + "grad_norm": 4.986513137817383, + "learning_rate": 7.961530563351359e-05, + "loss": 0.6245, + "step": 15641 + }, + { + "epoch": 1.0598279016193508, + "grad_norm": 6.008297443389893, + "learning_rate": 7.961393661441577e-05, + "loss": 0.6167, + "step": 15642 + }, + { + "epoch": 1.059895656887323, + "grad_norm": 6.505741596221924, + "learning_rate": 7.961256759531795e-05, + "loss": 0.5389, + "step": 15643 + }, + { + "epoch": 1.059963412155295, + "grad_norm": 7.218502521514893, + "learning_rate": 7.961119857622014e-05, + "loss": 0.7364, + "step": 15644 + }, + { + "epoch": 1.0600311674232672, + "grad_norm": 6.085995197296143, + "learning_rate": 7.960982955712233e-05, + "loss": 0.9236, + "step": 15645 + }, + { + "epoch": 1.0600989226912392, + "grad_norm": 5.947787284851074, + "learning_rate": 7.96084605380245e-05, + "loss": 0.5488, + "step": 15646 + }, + { + "epoch": 1.0601666779592114, + "grad_norm": 6.727092742919922, + "learning_rate": 7.960709151892669e-05, + "loss": 0.9123, + "step": 15647 + }, + { + "epoch": 1.0602344332271834, + "grad_norm": 6.658313751220703, + "learning_rate": 7.960572249982887e-05, + "loss": 0.8878, + "step": 15648 + }, + { + "epoch": 1.0603021884951556, + "grad_norm": 7.2491984367370605, + "learning_rate": 7.960435348073106e-05, + "loss": 0.6701, + "step": 15649 + }, + { + "epoch": 1.0603699437631275, + "grad_norm": 8.330307960510254, + "learning_rate": 7.960298446163324e-05, + "loss": 0.8083, + "step": 15650 + }, + { + "epoch": 1.0604376990310997, + "grad_norm": 6.639986038208008, + "learning_rate": 7.960161544253542e-05, + "loss": 0.6546, + "step": 15651 + }, + { + "epoch": 1.0605054542990717, + "grad_norm": 5.291508674621582, + "learning_rate": 7.96002464234376e-05, + "loss": 0.4886, + "step": 15652 + }, + { + "epoch": 1.060573209567044, + "grad_norm": 6.049704551696777, + "learning_rate": 7.95988774043398e-05, + "loss": 0.8144, + "step": 15653 + }, + { + "epoch": 1.060640964835016, + "grad_norm": 4.995707988739014, + "learning_rate": 7.959750838524198e-05, + "loss": 0.7146, + "step": 15654 + }, + { + "epoch": 1.0607087201029881, + "grad_norm": 5.3545451164245605, + "learning_rate": 7.959613936614416e-05, + "loss": 0.6091, + "step": 15655 + }, + { + "epoch": 1.06077647537096, + "grad_norm": 5.604588508605957, + "learning_rate": 7.959477034704635e-05, + "loss": 0.7015, + "step": 15656 + }, + { + "epoch": 1.060844230638932, + "grad_norm": 4.693150997161865, + "learning_rate": 7.959340132794853e-05, + "loss": 0.6435, + "step": 15657 + }, + { + "epoch": 1.0609119859069043, + "grad_norm": 5.757687568664551, + "learning_rate": 7.959203230885071e-05, + "loss": 0.7502, + "step": 15658 + }, + { + "epoch": 1.0609797411748763, + "grad_norm": 5.2874064445495605, + "learning_rate": 7.95906632897529e-05, + "loss": 0.5808, + "step": 15659 + }, + { + "epoch": 1.0610474964428485, + "grad_norm": 5.154580116271973, + "learning_rate": 7.958929427065508e-05, + "loss": 0.6679, + "step": 15660 + }, + { + "epoch": 1.0611152517108204, + "grad_norm": 4.7557830810546875, + "learning_rate": 7.958792525155726e-05, + "loss": 0.452, + "step": 15661 + }, + { + "epoch": 1.0611830069787926, + "grad_norm": 4.7612833976745605, + "learning_rate": 7.958655623245945e-05, + "loss": 0.5888, + "step": 15662 + }, + { + "epoch": 1.0612507622467646, + "grad_norm": 6.610939025878906, + "learning_rate": 7.958518721336164e-05, + "loss": 0.8694, + "step": 15663 + }, + { + "epoch": 1.0613185175147368, + "grad_norm": 4.907252788543701, + "learning_rate": 7.958381819426382e-05, + "loss": 0.5352, + "step": 15664 + }, + { + "epoch": 1.0613862727827088, + "grad_norm": 5.652861595153809, + "learning_rate": 7.9582449175166e-05, + "loss": 0.7831, + "step": 15665 + }, + { + "epoch": 1.061454028050681, + "grad_norm": 6.604598045349121, + "learning_rate": 7.958108015606818e-05, + "loss": 0.5988, + "step": 15666 + }, + { + "epoch": 1.061521783318653, + "grad_norm": 6.462950706481934, + "learning_rate": 7.957971113697037e-05, + "loss": 0.9984, + "step": 15667 + }, + { + "epoch": 1.0615895385866252, + "grad_norm": 5.563201427459717, + "learning_rate": 7.957834211787255e-05, + "loss": 0.583, + "step": 15668 + }, + { + "epoch": 1.0616572938545972, + "grad_norm": 8.087369918823242, + "learning_rate": 7.957697309877473e-05, + "loss": 0.7112, + "step": 15669 + }, + { + "epoch": 1.0617250491225694, + "grad_norm": 6.019291877746582, + "learning_rate": 7.957560407967691e-05, + "loss": 0.4987, + "step": 15670 + }, + { + "epoch": 1.0617928043905414, + "grad_norm": 5.730112075805664, + "learning_rate": 7.95742350605791e-05, + "loss": 0.7165, + "step": 15671 + }, + { + "epoch": 1.0618605596585136, + "grad_norm": 4.978113651275635, + "learning_rate": 7.957286604148129e-05, + "loss": 0.6191, + "step": 15672 + }, + { + "epoch": 1.0619283149264855, + "grad_norm": 14.537919998168945, + "learning_rate": 7.957149702238347e-05, + "loss": 0.6737, + "step": 15673 + }, + { + "epoch": 1.0619960701944575, + "grad_norm": 6.772073745727539, + "learning_rate": 7.957012800328565e-05, + "loss": 0.6738, + "step": 15674 + }, + { + "epoch": 1.0620638254624297, + "grad_norm": 7.139239311218262, + "learning_rate": 7.956875898418783e-05, + "loss": 0.7071, + "step": 15675 + }, + { + "epoch": 1.0621315807304017, + "grad_norm": 3.958707571029663, + "learning_rate": 7.956738996509002e-05, + "loss": 0.5774, + "step": 15676 + }, + { + "epoch": 1.062199335998374, + "grad_norm": 7.803884029388428, + "learning_rate": 7.95660209459922e-05, + "loss": 1.1357, + "step": 15677 + }, + { + "epoch": 1.0622670912663459, + "grad_norm": 4.009269714355469, + "learning_rate": 7.956465192689438e-05, + "loss": 0.5232, + "step": 15678 + }, + { + "epoch": 1.062334846534318, + "grad_norm": 5.269588947296143, + "learning_rate": 7.956328290779657e-05, + "loss": 0.6048, + "step": 15679 + }, + { + "epoch": 1.06240260180229, + "grad_norm": 6.135133266448975, + "learning_rate": 7.956191388869875e-05, + "loss": 0.8373, + "step": 15680 + }, + { + "epoch": 1.0624703570702623, + "grad_norm": 7.336065769195557, + "learning_rate": 7.956054486960094e-05, + "loss": 0.7269, + "step": 15681 + }, + { + "epoch": 1.0625381123382343, + "grad_norm": 5.202469825744629, + "learning_rate": 7.955917585050312e-05, + "loss": 0.5768, + "step": 15682 + }, + { + "epoch": 1.0626058676062065, + "grad_norm": 9.39901351928711, + "learning_rate": 7.95578068314053e-05, + "loss": 0.7882, + "step": 15683 + }, + { + "epoch": 1.0626736228741784, + "grad_norm": 6.687909126281738, + "learning_rate": 7.955643781230748e-05, + "loss": 0.7558, + "step": 15684 + }, + { + "epoch": 1.0627413781421506, + "grad_norm": 6.786401271820068, + "learning_rate": 7.955506879320966e-05, + "loss": 0.8085, + "step": 15685 + }, + { + "epoch": 1.0628091334101226, + "grad_norm": 5.360512733459473, + "learning_rate": 7.955369977411185e-05, + "loss": 0.7416, + "step": 15686 + }, + { + "epoch": 1.0628768886780948, + "grad_norm": 6.797042369842529, + "learning_rate": 7.955233075501403e-05, + "loss": 0.6518, + "step": 15687 + }, + { + "epoch": 1.0629446439460668, + "grad_norm": 9.375871658325195, + "learning_rate": 7.955096173591622e-05, + "loss": 0.6947, + "step": 15688 + }, + { + "epoch": 1.0630123992140388, + "grad_norm": 4.795276641845703, + "learning_rate": 7.95495927168184e-05, + "loss": 0.5838, + "step": 15689 + }, + { + "epoch": 1.063080154482011, + "grad_norm": 7.5510759353637695, + "learning_rate": 7.954822369772059e-05, + "loss": 0.6106, + "step": 15690 + }, + { + "epoch": 1.063147909749983, + "grad_norm": 5.1748175621032715, + "learning_rate": 7.954685467862277e-05, + "loss": 0.7426, + "step": 15691 + }, + { + "epoch": 1.0632156650179552, + "grad_norm": 4.72312593460083, + "learning_rate": 7.954548565952495e-05, + "loss": 0.4954, + "step": 15692 + }, + { + "epoch": 1.0632834202859271, + "grad_norm": 6.420324325561523, + "learning_rate": 7.954411664042713e-05, + "loss": 0.7084, + "step": 15693 + }, + { + "epoch": 1.0633511755538994, + "grad_norm": 5.019575119018555, + "learning_rate": 7.954274762132931e-05, + "loss": 0.6904, + "step": 15694 + }, + { + "epoch": 1.0634189308218713, + "grad_norm": 7.187556743621826, + "learning_rate": 7.95413786022315e-05, + "loss": 0.752, + "step": 15695 + }, + { + "epoch": 1.0634866860898435, + "grad_norm": 4.740534782409668, + "learning_rate": 7.954000958313369e-05, + "loss": 0.5472, + "step": 15696 + }, + { + "epoch": 1.0635544413578155, + "grad_norm": 4.72992467880249, + "learning_rate": 7.953864056403587e-05, + "loss": 0.5106, + "step": 15697 + }, + { + "epoch": 1.0636221966257877, + "grad_norm": 8.433707237243652, + "learning_rate": 7.953727154493805e-05, + "loss": 0.552, + "step": 15698 + }, + { + "epoch": 1.0636899518937597, + "grad_norm": 7.640308856964111, + "learning_rate": 7.953590252584024e-05, + "loss": 0.5683, + "step": 15699 + }, + { + "epoch": 1.063757707161732, + "grad_norm": 5.864986419677734, + "learning_rate": 7.953453350674242e-05, + "loss": 0.7258, + "step": 15700 + }, + { + "epoch": 1.0638254624297039, + "grad_norm": 4.7822418212890625, + "learning_rate": 7.95331644876446e-05, + "loss": 0.7018, + "step": 15701 + }, + { + "epoch": 1.063893217697676, + "grad_norm": 7.310060501098633, + "learning_rate": 7.95317954685468e-05, + "loss": 0.6671, + "step": 15702 + }, + { + "epoch": 1.063960972965648, + "grad_norm": 5.417652606964111, + "learning_rate": 7.953042644944897e-05, + "loss": 0.7866, + "step": 15703 + }, + { + "epoch": 1.06402872823362, + "grad_norm": 6.251247882843018, + "learning_rate": 7.952905743035115e-05, + "loss": 0.7288, + "step": 15704 + }, + { + "epoch": 1.0640964835015922, + "grad_norm": 4.557955741882324, + "learning_rate": 7.952768841125335e-05, + "loss": 0.7022, + "step": 15705 + }, + { + "epoch": 1.0641642387695642, + "grad_norm": 6.946822166442871, + "learning_rate": 7.952631939215553e-05, + "loss": 0.7494, + "step": 15706 + }, + { + "epoch": 1.0642319940375364, + "grad_norm": 5.914007663726807, + "learning_rate": 7.952495037305771e-05, + "loss": 0.7723, + "step": 15707 + }, + { + "epoch": 1.0642997493055084, + "grad_norm": 5.465998649597168, + "learning_rate": 7.952358135395989e-05, + "loss": 0.5798, + "step": 15708 + }, + { + "epoch": 1.0643675045734806, + "grad_norm": 6.636282920837402, + "learning_rate": 7.952221233486208e-05, + "loss": 0.8187, + "step": 15709 + }, + { + "epoch": 1.0644352598414526, + "grad_norm": 4.796360969543457, + "learning_rate": 7.952084331576426e-05, + "loss": 0.4452, + "step": 15710 + }, + { + "epoch": 1.0645030151094248, + "grad_norm": 6.594966888427734, + "learning_rate": 7.951947429666644e-05, + "loss": 0.827, + "step": 15711 + }, + { + "epoch": 1.0645707703773968, + "grad_norm": 4.327628135681152, + "learning_rate": 7.951810527756862e-05, + "loss": 0.5833, + "step": 15712 + }, + { + "epoch": 1.064638525645369, + "grad_norm": 4.537369251251221, + "learning_rate": 7.951673625847082e-05, + "loss": 0.6398, + "step": 15713 + }, + { + "epoch": 1.064706280913341, + "grad_norm": 5.633305549621582, + "learning_rate": 7.9515367239373e-05, + "loss": 0.6274, + "step": 15714 + }, + { + "epoch": 1.0647740361813132, + "grad_norm": 5.848304271697998, + "learning_rate": 7.951399822027518e-05, + "loss": 0.6827, + "step": 15715 + }, + { + "epoch": 1.0648417914492851, + "grad_norm": 5.550229072570801, + "learning_rate": 7.951262920117736e-05, + "loss": 0.5926, + "step": 15716 + }, + { + "epoch": 1.0649095467172573, + "grad_norm": 5.79759407043457, + "learning_rate": 7.951126018207954e-05, + "loss": 0.7239, + "step": 15717 + }, + { + "epoch": 1.0649773019852293, + "grad_norm": 5.386721134185791, + "learning_rate": 7.950989116298173e-05, + "loss": 0.6221, + "step": 15718 + }, + { + "epoch": 1.0650450572532015, + "grad_norm": 6.526064395904541, + "learning_rate": 7.950852214388391e-05, + "loss": 0.6978, + "step": 15719 + }, + { + "epoch": 1.0651128125211735, + "grad_norm": 5.103979587554932, + "learning_rate": 7.95071531247861e-05, + "loss": 0.5707, + "step": 15720 + }, + { + "epoch": 1.0651805677891457, + "grad_norm": 7.769407749176025, + "learning_rate": 7.950578410568827e-05, + "loss": 0.7698, + "step": 15721 + }, + { + "epoch": 1.0652483230571177, + "grad_norm": 4.609751224517822, + "learning_rate": 7.950441508659047e-05, + "loss": 0.8119, + "step": 15722 + }, + { + "epoch": 1.0653160783250897, + "grad_norm": 4.900935649871826, + "learning_rate": 7.950304606749265e-05, + "loss": 0.6217, + "step": 15723 + }, + { + "epoch": 1.0653838335930619, + "grad_norm": 6.532763957977295, + "learning_rate": 7.950167704839483e-05, + "loss": 0.7083, + "step": 15724 + }, + { + "epoch": 1.0654515888610339, + "grad_norm": 7.212814807891846, + "learning_rate": 7.950030802929701e-05, + "loss": 0.6279, + "step": 15725 + }, + { + "epoch": 1.065519344129006, + "grad_norm": 6.178891181945801, + "learning_rate": 7.949893901019919e-05, + "loss": 0.7573, + "step": 15726 + }, + { + "epoch": 1.065587099396978, + "grad_norm": 4.8897528648376465, + "learning_rate": 7.949756999110138e-05, + "loss": 0.5121, + "step": 15727 + }, + { + "epoch": 1.0656548546649502, + "grad_norm": 7.742895603179932, + "learning_rate": 7.949620097200356e-05, + "loss": 0.7652, + "step": 15728 + }, + { + "epoch": 1.0657226099329222, + "grad_norm": 6.08188533782959, + "learning_rate": 7.949483195290574e-05, + "loss": 0.7593, + "step": 15729 + }, + { + "epoch": 1.0657903652008944, + "grad_norm": 6.451333999633789, + "learning_rate": 7.949346293380793e-05, + "loss": 0.5019, + "step": 15730 + }, + { + "epoch": 1.0658581204688664, + "grad_norm": 5.50657844543457, + "learning_rate": 7.94920939147101e-05, + "loss": 0.8513, + "step": 15731 + }, + { + "epoch": 1.0659258757368386, + "grad_norm": 5.822981834411621, + "learning_rate": 7.94907248956123e-05, + "loss": 0.673, + "step": 15732 + }, + { + "epoch": 1.0659936310048106, + "grad_norm": 6.3767991065979, + "learning_rate": 7.948935587651448e-05, + "loss": 0.7667, + "step": 15733 + }, + { + "epoch": 1.0660613862727828, + "grad_norm": 6.416679382324219, + "learning_rate": 7.948798685741666e-05, + "loss": 0.7679, + "step": 15734 + }, + { + "epoch": 1.0661291415407548, + "grad_norm": 4.7926344871521, + "learning_rate": 7.948661783831884e-05, + "loss": 0.5938, + "step": 15735 + }, + { + "epoch": 1.066196896808727, + "grad_norm": 5.087335586547852, + "learning_rate": 7.948524881922103e-05, + "loss": 0.5486, + "step": 15736 + }, + { + "epoch": 1.066264652076699, + "grad_norm": 6.492136478424072, + "learning_rate": 7.948387980012321e-05, + "loss": 0.8351, + "step": 15737 + }, + { + "epoch": 1.066332407344671, + "grad_norm": 6.300568103790283, + "learning_rate": 7.94825107810254e-05, + "loss": 0.7037, + "step": 15738 + }, + { + "epoch": 1.0664001626126431, + "grad_norm": 5.677840709686279, + "learning_rate": 7.948114176192758e-05, + "loss": 0.6712, + "step": 15739 + }, + { + "epoch": 1.0664679178806151, + "grad_norm": 4.923882961273193, + "learning_rate": 7.947977274282976e-05, + "loss": 0.7166, + "step": 15740 + }, + { + "epoch": 1.0665356731485873, + "grad_norm": 8.807173728942871, + "learning_rate": 7.947840372373195e-05, + "loss": 0.5985, + "step": 15741 + }, + { + "epoch": 1.0666034284165593, + "grad_norm": 5.597122669219971, + "learning_rate": 7.947703470463413e-05, + "loss": 0.5973, + "step": 15742 + }, + { + "epoch": 1.0666711836845315, + "grad_norm": 6.10443639755249, + "learning_rate": 7.947566568553631e-05, + "loss": 0.7233, + "step": 15743 + }, + { + "epoch": 1.0667389389525035, + "grad_norm": 5.447210788726807, + "learning_rate": 7.947429666643849e-05, + "loss": 0.7122, + "step": 15744 + }, + { + "epoch": 1.0668066942204757, + "grad_norm": 7.586278438568115, + "learning_rate": 7.947292764734068e-05, + "loss": 0.8076, + "step": 15745 + }, + { + "epoch": 1.0668744494884477, + "grad_norm": 7.775143623352051, + "learning_rate": 7.947155862824286e-05, + "loss": 0.7714, + "step": 15746 + }, + { + "epoch": 1.0669422047564199, + "grad_norm": 6.429764270782471, + "learning_rate": 7.947018960914505e-05, + "loss": 0.5349, + "step": 15747 + }, + { + "epoch": 1.0670099600243919, + "grad_norm": 5.419410705566406, + "learning_rate": 7.946882059004724e-05, + "loss": 0.6813, + "step": 15748 + }, + { + "epoch": 1.067077715292364, + "grad_norm": 8.733865737915039, + "learning_rate": 7.946745157094942e-05, + "loss": 0.7048, + "step": 15749 + }, + { + "epoch": 1.067145470560336, + "grad_norm": 5.640448093414307, + "learning_rate": 7.94660825518516e-05, + "loss": 0.6213, + "step": 15750 + }, + { + "epoch": 1.0672132258283082, + "grad_norm": 6.9355692863464355, + "learning_rate": 7.94647135327538e-05, + "loss": 0.7853, + "step": 15751 + }, + { + "epoch": 1.0672809810962802, + "grad_norm": 6.067697048187256, + "learning_rate": 7.946334451365597e-05, + "loss": 0.7184, + "step": 15752 + }, + { + "epoch": 1.0673487363642522, + "grad_norm": 5.817471981048584, + "learning_rate": 7.946197549455815e-05, + "loss": 0.7555, + "step": 15753 + }, + { + "epoch": 1.0674164916322244, + "grad_norm": 5.668905735015869, + "learning_rate": 7.946060647546035e-05, + "loss": 0.6692, + "step": 15754 + }, + { + "epoch": 1.0674842469001964, + "grad_norm": 8.108335494995117, + "learning_rate": 7.945923745636253e-05, + "loss": 0.9925, + "step": 15755 + }, + { + "epoch": 1.0675520021681686, + "grad_norm": 9.043440818786621, + "learning_rate": 7.945786843726471e-05, + "loss": 0.7492, + "step": 15756 + }, + { + "epoch": 1.0676197574361406, + "grad_norm": 5.559685230255127, + "learning_rate": 7.945649941816689e-05, + "loss": 0.6123, + "step": 15757 + }, + { + "epoch": 1.0676875127041128, + "grad_norm": 6.704611778259277, + "learning_rate": 7.945513039906907e-05, + "loss": 0.8296, + "step": 15758 + }, + { + "epoch": 1.0677552679720848, + "grad_norm": 4.621387004852295, + "learning_rate": 7.945376137997126e-05, + "loss": 0.5638, + "step": 15759 + }, + { + "epoch": 1.067823023240057, + "grad_norm": 7.376697063446045, + "learning_rate": 7.945239236087344e-05, + "loss": 0.7056, + "step": 15760 + }, + { + "epoch": 1.067890778508029, + "grad_norm": 6.9395527839660645, + "learning_rate": 7.945102334177562e-05, + "loss": 0.9354, + "step": 15761 + }, + { + "epoch": 1.0679585337760011, + "grad_norm": 6.474776268005371, + "learning_rate": 7.94496543226778e-05, + "loss": 0.569, + "step": 15762 + }, + { + "epoch": 1.0680262890439731, + "grad_norm": 5.319557189941406, + "learning_rate": 7.944828530357998e-05, + "loss": 0.9097, + "step": 15763 + }, + { + "epoch": 1.0680940443119453, + "grad_norm": 6.072210788726807, + "learning_rate": 7.944691628448218e-05, + "loss": 0.8421, + "step": 15764 + }, + { + "epoch": 1.0681617995799173, + "grad_norm": 5.9819865226745605, + "learning_rate": 7.944554726538436e-05, + "loss": 0.8024, + "step": 15765 + }, + { + "epoch": 1.0682295548478895, + "grad_norm": 4.52716588973999, + "learning_rate": 7.944417824628654e-05, + "loss": 0.6446, + "step": 15766 + }, + { + "epoch": 1.0682973101158615, + "grad_norm": 5.481772422790527, + "learning_rate": 7.944280922718872e-05, + "loss": 0.5395, + "step": 15767 + }, + { + "epoch": 1.0683650653838337, + "grad_norm": 5.463979244232178, + "learning_rate": 7.944144020809091e-05, + "loss": 0.5777, + "step": 15768 + }, + { + "epoch": 1.0684328206518057, + "grad_norm": 6.096555233001709, + "learning_rate": 7.94400711889931e-05, + "loss": 0.6996, + "step": 15769 + }, + { + "epoch": 1.0685005759197779, + "grad_norm": 4.486513137817383, + "learning_rate": 7.943870216989527e-05, + "loss": 0.6826, + "step": 15770 + }, + { + "epoch": 1.0685683311877499, + "grad_norm": 5.200229167938232, + "learning_rate": 7.943733315079745e-05, + "loss": 0.6869, + "step": 15771 + }, + { + "epoch": 1.0686360864557218, + "grad_norm": 7.219493389129639, + "learning_rate": 7.943596413169963e-05, + "loss": 0.6478, + "step": 15772 + }, + { + "epoch": 1.068703841723694, + "grad_norm": 5.901253700256348, + "learning_rate": 7.943459511260183e-05, + "loss": 0.7505, + "step": 15773 + }, + { + "epoch": 1.068771596991666, + "grad_norm": 5.0671772956848145, + "learning_rate": 7.943322609350401e-05, + "loss": 0.7321, + "step": 15774 + }, + { + "epoch": 1.0688393522596382, + "grad_norm": 6.431093215942383, + "learning_rate": 7.943185707440619e-05, + "loss": 0.6674, + "step": 15775 + }, + { + "epoch": 1.0689071075276102, + "grad_norm": 5.902375221252441, + "learning_rate": 7.943048805530837e-05, + "loss": 0.7254, + "step": 15776 + }, + { + "epoch": 1.0689748627955824, + "grad_norm": 4.459395408630371, + "learning_rate": 7.942911903621056e-05, + "loss": 0.6371, + "step": 15777 + }, + { + "epoch": 1.0690426180635544, + "grad_norm": 5.590951442718506, + "learning_rate": 7.942775001711274e-05, + "loss": 0.7365, + "step": 15778 + }, + { + "epoch": 1.0691103733315266, + "grad_norm": 9.108878135681152, + "learning_rate": 7.942638099801492e-05, + "loss": 0.8241, + "step": 15779 + }, + { + "epoch": 1.0691781285994986, + "grad_norm": 5.599368572235107, + "learning_rate": 7.94250119789171e-05, + "loss": 0.8881, + "step": 15780 + }, + { + "epoch": 1.0692458838674708, + "grad_norm": 5.293349266052246, + "learning_rate": 7.942364295981929e-05, + "loss": 0.5408, + "step": 15781 + }, + { + "epoch": 1.0693136391354428, + "grad_norm": 6.881068706512451, + "learning_rate": 7.942227394072148e-05, + "loss": 0.8773, + "step": 15782 + }, + { + "epoch": 1.069381394403415, + "grad_norm": 7.475467205047607, + "learning_rate": 7.942090492162366e-05, + "loss": 0.6288, + "step": 15783 + }, + { + "epoch": 1.069449149671387, + "grad_norm": 6.4415669441223145, + "learning_rate": 7.941953590252584e-05, + "loss": 0.7308, + "step": 15784 + }, + { + "epoch": 1.0695169049393591, + "grad_norm": 4.710741996765137, + "learning_rate": 7.941816688342802e-05, + "loss": 0.5095, + "step": 15785 + }, + { + "epoch": 1.0695846602073311, + "grad_norm": 6.763927459716797, + "learning_rate": 7.94167978643302e-05, + "loss": 0.7424, + "step": 15786 + }, + { + "epoch": 1.069652415475303, + "grad_norm": 6.286536693572998, + "learning_rate": 7.94154288452324e-05, + "loss": 0.5971, + "step": 15787 + }, + { + "epoch": 1.0697201707432753, + "grad_norm": 6.880483150482178, + "learning_rate": 7.941405982613457e-05, + "loss": 0.8842, + "step": 15788 + }, + { + "epoch": 1.0697879260112473, + "grad_norm": 5.467273712158203, + "learning_rate": 7.941269080703675e-05, + "loss": 0.7985, + "step": 15789 + }, + { + "epoch": 1.0698556812792195, + "grad_norm": 5.367607593536377, + "learning_rate": 7.941132178793894e-05, + "loss": 0.8427, + "step": 15790 + }, + { + "epoch": 1.0699234365471915, + "grad_norm": 5.593782424926758, + "learning_rate": 7.940995276884113e-05, + "loss": 0.5909, + "step": 15791 + }, + { + "epoch": 1.0699911918151637, + "grad_norm": 4.525633811950684, + "learning_rate": 7.940858374974331e-05, + "loss": 0.5472, + "step": 15792 + }, + { + "epoch": 1.0700589470831356, + "grad_norm": 5.167564868927002, + "learning_rate": 7.940721473064549e-05, + "loss": 0.4875, + "step": 15793 + }, + { + "epoch": 1.0701267023511079, + "grad_norm": 4.89332914352417, + "learning_rate": 7.940584571154768e-05, + "loss": 0.731, + "step": 15794 + }, + { + "epoch": 1.0701944576190798, + "grad_norm": 6.20195198059082, + "learning_rate": 7.940447669244986e-05, + "loss": 0.6374, + "step": 15795 + }, + { + "epoch": 1.070262212887052, + "grad_norm": 4.612509727478027, + "learning_rate": 7.940310767335204e-05, + "loss": 0.5493, + "step": 15796 + }, + { + "epoch": 1.070329968155024, + "grad_norm": 7.004962921142578, + "learning_rate": 7.940173865425424e-05, + "loss": 0.5409, + "step": 15797 + }, + { + "epoch": 1.0703977234229962, + "grad_norm": 5.540370941162109, + "learning_rate": 7.940036963515642e-05, + "loss": 0.6747, + "step": 15798 + }, + { + "epoch": 1.0704654786909682, + "grad_norm": 8.158012390136719, + "learning_rate": 7.93990006160586e-05, + "loss": 0.6241, + "step": 15799 + }, + { + "epoch": 1.0705332339589404, + "grad_norm": 8.461899757385254, + "learning_rate": 7.939763159696079e-05, + "loss": 0.5866, + "step": 15800 + }, + { + "epoch": 1.0706009892269124, + "grad_norm": 6.685081958770752, + "learning_rate": 7.939626257786297e-05, + "loss": 0.7346, + "step": 15801 + }, + { + "epoch": 1.0706687444948844, + "grad_norm": 6.544353485107422, + "learning_rate": 7.939489355876515e-05, + "loss": 0.6462, + "step": 15802 + }, + { + "epoch": 1.0707364997628566, + "grad_norm": 5.48783016204834, + "learning_rate": 7.939352453966733e-05, + "loss": 0.6524, + "step": 15803 + }, + { + "epoch": 1.0708042550308285, + "grad_norm": 6.841857433319092, + "learning_rate": 7.939215552056951e-05, + "loss": 0.6979, + "step": 15804 + }, + { + "epoch": 1.0708720102988007, + "grad_norm": 8.2971773147583, + "learning_rate": 7.939078650147171e-05, + "loss": 0.9404, + "step": 15805 + }, + { + "epoch": 1.0709397655667727, + "grad_norm": 6.574617385864258, + "learning_rate": 7.938941748237389e-05, + "loss": 0.699, + "step": 15806 + }, + { + "epoch": 1.071007520834745, + "grad_norm": 4.680912017822266, + "learning_rate": 7.938804846327607e-05, + "loss": 0.6734, + "step": 15807 + }, + { + "epoch": 1.071075276102717, + "grad_norm": 6.463378429412842, + "learning_rate": 7.938667944417825e-05, + "loss": 0.596, + "step": 15808 + }, + { + "epoch": 1.0711430313706891, + "grad_norm": 5.354882717132568, + "learning_rate": 7.938531042508044e-05, + "loss": 0.6923, + "step": 15809 + }, + { + "epoch": 1.071210786638661, + "grad_norm": 6.985996246337891, + "learning_rate": 7.938394140598262e-05, + "loss": 0.5105, + "step": 15810 + }, + { + "epoch": 1.0712785419066333, + "grad_norm": 7.460660457611084, + "learning_rate": 7.93825723868848e-05, + "loss": 0.8449, + "step": 15811 + }, + { + "epoch": 1.0713462971746053, + "grad_norm": 7.631282806396484, + "learning_rate": 7.938120336778698e-05, + "loss": 0.7648, + "step": 15812 + }, + { + "epoch": 1.0714140524425775, + "grad_norm": 6.466649532318115, + "learning_rate": 7.937983434868916e-05, + "loss": 0.5677, + "step": 15813 + }, + { + "epoch": 1.0714818077105495, + "grad_norm": 4.905895709991455, + "learning_rate": 7.937846532959136e-05, + "loss": 0.7089, + "step": 15814 + }, + { + "epoch": 1.0715495629785217, + "grad_norm": 5.96923828125, + "learning_rate": 7.937709631049354e-05, + "loss": 0.8982, + "step": 15815 + }, + { + "epoch": 1.0716173182464936, + "grad_norm": 5.1013383865356445, + "learning_rate": 7.937572729139572e-05, + "loss": 0.6006, + "step": 15816 + }, + { + "epoch": 1.0716850735144658, + "grad_norm": 5.436969757080078, + "learning_rate": 7.93743582722979e-05, + "loss": 0.6946, + "step": 15817 + }, + { + "epoch": 1.0717528287824378, + "grad_norm": 4.626980304718018, + "learning_rate": 7.937298925320008e-05, + "loss": 0.5443, + "step": 15818 + }, + { + "epoch": 1.07182058405041, + "grad_norm": 4.790196418762207, + "learning_rate": 7.937162023410227e-05, + "loss": 0.6067, + "step": 15819 + }, + { + "epoch": 1.071888339318382, + "grad_norm": 7.302325248718262, + "learning_rate": 7.937025121500445e-05, + "loss": 0.7859, + "step": 15820 + }, + { + "epoch": 1.071956094586354, + "grad_norm": 9.226067543029785, + "learning_rate": 7.936888219590663e-05, + "loss": 0.6142, + "step": 15821 + }, + { + "epoch": 1.0720238498543262, + "grad_norm": 7.070141792297363, + "learning_rate": 7.936751317680881e-05, + "loss": 0.9445, + "step": 15822 + }, + { + "epoch": 1.0720916051222982, + "grad_norm": 5.061405658721924, + "learning_rate": 7.936614415771101e-05, + "loss": 0.5768, + "step": 15823 + }, + { + "epoch": 1.0721593603902704, + "grad_norm": 4.736205101013184, + "learning_rate": 7.936477513861319e-05, + "loss": 0.5623, + "step": 15824 + }, + { + "epoch": 1.0722271156582424, + "grad_norm": 5.544649124145508, + "learning_rate": 7.936340611951537e-05, + "loss": 0.6344, + "step": 15825 + }, + { + "epoch": 1.0722948709262146, + "grad_norm": 5.628857135772705, + "learning_rate": 7.936203710041755e-05, + "loss": 0.7983, + "step": 15826 + }, + { + "epoch": 1.0723626261941865, + "grad_norm": 7.328362464904785, + "learning_rate": 7.936066808131973e-05, + "loss": 0.6597, + "step": 15827 + }, + { + "epoch": 1.0724303814621587, + "grad_norm": 4.993453025817871, + "learning_rate": 7.935929906222192e-05, + "loss": 0.6878, + "step": 15828 + }, + { + "epoch": 1.0724981367301307, + "grad_norm": 4.596197605133057, + "learning_rate": 7.93579300431241e-05, + "loss": 0.7162, + "step": 15829 + }, + { + "epoch": 1.072565891998103, + "grad_norm": 7.738362789154053, + "learning_rate": 7.935656102402628e-05, + "loss": 0.644, + "step": 15830 + }, + { + "epoch": 1.072633647266075, + "grad_norm": 6.615604877471924, + "learning_rate": 7.935519200492846e-05, + "loss": 0.7576, + "step": 15831 + }, + { + "epoch": 1.072701402534047, + "grad_norm": 7.723992347717285, + "learning_rate": 7.935382298583066e-05, + "loss": 0.8502, + "step": 15832 + }, + { + "epoch": 1.072769157802019, + "grad_norm": 5.472813606262207, + "learning_rate": 7.935245396673284e-05, + "loss": 0.6046, + "step": 15833 + }, + { + "epoch": 1.0728369130699913, + "grad_norm": 5.456359386444092, + "learning_rate": 7.935108494763502e-05, + "loss": 0.9344, + "step": 15834 + }, + { + "epoch": 1.0729046683379633, + "grad_norm": 5.542731285095215, + "learning_rate": 7.93497159285372e-05, + "loss": 0.7518, + "step": 15835 + }, + { + "epoch": 1.0729724236059353, + "grad_norm": 5.318366050720215, + "learning_rate": 7.934834690943938e-05, + "loss": 0.6448, + "step": 15836 + }, + { + "epoch": 1.0730401788739075, + "grad_norm": 5.1860432624816895, + "learning_rate": 7.934697789034157e-05, + "loss": 0.8315, + "step": 15837 + }, + { + "epoch": 1.0731079341418794, + "grad_norm": 6.298211097717285, + "learning_rate": 7.934560887124375e-05, + "loss": 0.8517, + "step": 15838 + }, + { + "epoch": 1.0731756894098516, + "grad_norm": 5.515106201171875, + "learning_rate": 7.934423985214593e-05, + "loss": 0.462, + "step": 15839 + }, + { + "epoch": 1.0732434446778236, + "grad_norm": 5.4662394523620605, + "learning_rate": 7.934287083304813e-05, + "loss": 0.7735, + "step": 15840 + }, + { + "epoch": 1.0733111999457958, + "grad_norm": 6.610855579376221, + "learning_rate": 7.934150181395031e-05, + "loss": 0.9312, + "step": 15841 + }, + { + "epoch": 1.0733789552137678, + "grad_norm": 7.660674571990967, + "learning_rate": 7.934013279485249e-05, + "loss": 0.5592, + "step": 15842 + }, + { + "epoch": 1.07344671048174, + "grad_norm": 5.828925609588623, + "learning_rate": 7.933876377575468e-05, + "loss": 0.7, + "step": 15843 + }, + { + "epoch": 1.073514465749712, + "grad_norm": 6.494380474090576, + "learning_rate": 7.933739475665686e-05, + "loss": 0.6033, + "step": 15844 + }, + { + "epoch": 1.0735822210176842, + "grad_norm": 7.698443412780762, + "learning_rate": 7.933602573755904e-05, + "loss": 0.8435, + "step": 15845 + }, + { + "epoch": 1.0736499762856562, + "grad_norm": 5.353089809417725, + "learning_rate": 7.933465671846124e-05, + "loss": 0.9311, + "step": 15846 + }, + { + "epoch": 1.0737177315536284, + "grad_norm": 6.445722579956055, + "learning_rate": 7.933328769936342e-05, + "loss": 0.6781, + "step": 15847 + }, + { + "epoch": 1.0737854868216004, + "grad_norm": 4.490242004394531, + "learning_rate": 7.93319186802656e-05, + "loss": 0.8394, + "step": 15848 + }, + { + "epoch": 1.0738532420895726, + "grad_norm": 4.664844989776611, + "learning_rate": 7.933054966116778e-05, + "loss": 0.7149, + "step": 15849 + }, + { + "epoch": 1.0739209973575445, + "grad_norm": 5.997525691986084, + "learning_rate": 7.932918064206996e-05, + "loss": 0.6835, + "step": 15850 + }, + { + "epoch": 1.0739887526255165, + "grad_norm": 8.030609130859375, + "learning_rate": 7.932781162297215e-05, + "loss": 0.8024, + "step": 15851 + }, + { + "epoch": 1.0740565078934887, + "grad_norm": 7.785398960113525, + "learning_rate": 7.932644260387433e-05, + "loss": 0.7362, + "step": 15852 + }, + { + "epoch": 1.0741242631614607, + "grad_norm": 4.920596122741699, + "learning_rate": 7.932507358477651e-05, + "loss": 0.7132, + "step": 15853 + }, + { + "epoch": 1.074192018429433, + "grad_norm": 4.117366313934326, + "learning_rate": 7.93237045656787e-05, + "loss": 0.5307, + "step": 15854 + }, + { + "epoch": 1.0742597736974049, + "grad_norm": 5.135700702667236, + "learning_rate": 7.932233554658089e-05, + "loss": 0.6229, + "step": 15855 + }, + { + "epoch": 1.074327528965377, + "grad_norm": 6.207793712615967, + "learning_rate": 7.932096652748307e-05, + "loss": 0.5498, + "step": 15856 + }, + { + "epoch": 1.074395284233349, + "grad_norm": 6.431981563568115, + "learning_rate": 7.931959750838525e-05, + "loss": 0.6798, + "step": 15857 + }, + { + "epoch": 1.0744630395013213, + "grad_norm": 5.320846080780029, + "learning_rate": 7.931822848928743e-05, + "loss": 0.6198, + "step": 15858 + }, + { + "epoch": 1.0745307947692933, + "grad_norm": 5.788670539855957, + "learning_rate": 7.931685947018961e-05, + "loss": 0.8206, + "step": 15859 + }, + { + "epoch": 1.0745985500372655, + "grad_norm": 7.664066314697266, + "learning_rate": 7.93154904510918e-05, + "loss": 0.6584, + "step": 15860 + }, + { + "epoch": 1.0746663053052374, + "grad_norm": 6.991944313049316, + "learning_rate": 7.931412143199398e-05, + "loss": 0.9677, + "step": 15861 + }, + { + "epoch": 1.0747340605732096, + "grad_norm": 5.329096794128418, + "learning_rate": 7.931275241289616e-05, + "loss": 0.6164, + "step": 15862 + }, + { + "epoch": 1.0748018158411816, + "grad_norm": 6.307762145996094, + "learning_rate": 7.931138339379834e-05, + "loss": 0.8304, + "step": 15863 + }, + { + "epoch": 1.0748695711091538, + "grad_norm": 5.538243770599365, + "learning_rate": 7.931001437470054e-05, + "loss": 0.7314, + "step": 15864 + }, + { + "epoch": 1.0749373263771258, + "grad_norm": 6.261715888977051, + "learning_rate": 7.930864535560272e-05, + "loss": 0.8585, + "step": 15865 + }, + { + "epoch": 1.075005081645098, + "grad_norm": 5.783536434173584, + "learning_rate": 7.93072763365049e-05, + "loss": 0.6301, + "step": 15866 + }, + { + "epoch": 1.07507283691307, + "grad_norm": 5.780196189880371, + "learning_rate": 7.930590731740708e-05, + "loss": 0.6306, + "step": 15867 + }, + { + "epoch": 1.0751405921810422, + "grad_norm": 5.178442478179932, + "learning_rate": 7.930453829830926e-05, + "loss": 0.4359, + "step": 15868 + }, + { + "epoch": 1.0752083474490142, + "grad_norm": 7.060993194580078, + "learning_rate": 7.930316927921145e-05, + "loss": 0.5727, + "step": 15869 + }, + { + "epoch": 1.0752761027169861, + "grad_norm": 5.602401256561279, + "learning_rate": 7.930180026011363e-05, + "loss": 0.5486, + "step": 15870 + }, + { + "epoch": 1.0753438579849584, + "grad_norm": 6.208281993865967, + "learning_rate": 7.930043124101581e-05, + "loss": 0.7401, + "step": 15871 + }, + { + "epoch": 1.0754116132529303, + "grad_norm": 10.482538223266602, + "learning_rate": 7.9299062221918e-05, + "loss": 0.8773, + "step": 15872 + }, + { + "epoch": 1.0754793685209025, + "grad_norm": 5.301812171936035, + "learning_rate": 7.929769320282017e-05, + "loss": 0.6965, + "step": 15873 + }, + { + "epoch": 1.0755471237888745, + "grad_norm": 10.284536361694336, + "learning_rate": 7.929632418372237e-05, + "loss": 0.7072, + "step": 15874 + }, + { + "epoch": 1.0756148790568467, + "grad_norm": 5.5219550132751465, + "learning_rate": 7.929495516462455e-05, + "loss": 0.6087, + "step": 15875 + }, + { + "epoch": 1.0756826343248187, + "grad_norm": 5.441367149353027, + "learning_rate": 7.929358614552673e-05, + "loss": 0.7554, + "step": 15876 + }, + { + "epoch": 1.075750389592791, + "grad_norm": 5.82017707824707, + "learning_rate": 7.929221712642891e-05, + "loss": 0.6087, + "step": 15877 + }, + { + "epoch": 1.0758181448607629, + "grad_norm": 5.13921594619751, + "learning_rate": 7.92908481073311e-05, + "loss": 0.791, + "step": 15878 + }, + { + "epoch": 1.075885900128735, + "grad_norm": 5.5391926765441895, + "learning_rate": 7.928947908823328e-05, + "loss": 0.6649, + "step": 15879 + }, + { + "epoch": 1.075953655396707, + "grad_norm": 6.536075115203857, + "learning_rate": 7.928811006913546e-05, + "loss": 0.6299, + "step": 15880 + }, + { + "epoch": 1.0760214106646793, + "grad_norm": 5.445075511932373, + "learning_rate": 7.928674105003764e-05, + "loss": 0.7024, + "step": 15881 + }, + { + "epoch": 1.0760891659326512, + "grad_norm": 5.205681800842285, + "learning_rate": 7.928537203093982e-05, + "loss": 0.4982, + "step": 15882 + }, + { + "epoch": 1.0761569212006235, + "grad_norm": 4.551524639129639, + "learning_rate": 7.928400301184202e-05, + "loss": 0.643, + "step": 15883 + }, + { + "epoch": 1.0762246764685954, + "grad_norm": 5.783960819244385, + "learning_rate": 7.92826339927442e-05, + "loss": 0.6593, + "step": 15884 + }, + { + "epoch": 1.0762924317365674, + "grad_norm": 5.964574337005615, + "learning_rate": 7.928126497364638e-05, + "loss": 0.6811, + "step": 15885 + }, + { + "epoch": 1.0763601870045396, + "grad_norm": 7.782623767852783, + "learning_rate": 7.927989595454856e-05, + "loss": 0.7981, + "step": 15886 + }, + { + "epoch": 1.0764279422725116, + "grad_norm": 6.336511135101318, + "learning_rate": 7.927852693545075e-05, + "loss": 0.9382, + "step": 15887 + }, + { + "epoch": 1.0764956975404838, + "grad_norm": 4.654166221618652, + "learning_rate": 7.927715791635293e-05, + "loss": 0.5157, + "step": 15888 + }, + { + "epoch": 1.0765634528084558, + "grad_norm": 4.949594974517822, + "learning_rate": 7.927578889725511e-05, + "loss": 0.662, + "step": 15889 + }, + { + "epoch": 1.076631208076428, + "grad_norm": 6.451333999633789, + "learning_rate": 7.927441987815731e-05, + "loss": 0.7434, + "step": 15890 + }, + { + "epoch": 1.0766989633444, + "grad_norm": 6.374203681945801, + "learning_rate": 7.927305085905949e-05, + "loss": 0.7248, + "step": 15891 + }, + { + "epoch": 1.0767667186123722, + "grad_norm": 8.0396089553833, + "learning_rate": 7.927168183996167e-05, + "loss": 0.589, + "step": 15892 + }, + { + "epoch": 1.0768344738803441, + "grad_norm": 9.502918243408203, + "learning_rate": 7.927031282086386e-05, + "loss": 0.9467, + "step": 15893 + }, + { + "epoch": 1.0769022291483163, + "grad_norm": 5.921857833862305, + "learning_rate": 7.926894380176604e-05, + "loss": 0.8063, + "step": 15894 + }, + { + "epoch": 1.0769699844162883, + "grad_norm": 6.634448051452637, + "learning_rate": 7.926757478266822e-05, + "loss": 0.8519, + "step": 15895 + }, + { + "epoch": 1.0770377396842605, + "grad_norm": 4.550981044769287, + "learning_rate": 7.92662057635704e-05, + "loss": 0.6937, + "step": 15896 + }, + { + "epoch": 1.0771054949522325, + "grad_norm": 6.369821071624756, + "learning_rate": 7.92648367444726e-05, + "loss": 0.7513, + "step": 15897 + }, + { + "epoch": 1.0771732502202047, + "grad_norm": 6.474839210510254, + "learning_rate": 7.926346772537478e-05, + "loss": 0.8707, + "step": 15898 + }, + { + "epoch": 1.0772410054881767, + "grad_norm": 5.136695861816406, + "learning_rate": 7.926209870627696e-05, + "loss": 0.5493, + "step": 15899 + }, + { + "epoch": 1.0773087607561487, + "grad_norm": 5.588931560516357, + "learning_rate": 7.926072968717914e-05, + "loss": 0.7134, + "step": 15900 + }, + { + "epoch": 1.0773765160241209, + "grad_norm": 6.223124980926514, + "learning_rate": 7.925936066808133e-05, + "loss": 0.7647, + "step": 15901 + }, + { + "epoch": 1.0774442712920929, + "grad_norm": 4.788752555847168, + "learning_rate": 7.925799164898351e-05, + "loss": 0.439, + "step": 15902 + }, + { + "epoch": 1.077512026560065, + "grad_norm": 5.260895252227783, + "learning_rate": 7.92566226298857e-05, + "loss": 0.6204, + "step": 15903 + }, + { + "epoch": 1.077579781828037, + "grad_norm": 5.823332786560059, + "learning_rate": 7.925525361078787e-05, + "loss": 0.7579, + "step": 15904 + }, + { + "epoch": 1.0776475370960092, + "grad_norm": 5.55746603012085, + "learning_rate": 7.925388459169005e-05, + "loss": 0.6343, + "step": 15905 + }, + { + "epoch": 1.0777152923639812, + "grad_norm": 5.603518962860107, + "learning_rate": 7.925251557259225e-05, + "loss": 0.7849, + "step": 15906 + }, + { + "epoch": 1.0777830476319534, + "grad_norm": 5.885906219482422, + "learning_rate": 7.925114655349443e-05, + "loss": 0.713, + "step": 15907 + }, + { + "epoch": 1.0778508028999254, + "grad_norm": 9.000563621520996, + "learning_rate": 7.924977753439661e-05, + "loss": 0.8531, + "step": 15908 + }, + { + "epoch": 1.0779185581678976, + "grad_norm": 5.798495769500732, + "learning_rate": 7.924840851529879e-05, + "loss": 0.7047, + "step": 15909 + }, + { + "epoch": 1.0779863134358696, + "grad_norm": 5.180234432220459, + "learning_rate": 7.924703949620098e-05, + "loss": 0.4218, + "step": 15910 + }, + { + "epoch": 1.0780540687038418, + "grad_norm": 5.090123653411865, + "learning_rate": 7.924567047710316e-05, + "loss": 0.7473, + "step": 15911 + }, + { + "epoch": 1.0781218239718138, + "grad_norm": 4.858051300048828, + "learning_rate": 7.924430145800534e-05, + "loss": 0.5706, + "step": 15912 + }, + { + "epoch": 1.078189579239786, + "grad_norm": 5.382134914398193, + "learning_rate": 7.924293243890752e-05, + "loss": 0.7633, + "step": 15913 + }, + { + "epoch": 1.078257334507758, + "grad_norm": 6.212465763092041, + "learning_rate": 7.92415634198097e-05, + "loss": 0.6788, + "step": 15914 + }, + { + "epoch": 1.0783250897757302, + "grad_norm": 6.3222880363464355, + "learning_rate": 7.92401944007119e-05, + "loss": 0.7848, + "step": 15915 + }, + { + "epoch": 1.0783928450437021, + "grad_norm": 6.96838903427124, + "learning_rate": 7.923882538161408e-05, + "loss": 0.8652, + "step": 15916 + }, + { + "epoch": 1.0784606003116743, + "grad_norm": 4.135616302490234, + "learning_rate": 7.923745636251626e-05, + "loss": 0.4712, + "step": 15917 + }, + { + "epoch": 1.0785283555796463, + "grad_norm": 6.912881374359131, + "learning_rate": 7.923608734341844e-05, + "loss": 0.7325, + "step": 15918 + }, + { + "epoch": 1.0785961108476183, + "grad_norm": 5.951952934265137, + "learning_rate": 7.923471832432062e-05, + "loss": 0.645, + "step": 15919 + }, + { + "epoch": 1.0786638661155905, + "grad_norm": 5.720697402954102, + "learning_rate": 7.923334930522281e-05, + "loss": 0.751, + "step": 15920 + }, + { + "epoch": 1.0787316213835625, + "grad_norm": 6.125636577606201, + "learning_rate": 7.9231980286125e-05, + "loss": 0.7128, + "step": 15921 + }, + { + "epoch": 1.0787993766515347, + "grad_norm": 7.798324108123779, + "learning_rate": 7.923061126702717e-05, + "loss": 1.0655, + "step": 15922 + }, + { + "epoch": 1.0788671319195067, + "grad_norm": 7.423648834228516, + "learning_rate": 7.922924224792935e-05, + "loss": 0.8611, + "step": 15923 + }, + { + "epoch": 1.0789348871874789, + "grad_norm": 4.67471170425415, + "learning_rate": 7.922787322883155e-05, + "loss": 0.4459, + "step": 15924 + }, + { + "epoch": 1.0790026424554509, + "grad_norm": 6.07832670211792, + "learning_rate": 7.922650420973373e-05, + "loss": 0.706, + "step": 15925 + }, + { + "epoch": 1.079070397723423, + "grad_norm": 6.1671624183654785, + "learning_rate": 7.922513519063591e-05, + "loss": 0.8027, + "step": 15926 + }, + { + "epoch": 1.079138152991395, + "grad_norm": 6.181910991668701, + "learning_rate": 7.922376617153809e-05, + "loss": 0.726, + "step": 15927 + }, + { + "epoch": 1.0792059082593672, + "grad_norm": 6.463537693023682, + "learning_rate": 7.922239715244027e-05, + "loss": 0.6578, + "step": 15928 + }, + { + "epoch": 1.0792736635273392, + "grad_norm": 6.259862422943115, + "learning_rate": 7.922102813334246e-05, + "loss": 0.6437, + "step": 15929 + }, + { + "epoch": 1.0793414187953114, + "grad_norm": 6.758423805236816, + "learning_rate": 7.921965911424464e-05, + "loss": 0.6028, + "step": 15930 + }, + { + "epoch": 1.0794091740632834, + "grad_norm": 5.508148193359375, + "learning_rate": 7.921829009514682e-05, + "loss": 0.6397, + "step": 15931 + }, + { + "epoch": 1.0794769293312556, + "grad_norm": 4.803408145904541, + "learning_rate": 7.9216921076049e-05, + "loss": 0.8471, + "step": 15932 + }, + { + "epoch": 1.0795446845992276, + "grad_norm": 5.470789432525635, + "learning_rate": 7.92155520569512e-05, + "loss": 0.9083, + "step": 15933 + }, + { + "epoch": 1.0796124398671996, + "grad_norm": 5.594363212585449, + "learning_rate": 7.921418303785338e-05, + "loss": 0.6955, + "step": 15934 + }, + { + "epoch": 1.0796801951351718, + "grad_norm": 6.420119285583496, + "learning_rate": 7.921281401875556e-05, + "loss": 0.8706, + "step": 15935 + }, + { + "epoch": 1.0797479504031438, + "grad_norm": 5.343960285186768, + "learning_rate": 7.921144499965775e-05, + "loss": 0.8528, + "step": 15936 + }, + { + "epoch": 1.079815705671116, + "grad_norm": 6.668441295623779, + "learning_rate": 7.921007598055993e-05, + "loss": 0.82, + "step": 15937 + }, + { + "epoch": 1.079883460939088, + "grad_norm": 7.410157203674316, + "learning_rate": 7.920870696146211e-05, + "loss": 0.7367, + "step": 15938 + }, + { + "epoch": 1.0799512162070601, + "grad_norm": 6.100698471069336, + "learning_rate": 7.920733794236431e-05, + "loss": 0.862, + "step": 15939 + }, + { + "epoch": 1.0800189714750321, + "grad_norm": 6.438335418701172, + "learning_rate": 7.920596892326649e-05, + "loss": 0.7477, + "step": 15940 + }, + { + "epoch": 1.0800867267430043, + "grad_norm": 6.367231845855713, + "learning_rate": 7.920459990416867e-05, + "loss": 0.6993, + "step": 15941 + }, + { + "epoch": 1.0801544820109763, + "grad_norm": 5.784175395965576, + "learning_rate": 7.920323088507086e-05, + "loss": 0.6835, + "step": 15942 + }, + { + "epoch": 1.0802222372789485, + "grad_norm": 5.12135648727417, + "learning_rate": 7.920186186597304e-05, + "loss": 0.7518, + "step": 15943 + }, + { + "epoch": 1.0802899925469205, + "grad_norm": 5.591551780700684, + "learning_rate": 7.920049284687522e-05, + "loss": 0.5578, + "step": 15944 + }, + { + "epoch": 1.0803577478148927, + "grad_norm": 3.941267728805542, + "learning_rate": 7.91991238277774e-05, + "loss": 0.6835, + "step": 15945 + }, + { + "epoch": 1.0804255030828647, + "grad_norm": 4.970183849334717, + "learning_rate": 7.919775480867958e-05, + "loss": 0.6148, + "step": 15946 + }, + { + "epoch": 1.0804932583508369, + "grad_norm": 5.3328938484191895, + "learning_rate": 7.919638578958178e-05, + "loss": 0.5961, + "step": 15947 + }, + { + "epoch": 1.0805610136188089, + "grad_norm": 5.661096096038818, + "learning_rate": 7.919501677048396e-05, + "loss": 0.5808, + "step": 15948 + }, + { + "epoch": 1.0806287688867808, + "grad_norm": 4.3002824783325195, + "learning_rate": 7.919364775138614e-05, + "loss": 0.6531, + "step": 15949 + }, + { + "epoch": 1.080696524154753, + "grad_norm": 5.652578830718994, + "learning_rate": 7.919227873228832e-05, + "loss": 0.8663, + "step": 15950 + }, + { + "epoch": 1.080764279422725, + "grad_norm": 5.4440717697143555, + "learning_rate": 7.91909097131905e-05, + "loss": 0.8888, + "step": 15951 + }, + { + "epoch": 1.0808320346906972, + "grad_norm": 5.150991439819336, + "learning_rate": 7.918954069409269e-05, + "loss": 0.6697, + "step": 15952 + }, + { + "epoch": 1.0808997899586692, + "grad_norm": 5.136131286621094, + "learning_rate": 7.918817167499487e-05, + "loss": 0.6577, + "step": 15953 + }, + { + "epoch": 1.0809675452266414, + "grad_norm": 6.44126033782959, + "learning_rate": 7.918680265589705e-05, + "loss": 0.6284, + "step": 15954 + }, + { + "epoch": 1.0810353004946134, + "grad_norm": 8.204652786254883, + "learning_rate": 7.918543363679923e-05, + "loss": 0.6879, + "step": 15955 + }, + { + "epoch": 1.0811030557625856, + "grad_norm": 5.8660407066345215, + "learning_rate": 7.918406461770143e-05, + "loss": 0.5727, + "step": 15956 + }, + { + "epoch": 1.0811708110305576, + "grad_norm": 6.534690856933594, + "learning_rate": 7.918269559860361e-05, + "loss": 0.697, + "step": 15957 + }, + { + "epoch": 1.0812385662985298, + "grad_norm": 8.585152626037598, + "learning_rate": 7.918132657950579e-05, + "loss": 0.8429, + "step": 15958 + }, + { + "epoch": 1.0813063215665017, + "grad_norm": 7.084420204162598, + "learning_rate": 7.917995756040797e-05, + "loss": 0.6254, + "step": 15959 + }, + { + "epoch": 1.081374076834474, + "grad_norm": 5.689300060272217, + "learning_rate": 7.917858854131015e-05, + "loss": 0.5956, + "step": 15960 + }, + { + "epoch": 1.081441832102446, + "grad_norm": 6.1884331703186035, + "learning_rate": 7.917721952221234e-05, + "loss": 0.7677, + "step": 15961 + }, + { + "epoch": 1.0815095873704181, + "grad_norm": 10.952715873718262, + "learning_rate": 7.917585050311452e-05, + "loss": 0.783, + "step": 15962 + }, + { + "epoch": 1.0815773426383901, + "grad_norm": 8.16262149810791, + "learning_rate": 7.91744814840167e-05, + "loss": 0.548, + "step": 15963 + }, + { + "epoch": 1.0816450979063623, + "grad_norm": 5.6903157234191895, + "learning_rate": 7.917311246491888e-05, + "loss": 0.5904, + "step": 15964 + }, + { + "epoch": 1.0817128531743343, + "grad_norm": 8.33536148071289, + "learning_rate": 7.917174344582108e-05, + "loss": 0.7886, + "step": 15965 + }, + { + "epoch": 1.0817806084423065, + "grad_norm": 9.231719017028809, + "learning_rate": 7.917037442672326e-05, + "loss": 0.7876, + "step": 15966 + }, + { + "epoch": 1.0818483637102785, + "grad_norm": 5.142817497253418, + "learning_rate": 7.916900540762544e-05, + "loss": 0.6525, + "step": 15967 + }, + { + "epoch": 1.0819161189782505, + "grad_norm": 8.006011962890625, + "learning_rate": 7.916763638852762e-05, + "loss": 0.6731, + "step": 15968 + }, + { + "epoch": 1.0819838742462227, + "grad_norm": 4.934510707855225, + "learning_rate": 7.91662673694298e-05, + "loss": 0.6913, + "step": 15969 + }, + { + "epoch": 1.0820516295141946, + "grad_norm": 4.291750907897949, + "learning_rate": 7.916489835033199e-05, + "loss": 0.5819, + "step": 15970 + }, + { + "epoch": 1.0821193847821668, + "grad_norm": 6.167888164520264, + "learning_rate": 7.916352933123417e-05, + "loss": 0.5861, + "step": 15971 + }, + { + "epoch": 1.0821871400501388, + "grad_norm": 6.8970465660095215, + "learning_rate": 7.916216031213635e-05, + "loss": 0.4904, + "step": 15972 + }, + { + "epoch": 1.082254895318111, + "grad_norm": 6.821037292480469, + "learning_rate": 7.916079129303853e-05, + "loss": 0.7735, + "step": 15973 + }, + { + "epoch": 1.082322650586083, + "grad_norm": 6.066835403442383, + "learning_rate": 7.915942227394071e-05, + "loss": 0.5559, + "step": 15974 + }, + { + "epoch": 1.0823904058540552, + "grad_norm": 5.0264716148376465, + "learning_rate": 7.915805325484291e-05, + "loss": 0.8914, + "step": 15975 + }, + { + "epoch": 1.0824581611220272, + "grad_norm": 6.044068336486816, + "learning_rate": 7.915668423574509e-05, + "loss": 0.5114, + "step": 15976 + }, + { + "epoch": 1.0825259163899994, + "grad_norm": 5.62042236328125, + "learning_rate": 7.915531521664727e-05, + "loss": 0.6438, + "step": 15977 + }, + { + "epoch": 1.0825936716579714, + "grad_norm": 5.779903411865234, + "learning_rate": 7.915394619754945e-05, + "loss": 0.5855, + "step": 15978 + }, + { + "epoch": 1.0826614269259436, + "grad_norm": 7.046977996826172, + "learning_rate": 7.915257717845164e-05, + "loss": 0.7729, + "step": 15979 + }, + { + "epoch": 1.0827291821939156, + "grad_norm": 10.05582332611084, + "learning_rate": 7.915120815935382e-05, + "loss": 0.7776, + "step": 15980 + }, + { + "epoch": 1.0827969374618878, + "grad_norm": 6.933775901794434, + "learning_rate": 7.9149839140256e-05, + "loss": 0.7646, + "step": 15981 + }, + { + "epoch": 1.0828646927298597, + "grad_norm": 5.1650543212890625, + "learning_rate": 7.91484701211582e-05, + "loss": 0.5793, + "step": 15982 + }, + { + "epoch": 1.0829324479978317, + "grad_norm": 6.840242385864258, + "learning_rate": 7.914710110206038e-05, + "loss": 0.7227, + "step": 15983 + }, + { + "epoch": 1.083000203265804, + "grad_norm": 4.679685592651367, + "learning_rate": 7.914573208296256e-05, + "loss": 0.6102, + "step": 15984 + }, + { + "epoch": 1.083067958533776, + "grad_norm": 6.447381496429443, + "learning_rate": 7.914436306386475e-05, + "loss": 0.9207, + "step": 15985 + }, + { + "epoch": 1.0831357138017481, + "grad_norm": 7.533105850219727, + "learning_rate": 7.914299404476693e-05, + "loss": 0.7438, + "step": 15986 + }, + { + "epoch": 1.08320346906972, + "grad_norm": 4.451360702514648, + "learning_rate": 7.914162502566911e-05, + "loss": 0.5668, + "step": 15987 + }, + { + "epoch": 1.0832712243376923, + "grad_norm": 7.082450866699219, + "learning_rate": 7.914025600657131e-05, + "loss": 0.6538, + "step": 15988 + }, + { + "epoch": 1.0833389796056643, + "grad_norm": 5.413089275360107, + "learning_rate": 7.913888698747349e-05, + "loss": 0.7967, + "step": 15989 + }, + { + "epoch": 1.0834067348736365, + "grad_norm": 5.9802985191345215, + "learning_rate": 7.913751796837567e-05, + "loss": 0.7933, + "step": 15990 + }, + { + "epoch": 1.0834744901416085, + "grad_norm": 6.097925662994385, + "learning_rate": 7.913614894927785e-05, + "loss": 0.602, + "step": 15991 + }, + { + "epoch": 1.0835422454095807, + "grad_norm": 5.050368309020996, + "learning_rate": 7.913477993018003e-05, + "loss": 0.5664, + "step": 15992 + }, + { + "epoch": 1.0836100006775526, + "grad_norm": 4.976052761077881, + "learning_rate": 7.913341091108222e-05, + "loss": 0.6543, + "step": 15993 + }, + { + "epoch": 1.0836777559455248, + "grad_norm": 6.06764554977417, + "learning_rate": 7.91320418919844e-05, + "loss": 0.6747, + "step": 15994 + }, + { + "epoch": 1.0837455112134968, + "grad_norm": 8.071044921875, + "learning_rate": 7.913067287288658e-05, + "loss": 0.6247, + "step": 15995 + }, + { + "epoch": 1.083813266481469, + "grad_norm": 5.97242546081543, + "learning_rate": 7.912930385378876e-05, + "loss": 0.6158, + "step": 15996 + }, + { + "epoch": 1.083881021749441, + "grad_norm": 5.717789173126221, + "learning_rate": 7.912793483469096e-05, + "loss": 0.7819, + "step": 15997 + }, + { + "epoch": 1.083948777017413, + "grad_norm": 6.379135608673096, + "learning_rate": 7.912656581559314e-05, + "loss": 0.8008, + "step": 15998 + }, + { + "epoch": 1.0840165322853852, + "grad_norm": 5.400828838348389, + "learning_rate": 7.912519679649532e-05, + "loss": 0.6621, + "step": 15999 + }, + { + "epoch": 1.0840842875533572, + "grad_norm": 7.098434925079346, + "learning_rate": 7.91238277773975e-05, + "loss": 0.8617, + "step": 16000 + }, + { + "epoch": 1.0841520428213294, + "grad_norm": 5.690536975860596, + "learning_rate": 7.912245875829968e-05, + "loss": 0.7622, + "step": 16001 + }, + { + "epoch": 1.0842197980893014, + "grad_norm": 5.485647201538086, + "learning_rate": 7.912108973920187e-05, + "loss": 0.6856, + "step": 16002 + }, + { + "epoch": 1.0842875533572736, + "grad_norm": 7.707883358001709, + "learning_rate": 7.911972072010405e-05, + "loss": 0.7483, + "step": 16003 + }, + { + "epoch": 1.0843553086252455, + "grad_norm": 5.487302303314209, + "learning_rate": 7.911835170100623e-05, + "loss": 0.6216, + "step": 16004 + }, + { + "epoch": 1.0844230638932177, + "grad_norm": 5.43400239944458, + "learning_rate": 7.911698268190841e-05, + "loss": 0.6424, + "step": 16005 + }, + { + "epoch": 1.0844908191611897, + "grad_norm": 5.037356853485107, + "learning_rate": 7.91156136628106e-05, + "loss": 0.5823, + "step": 16006 + }, + { + "epoch": 1.084558574429162, + "grad_norm": 6.4713215827941895, + "learning_rate": 7.911424464371279e-05, + "loss": 0.6753, + "step": 16007 + }, + { + "epoch": 1.084626329697134, + "grad_norm": 5.2635393142700195, + "learning_rate": 7.911287562461497e-05, + "loss": 0.8342, + "step": 16008 + }, + { + "epoch": 1.084694084965106, + "grad_norm": 5.816706657409668, + "learning_rate": 7.911150660551715e-05, + "loss": 0.8517, + "step": 16009 + }, + { + "epoch": 1.084761840233078, + "grad_norm": 5.9537458419799805, + "learning_rate": 7.911013758641933e-05, + "loss": 0.6601, + "step": 16010 + }, + { + "epoch": 1.0848295955010503, + "grad_norm": 4.499695301055908, + "learning_rate": 7.910876856732152e-05, + "loss": 0.5232, + "step": 16011 + }, + { + "epoch": 1.0848973507690223, + "grad_norm": 5.2069091796875, + "learning_rate": 7.91073995482237e-05, + "loss": 0.5803, + "step": 16012 + }, + { + "epoch": 1.0849651060369945, + "grad_norm": 4.306644916534424, + "learning_rate": 7.910603052912588e-05, + "loss": 0.6398, + "step": 16013 + }, + { + "epoch": 1.0850328613049665, + "grad_norm": 5.758792400360107, + "learning_rate": 7.910466151002806e-05, + "loss": 0.7415, + "step": 16014 + }, + { + "epoch": 1.0851006165729387, + "grad_norm": 6.12398099899292, + "learning_rate": 7.910329249093024e-05, + "loss": 0.7178, + "step": 16015 + }, + { + "epoch": 1.0851683718409106, + "grad_norm": 7.941034317016602, + "learning_rate": 7.910192347183244e-05, + "loss": 0.6802, + "step": 16016 + }, + { + "epoch": 1.0852361271088826, + "grad_norm": 5.2251176834106445, + "learning_rate": 7.910055445273462e-05, + "loss": 0.6264, + "step": 16017 + }, + { + "epoch": 1.0853038823768548, + "grad_norm": 5.844374179840088, + "learning_rate": 7.90991854336368e-05, + "loss": 0.6761, + "step": 16018 + }, + { + "epoch": 1.0853716376448268, + "grad_norm": 4.731149196624756, + "learning_rate": 7.909781641453898e-05, + "loss": 0.7042, + "step": 16019 + }, + { + "epoch": 1.085439392912799, + "grad_norm": 5.9580206871032715, + "learning_rate": 7.909644739544117e-05, + "loss": 0.6448, + "step": 16020 + }, + { + "epoch": 1.085507148180771, + "grad_norm": 6.342482089996338, + "learning_rate": 7.909507837634335e-05, + "loss": 0.6347, + "step": 16021 + }, + { + "epoch": 1.0855749034487432, + "grad_norm": 5.999290943145752, + "learning_rate": 7.909370935724553e-05, + "loss": 0.7697, + "step": 16022 + }, + { + "epoch": 1.0856426587167152, + "grad_norm": 6.850386142730713, + "learning_rate": 7.909234033814771e-05, + "loss": 0.6584, + "step": 16023 + }, + { + "epoch": 1.0857104139846874, + "grad_norm": 6.658447265625, + "learning_rate": 7.90909713190499e-05, + "loss": 0.7402, + "step": 16024 + }, + { + "epoch": 1.0857781692526594, + "grad_norm": 5.117592811584473, + "learning_rate": 7.908960229995209e-05, + "loss": 0.7171, + "step": 16025 + }, + { + "epoch": 1.0858459245206316, + "grad_norm": 4.4139909744262695, + "learning_rate": 7.908823328085427e-05, + "loss": 0.5602, + "step": 16026 + }, + { + "epoch": 1.0859136797886035, + "grad_norm": 8.113272666931152, + "learning_rate": 7.908686426175645e-05, + "loss": 0.5116, + "step": 16027 + }, + { + "epoch": 1.0859814350565757, + "grad_norm": 6.033690452575684, + "learning_rate": 7.908549524265864e-05, + "loss": 0.6415, + "step": 16028 + }, + { + "epoch": 1.0860491903245477, + "grad_norm": 5.521602630615234, + "learning_rate": 7.908412622356082e-05, + "loss": 0.7071, + "step": 16029 + }, + { + "epoch": 1.08611694559252, + "grad_norm": 9.671419143676758, + "learning_rate": 7.9082757204463e-05, + "loss": 0.746, + "step": 16030 + }, + { + "epoch": 1.086184700860492, + "grad_norm": 12.12438678741455, + "learning_rate": 7.90813881853652e-05, + "loss": 0.5135, + "step": 16031 + }, + { + "epoch": 1.0862524561284639, + "grad_norm": 6.55898904800415, + "learning_rate": 7.908001916626738e-05, + "loss": 0.6438, + "step": 16032 + }, + { + "epoch": 1.086320211396436, + "grad_norm": 6.704573154449463, + "learning_rate": 7.907865014716956e-05, + "loss": 0.5559, + "step": 16033 + }, + { + "epoch": 1.086387966664408, + "grad_norm": 6.356927871704102, + "learning_rate": 7.907728112807175e-05, + "loss": 0.7004, + "step": 16034 + }, + { + "epoch": 1.0864557219323803, + "grad_norm": 6.504284381866455, + "learning_rate": 7.907591210897393e-05, + "loss": 0.7731, + "step": 16035 + }, + { + "epoch": 1.0865234772003523, + "grad_norm": 4.6852874755859375, + "learning_rate": 7.907454308987611e-05, + "loss": 0.5772, + "step": 16036 + }, + { + "epoch": 1.0865912324683245, + "grad_norm": 5.674740791320801, + "learning_rate": 7.907317407077829e-05, + "loss": 0.6515, + "step": 16037 + }, + { + "epoch": 1.0866589877362964, + "grad_norm": 11.06320571899414, + "learning_rate": 7.907180505168047e-05, + "loss": 0.7314, + "step": 16038 + }, + { + "epoch": 1.0867267430042686, + "grad_norm": 7.077748775482178, + "learning_rate": 7.907043603258267e-05, + "loss": 0.7779, + "step": 16039 + }, + { + "epoch": 1.0867944982722406, + "grad_norm": 4.752398490905762, + "learning_rate": 7.906906701348485e-05, + "loss": 0.6754, + "step": 16040 + }, + { + "epoch": 1.0868622535402128, + "grad_norm": 6.530601978302002, + "learning_rate": 7.906769799438703e-05, + "loss": 0.7912, + "step": 16041 + }, + { + "epoch": 1.0869300088081848, + "grad_norm": 7.028359889984131, + "learning_rate": 7.906632897528921e-05, + "loss": 0.7639, + "step": 16042 + }, + { + "epoch": 1.086997764076157, + "grad_norm": 6.825929164886475, + "learning_rate": 7.90649599561914e-05, + "loss": 0.5188, + "step": 16043 + }, + { + "epoch": 1.087065519344129, + "grad_norm": 5.377181053161621, + "learning_rate": 7.906359093709358e-05, + "loss": 0.7833, + "step": 16044 + }, + { + "epoch": 1.0871332746121012, + "grad_norm": 5.946323394775391, + "learning_rate": 7.906222191799576e-05, + "loss": 0.9365, + "step": 16045 + }, + { + "epoch": 1.0872010298800732, + "grad_norm": 6.080692768096924, + "learning_rate": 7.906085289889794e-05, + "loss": 0.8694, + "step": 16046 + }, + { + "epoch": 1.0872687851480451, + "grad_norm": 5.918422698974609, + "learning_rate": 7.905948387980012e-05, + "loss": 0.8249, + "step": 16047 + }, + { + "epoch": 1.0873365404160173, + "grad_norm": 8.938027381896973, + "learning_rate": 7.905811486070232e-05, + "loss": 0.6464, + "step": 16048 + }, + { + "epoch": 1.0874042956839893, + "grad_norm": 5.869150638580322, + "learning_rate": 7.90567458416045e-05, + "loss": 0.849, + "step": 16049 + }, + { + "epoch": 1.0874720509519615, + "grad_norm": 6.127419948577881, + "learning_rate": 7.905537682250668e-05, + "loss": 0.5813, + "step": 16050 + }, + { + "epoch": 1.0875398062199335, + "grad_norm": 5.159786701202393, + "learning_rate": 7.905400780340886e-05, + "loss": 0.5756, + "step": 16051 + }, + { + "epoch": 1.0876075614879057, + "grad_norm": 6.768173694610596, + "learning_rate": 7.905263878431104e-05, + "loss": 0.9234, + "step": 16052 + }, + { + "epoch": 1.0876753167558777, + "grad_norm": 5.480606555938721, + "learning_rate": 7.905126976521323e-05, + "loss": 0.6028, + "step": 16053 + }, + { + "epoch": 1.08774307202385, + "grad_norm": 6.204960823059082, + "learning_rate": 7.904990074611541e-05, + "loss": 0.6402, + "step": 16054 + }, + { + "epoch": 1.0878108272918219, + "grad_norm": 5.574785232543945, + "learning_rate": 7.904853172701759e-05, + "loss": 0.6907, + "step": 16055 + }, + { + "epoch": 1.087878582559794, + "grad_norm": 6.4214043617248535, + "learning_rate": 7.904716270791977e-05, + "loss": 1.0685, + "step": 16056 + }, + { + "epoch": 1.087946337827766, + "grad_norm": 6.589137077331543, + "learning_rate": 7.904579368882197e-05, + "loss": 0.6175, + "step": 16057 + }, + { + "epoch": 1.0880140930957383, + "grad_norm": 5.765407085418701, + "learning_rate": 7.904442466972415e-05, + "loss": 0.696, + "step": 16058 + }, + { + "epoch": 1.0880818483637102, + "grad_norm": 4.7705278396606445, + "learning_rate": 7.904305565062633e-05, + "loss": 0.6874, + "step": 16059 + }, + { + "epoch": 1.0881496036316824, + "grad_norm": 6.676177978515625, + "learning_rate": 7.904168663152851e-05, + "loss": 0.6891, + "step": 16060 + }, + { + "epoch": 1.0882173588996544, + "grad_norm": 4.436383247375488, + "learning_rate": 7.904031761243069e-05, + "loss": 0.6336, + "step": 16061 + }, + { + "epoch": 1.0882851141676266, + "grad_norm": 8.813880920410156, + "learning_rate": 7.903894859333288e-05, + "loss": 0.7391, + "step": 16062 + }, + { + "epoch": 1.0883528694355986, + "grad_norm": 5.455590724945068, + "learning_rate": 7.903757957423506e-05, + "loss": 0.6368, + "step": 16063 + }, + { + "epoch": 1.0884206247035708, + "grad_norm": 6.536738872528076, + "learning_rate": 7.903621055513724e-05, + "loss": 0.756, + "step": 16064 + }, + { + "epoch": 1.0884883799715428, + "grad_norm": 4.461001873016357, + "learning_rate": 7.903484153603942e-05, + "loss": 0.5279, + "step": 16065 + }, + { + "epoch": 1.0885561352395148, + "grad_norm": 5.54738187789917, + "learning_rate": 7.903347251694162e-05, + "loss": 0.5467, + "step": 16066 + }, + { + "epoch": 1.088623890507487, + "grad_norm": 4.30350923538208, + "learning_rate": 7.90321034978438e-05, + "loss": 0.5577, + "step": 16067 + }, + { + "epoch": 1.088691645775459, + "grad_norm": 6.150055885314941, + "learning_rate": 7.903073447874598e-05, + "loss": 0.6348, + "step": 16068 + }, + { + "epoch": 1.0887594010434312, + "grad_norm": 4.178039073944092, + "learning_rate": 7.902936545964816e-05, + "loss": 0.5737, + "step": 16069 + }, + { + "epoch": 1.0888271563114031, + "grad_norm": 4.776052474975586, + "learning_rate": 7.902799644055034e-05, + "loss": 0.6767, + "step": 16070 + }, + { + "epoch": 1.0888949115793753, + "grad_norm": 4.931454658508301, + "learning_rate": 7.902662742145253e-05, + "loss": 0.6867, + "step": 16071 + }, + { + "epoch": 1.0889626668473473, + "grad_norm": 5.556695461273193, + "learning_rate": 7.902525840235471e-05, + "loss": 0.902, + "step": 16072 + }, + { + "epoch": 1.0890304221153195, + "grad_norm": 6.187075138092041, + "learning_rate": 7.90238893832569e-05, + "loss": 0.7161, + "step": 16073 + }, + { + "epoch": 1.0890981773832915, + "grad_norm": 6.942097187042236, + "learning_rate": 7.902252036415909e-05, + "loss": 0.9483, + "step": 16074 + }, + { + "epoch": 1.0891659326512637, + "grad_norm": 7.454192161560059, + "learning_rate": 7.902115134506127e-05, + "loss": 0.768, + "step": 16075 + }, + { + "epoch": 1.0892336879192357, + "grad_norm": 4.790287494659424, + "learning_rate": 7.901978232596345e-05, + "loss": 0.7202, + "step": 16076 + }, + { + "epoch": 1.089301443187208, + "grad_norm": 4.767421722412109, + "learning_rate": 7.901841330686564e-05, + "loss": 0.7903, + "step": 16077 + }, + { + "epoch": 1.0893691984551799, + "grad_norm": 5.385760307312012, + "learning_rate": 7.901704428776782e-05, + "loss": 0.653, + "step": 16078 + }, + { + "epoch": 1.089436953723152, + "grad_norm": 4.6891655921936035, + "learning_rate": 7.901567526867e-05, + "loss": 0.5077, + "step": 16079 + }, + { + "epoch": 1.089504708991124, + "grad_norm": 6.615968704223633, + "learning_rate": 7.90143062495722e-05, + "loss": 0.6641, + "step": 16080 + }, + { + "epoch": 1.089572464259096, + "grad_norm": 8.83869457244873, + "learning_rate": 7.901293723047438e-05, + "loss": 0.7476, + "step": 16081 + }, + { + "epoch": 1.0896402195270682, + "grad_norm": 5.717437267303467, + "learning_rate": 7.901156821137656e-05, + "loss": 0.5911, + "step": 16082 + }, + { + "epoch": 1.0897079747950402, + "grad_norm": 3.9267828464508057, + "learning_rate": 7.901019919227874e-05, + "loss": 0.4863, + "step": 16083 + }, + { + "epoch": 1.0897757300630124, + "grad_norm": 6.119167327880859, + "learning_rate": 7.900883017318092e-05, + "loss": 0.7104, + "step": 16084 + }, + { + "epoch": 1.0898434853309844, + "grad_norm": 8.956766128540039, + "learning_rate": 7.900746115408311e-05, + "loss": 0.61, + "step": 16085 + }, + { + "epoch": 1.0899112405989566, + "grad_norm": 5.749386787414551, + "learning_rate": 7.900609213498529e-05, + "loss": 0.7368, + "step": 16086 + }, + { + "epoch": 1.0899789958669286, + "grad_norm": 7.877694606781006, + "learning_rate": 7.900472311588747e-05, + "loss": 0.5073, + "step": 16087 + }, + { + "epoch": 1.0900467511349008, + "grad_norm": 6.475144386291504, + "learning_rate": 7.900335409678965e-05, + "loss": 0.8651, + "step": 16088 + }, + { + "epoch": 1.0901145064028728, + "grad_norm": 4.912156581878662, + "learning_rate": 7.900198507769185e-05, + "loss": 0.6296, + "step": 16089 + }, + { + "epoch": 1.090182261670845, + "grad_norm": 6.647600173950195, + "learning_rate": 7.900061605859403e-05, + "loss": 0.7773, + "step": 16090 + }, + { + "epoch": 1.090250016938817, + "grad_norm": 5.223667621612549, + "learning_rate": 7.899924703949621e-05, + "loss": 0.6258, + "step": 16091 + }, + { + "epoch": 1.0903177722067892, + "grad_norm": 5.760304927825928, + "learning_rate": 7.899787802039839e-05, + "loss": 0.6562, + "step": 16092 + }, + { + "epoch": 1.0903855274747611, + "grad_norm": 6.270285129547119, + "learning_rate": 7.899650900130057e-05, + "loss": 0.7241, + "step": 16093 + }, + { + "epoch": 1.0904532827427333, + "grad_norm": 6.746170997619629, + "learning_rate": 7.899513998220276e-05, + "loss": 0.6451, + "step": 16094 + }, + { + "epoch": 1.0905210380107053, + "grad_norm": 6.292099952697754, + "learning_rate": 7.899377096310494e-05, + "loss": 0.4791, + "step": 16095 + }, + { + "epoch": 1.0905887932786773, + "grad_norm": 7.886358261108398, + "learning_rate": 7.899240194400712e-05, + "loss": 0.5683, + "step": 16096 + }, + { + "epoch": 1.0906565485466495, + "grad_norm": 5.620253562927246, + "learning_rate": 7.89910329249093e-05, + "loss": 0.5844, + "step": 16097 + }, + { + "epoch": 1.0907243038146215, + "grad_norm": 13.192233085632324, + "learning_rate": 7.89896639058115e-05, + "loss": 0.8562, + "step": 16098 + }, + { + "epoch": 1.0907920590825937, + "grad_norm": 4.923586845397949, + "learning_rate": 7.898829488671368e-05, + "loss": 0.6513, + "step": 16099 + }, + { + "epoch": 1.0908598143505657, + "grad_norm": 6.001354217529297, + "learning_rate": 7.898692586761586e-05, + "loss": 0.6313, + "step": 16100 + }, + { + "epoch": 1.0909275696185379, + "grad_norm": 6.142572402954102, + "learning_rate": 7.898555684851804e-05, + "loss": 0.5797, + "step": 16101 + }, + { + "epoch": 1.0909953248865099, + "grad_norm": 5.978823661804199, + "learning_rate": 7.898418782942022e-05, + "loss": 0.7278, + "step": 16102 + }, + { + "epoch": 1.091063080154482, + "grad_norm": 6.031432151794434, + "learning_rate": 7.898281881032241e-05, + "loss": 0.4503, + "step": 16103 + }, + { + "epoch": 1.091130835422454, + "grad_norm": 6.397644996643066, + "learning_rate": 7.898144979122459e-05, + "loss": 0.8417, + "step": 16104 + }, + { + "epoch": 1.0911985906904262, + "grad_norm": 4.83469295501709, + "learning_rate": 7.898008077212677e-05, + "loss": 0.5679, + "step": 16105 + }, + { + "epoch": 1.0912663459583982, + "grad_norm": 5.136175155639648, + "learning_rate": 7.897871175302895e-05, + "loss": 0.7549, + "step": 16106 + }, + { + "epoch": 1.0913341012263704, + "grad_norm": 5.254212856292725, + "learning_rate": 7.897734273393113e-05, + "loss": 0.5692, + "step": 16107 + }, + { + "epoch": 1.0914018564943424, + "grad_norm": 8.869996070861816, + "learning_rate": 7.897597371483333e-05, + "loss": 0.8202, + "step": 16108 + }, + { + "epoch": 1.0914696117623146, + "grad_norm": 6.127236843109131, + "learning_rate": 7.897460469573551e-05, + "loss": 0.5936, + "step": 16109 + }, + { + "epoch": 1.0915373670302866, + "grad_norm": 6.347018241882324, + "learning_rate": 7.897323567663769e-05, + "loss": 0.5723, + "step": 16110 + }, + { + "epoch": 1.0916051222982588, + "grad_norm": 4.602900981903076, + "learning_rate": 7.897186665753987e-05, + "loss": 0.555, + "step": 16111 + }, + { + "epoch": 1.0916728775662308, + "grad_norm": 5.458276271820068, + "learning_rate": 7.897049763844206e-05, + "loss": 0.6425, + "step": 16112 + }, + { + "epoch": 1.091740632834203, + "grad_norm": 4.358611583709717, + "learning_rate": 7.896912861934424e-05, + "loss": 0.506, + "step": 16113 + }, + { + "epoch": 1.091808388102175, + "grad_norm": 5.602118015289307, + "learning_rate": 7.896775960024642e-05, + "loss": 0.7078, + "step": 16114 + }, + { + "epoch": 1.091876143370147, + "grad_norm": 5.718192100524902, + "learning_rate": 7.89663905811486e-05, + "loss": 0.6152, + "step": 16115 + }, + { + "epoch": 1.0919438986381191, + "grad_norm": 7.62349796295166, + "learning_rate": 7.896502156205078e-05, + "loss": 0.8543, + "step": 16116 + }, + { + "epoch": 1.0920116539060911, + "grad_norm": 5.8089189529418945, + "learning_rate": 7.896365254295298e-05, + "loss": 0.6322, + "step": 16117 + }, + { + "epoch": 1.0920794091740633, + "grad_norm": 4.909200191497803, + "learning_rate": 7.896228352385516e-05, + "loss": 0.6035, + "step": 16118 + }, + { + "epoch": 1.0921471644420353, + "grad_norm": 6.314395427703857, + "learning_rate": 7.896091450475734e-05, + "loss": 0.7228, + "step": 16119 + }, + { + "epoch": 1.0922149197100075, + "grad_norm": 4.429898738861084, + "learning_rate": 7.895954548565953e-05, + "loss": 0.7112, + "step": 16120 + }, + { + "epoch": 1.0922826749779795, + "grad_norm": 5.228384017944336, + "learning_rate": 7.895817646656171e-05, + "loss": 0.6499, + "step": 16121 + }, + { + "epoch": 1.0923504302459517, + "grad_norm": 5.266185760498047, + "learning_rate": 7.895680744746389e-05, + "loss": 0.6539, + "step": 16122 + }, + { + "epoch": 1.0924181855139237, + "grad_norm": 5.5866379737854, + "learning_rate": 7.895543842836609e-05, + "loss": 0.5946, + "step": 16123 + }, + { + "epoch": 1.0924859407818959, + "grad_norm": 5.785736083984375, + "learning_rate": 7.895406940926827e-05, + "loss": 0.6127, + "step": 16124 + }, + { + "epoch": 1.0925536960498679, + "grad_norm": 4.491186141967773, + "learning_rate": 7.895270039017045e-05, + "loss": 0.3924, + "step": 16125 + }, + { + "epoch": 1.09262145131784, + "grad_norm": 5.7343831062316895, + "learning_rate": 7.895133137107264e-05, + "loss": 0.5891, + "step": 16126 + }, + { + "epoch": 1.092689206585812, + "grad_norm": 5.105741500854492, + "learning_rate": 7.894996235197482e-05, + "loss": 0.6144, + "step": 16127 + }, + { + "epoch": 1.0927569618537842, + "grad_norm": 6.519714832305908, + "learning_rate": 7.8948593332877e-05, + "loss": 0.6786, + "step": 16128 + }, + { + "epoch": 1.0928247171217562, + "grad_norm": 6.885262489318848, + "learning_rate": 7.894722431377918e-05, + "loss": 0.7443, + "step": 16129 + }, + { + "epoch": 1.0928924723897282, + "grad_norm": 7.912429332733154, + "learning_rate": 7.894585529468138e-05, + "loss": 0.851, + "step": 16130 + }, + { + "epoch": 1.0929602276577004, + "grad_norm": 8.219959259033203, + "learning_rate": 7.894448627558356e-05, + "loss": 0.6724, + "step": 16131 + }, + { + "epoch": 1.0930279829256724, + "grad_norm": 5.2340545654296875, + "learning_rate": 7.894311725648574e-05, + "loss": 0.8001, + "step": 16132 + }, + { + "epoch": 1.0930957381936446, + "grad_norm": 5.254332542419434, + "learning_rate": 7.894174823738792e-05, + "loss": 0.735, + "step": 16133 + }, + { + "epoch": 1.0931634934616166, + "grad_norm": 6.4383864402771, + "learning_rate": 7.89403792182901e-05, + "loss": 0.7279, + "step": 16134 + }, + { + "epoch": 1.0932312487295888, + "grad_norm": 5.197512626647949, + "learning_rate": 7.893901019919229e-05, + "loss": 0.5941, + "step": 16135 + }, + { + "epoch": 1.0932990039975607, + "grad_norm": 8.93490982055664, + "learning_rate": 7.893764118009447e-05, + "loss": 0.9157, + "step": 16136 + }, + { + "epoch": 1.093366759265533, + "grad_norm": 5.834399223327637, + "learning_rate": 7.893627216099665e-05, + "loss": 0.6377, + "step": 16137 + }, + { + "epoch": 1.093434514533505, + "grad_norm": 7.221981048583984, + "learning_rate": 7.893490314189883e-05, + "loss": 0.8778, + "step": 16138 + }, + { + "epoch": 1.0935022698014771, + "grad_norm": 8.59212875366211, + "learning_rate": 7.893353412280101e-05, + "loss": 0.6377, + "step": 16139 + }, + { + "epoch": 1.0935700250694491, + "grad_norm": 5.231501579284668, + "learning_rate": 7.89321651037032e-05, + "loss": 0.609, + "step": 16140 + }, + { + "epoch": 1.0936377803374213, + "grad_norm": 6.0066447257995605, + "learning_rate": 7.893079608460539e-05, + "loss": 0.6786, + "step": 16141 + }, + { + "epoch": 1.0937055356053933, + "grad_norm": 5.8601861000061035, + "learning_rate": 7.892942706550757e-05, + "loss": 0.7069, + "step": 16142 + }, + { + "epoch": 1.0937732908733655, + "grad_norm": 5.470623016357422, + "learning_rate": 7.892805804640975e-05, + "loss": 0.6973, + "step": 16143 + }, + { + "epoch": 1.0938410461413375, + "grad_norm": 6.8463454246521, + "learning_rate": 7.892668902731194e-05, + "loss": 0.6182, + "step": 16144 + }, + { + "epoch": 1.0939088014093095, + "grad_norm": 4.355232238769531, + "learning_rate": 7.892532000821412e-05, + "loss": 0.6908, + "step": 16145 + }, + { + "epoch": 1.0939765566772817, + "grad_norm": 7.3813676834106445, + "learning_rate": 7.89239509891163e-05, + "loss": 0.6964, + "step": 16146 + }, + { + "epoch": 1.0940443119452536, + "grad_norm": 5.615843296051025, + "learning_rate": 7.892258197001848e-05, + "loss": 0.7703, + "step": 16147 + }, + { + "epoch": 1.0941120672132258, + "grad_norm": 4.897951602935791, + "learning_rate": 7.892121295092066e-05, + "loss": 0.7212, + "step": 16148 + }, + { + "epoch": 1.0941798224811978, + "grad_norm": 6.01582670211792, + "learning_rate": 7.891984393182286e-05, + "loss": 0.6612, + "step": 16149 + }, + { + "epoch": 1.09424757774917, + "grad_norm": 6.209997653961182, + "learning_rate": 7.891847491272504e-05, + "loss": 0.7842, + "step": 16150 + }, + { + "epoch": 1.094315333017142, + "grad_norm": 5.317853927612305, + "learning_rate": 7.891710589362722e-05, + "loss": 0.7604, + "step": 16151 + }, + { + "epoch": 1.0943830882851142, + "grad_norm": 4.876063823699951, + "learning_rate": 7.89157368745294e-05, + "loss": 0.6274, + "step": 16152 + }, + { + "epoch": 1.0944508435530862, + "grad_norm": 5.609501361846924, + "learning_rate": 7.891436785543159e-05, + "loss": 0.7662, + "step": 16153 + }, + { + "epoch": 1.0945185988210584, + "grad_norm": 5.9666852951049805, + "learning_rate": 7.891299883633377e-05, + "loss": 0.7864, + "step": 16154 + }, + { + "epoch": 1.0945863540890304, + "grad_norm": 4.681065559387207, + "learning_rate": 7.891162981723595e-05, + "loss": 0.727, + "step": 16155 + }, + { + "epoch": 1.0946541093570026, + "grad_norm": 6.903472423553467, + "learning_rate": 7.891026079813813e-05, + "loss": 0.894, + "step": 16156 + }, + { + "epoch": 1.0947218646249746, + "grad_norm": 6.2425408363342285, + "learning_rate": 7.890889177904031e-05, + "loss": 0.658, + "step": 16157 + }, + { + "epoch": 1.0947896198929468, + "grad_norm": 5.621673583984375, + "learning_rate": 7.890752275994251e-05, + "loss": 0.675, + "step": 16158 + }, + { + "epoch": 1.0948573751609187, + "grad_norm": 6.081178665161133, + "learning_rate": 7.890615374084469e-05, + "loss": 0.5955, + "step": 16159 + }, + { + "epoch": 1.094925130428891, + "grad_norm": 6.864749431610107, + "learning_rate": 7.890478472174687e-05, + "loss": 0.5638, + "step": 16160 + }, + { + "epoch": 1.094992885696863, + "grad_norm": 5.215126037597656, + "learning_rate": 7.890341570264905e-05, + "loss": 0.6291, + "step": 16161 + }, + { + "epoch": 1.0950606409648351, + "grad_norm": 7.269073963165283, + "learning_rate": 7.890204668355123e-05, + "loss": 0.9472, + "step": 16162 + }, + { + "epoch": 1.095128396232807, + "grad_norm": 6.205016613006592, + "learning_rate": 7.890067766445342e-05, + "loss": 0.5892, + "step": 16163 + }, + { + "epoch": 1.095196151500779, + "grad_norm": 5.993995189666748, + "learning_rate": 7.88993086453556e-05, + "loss": 0.6912, + "step": 16164 + }, + { + "epoch": 1.0952639067687513, + "grad_norm": 4.524845600128174, + "learning_rate": 7.889793962625778e-05, + "loss": 0.5425, + "step": 16165 + }, + { + "epoch": 1.0953316620367233, + "grad_norm": 6.5391364097595215, + "learning_rate": 7.889657060715996e-05, + "loss": 0.7429, + "step": 16166 + }, + { + "epoch": 1.0953994173046955, + "grad_norm": 6.908164978027344, + "learning_rate": 7.889520158806216e-05, + "loss": 0.6332, + "step": 16167 + }, + { + "epoch": 1.0954671725726675, + "grad_norm": 6.190462112426758, + "learning_rate": 7.889383256896434e-05, + "loss": 0.8123, + "step": 16168 + }, + { + "epoch": 1.0955349278406397, + "grad_norm": 5.4493207931518555, + "learning_rate": 7.889246354986652e-05, + "loss": 0.6539, + "step": 16169 + }, + { + "epoch": 1.0956026831086116, + "grad_norm": 5.316275119781494, + "learning_rate": 7.889109453076871e-05, + "loss": 0.5694, + "step": 16170 + }, + { + "epoch": 1.0956704383765838, + "grad_norm": 8.243550300598145, + "learning_rate": 7.888972551167089e-05, + "loss": 1.0688, + "step": 16171 + }, + { + "epoch": 1.0957381936445558, + "grad_norm": 5.587428092956543, + "learning_rate": 7.888835649257307e-05, + "loss": 0.7348, + "step": 16172 + }, + { + "epoch": 1.095805948912528, + "grad_norm": 6.552480697631836, + "learning_rate": 7.888698747347527e-05, + "loss": 0.5606, + "step": 16173 + }, + { + "epoch": 1.0958737041805, + "grad_norm": 5.394407272338867, + "learning_rate": 7.888561845437745e-05, + "loss": 0.6762, + "step": 16174 + }, + { + "epoch": 1.0959414594484722, + "grad_norm": 9.323013305664062, + "learning_rate": 7.888424943527963e-05, + "loss": 0.6987, + "step": 16175 + }, + { + "epoch": 1.0960092147164442, + "grad_norm": 4.646063327789307, + "learning_rate": 7.888288041618182e-05, + "loss": 0.4748, + "step": 16176 + }, + { + "epoch": 1.0960769699844164, + "grad_norm": 8.358293533325195, + "learning_rate": 7.8881511397084e-05, + "loss": 0.6064, + "step": 16177 + }, + { + "epoch": 1.0961447252523884, + "grad_norm": 8.130820274353027, + "learning_rate": 7.888014237798618e-05, + "loss": 0.7405, + "step": 16178 + }, + { + "epoch": 1.0962124805203604, + "grad_norm": 5.757419109344482, + "learning_rate": 7.887877335888836e-05, + "loss": 0.7595, + "step": 16179 + }, + { + "epoch": 1.0962802357883326, + "grad_norm": 4.9174580574035645, + "learning_rate": 7.887740433979054e-05, + "loss": 0.5362, + "step": 16180 + }, + { + "epoch": 1.0963479910563045, + "grad_norm": 6.0252604484558105, + "learning_rate": 7.887603532069274e-05, + "loss": 0.6579, + "step": 16181 + }, + { + "epoch": 1.0964157463242767, + "grad_norm": 6.813238620758057, + "learning_rate": 7.887466630159492e-05, + "loss": 0.9755, + "step": 16182 + }, + { + "epoch": 1.0964835015922487, + "grad_norm": 6.028433322906494, + "learning_rate": 7.88732972824971e-05, + "loss": 0.789, + "step": 16183 + }, + { + "epoch": 1.096551256860221, + "grad_norm": 4.869696617126465, + "learning_rate": 7.887192826339928e-05, + "loss": 0.6194, + "step": 16184 + }, + { + "epoch": 1.096619012128193, + "grad_norm": 5.043181419372559, + "learning_rate": 7.887055924430146e-05, + "loss": 0.515, + "step": 16185 + }, + { + "epoch": 1.096686767396165, + "grad_norm": 5.320845127105713, + "learning_rate": 7.886919022520365e-05, + "loss": 0.5807, + "step": 16186 + }, + { + "epoch": 1.096754522664137, + "grad_norm": 6.676725387573242, + "learning_rate": 7.886782120610583e-05, + "loss": 0.8363, + "step": 16187 + }, + { + "epoch": 1.0968222779321093, + "grad_norm": 5.5581464767456055, + "learning_rate": 7.886645218700801e-05, + "loss": 0.7815, + "step": 16188 + }, + { + "epoch": 1.0968900332000813, + "grad_norm": 6.747057914733887, + "learning_rate": 7.886508316791019e-05, + "loss": 0.781, + "step": 16189 + }, + { + "epoch": 1.0969577884680535, + "grad_norm": 5.670258522033691, + "learning_rate": 7.886371414881239e-05, + "loss": 0.7192, + "step": 16190 + }, + { + "epoch": 1.0970255437360255, + "grad_norm": 6.418565273284912, + "learning_rate": 7.886234512971457e-05, + "loss": 0.7151, + "step": 16191 + }, + { + "epoch": 1.0970932990039977, + "grad_norm": 6.145472526550293, + "learning_rate": 7.886097611061675e-05, + "loss": 0.738, + "step": 16192 + }, + { + "epoch": 1.0971610542719696, + "grad_norm": 6.247579574584961, + "learning_rate": 7.885960709151893e-05, + "loss": 0.7107, + "step": 16193 + }, + { + "epoch": 1.0972288095399416, + "grad_norm": 6.627540111541748, + "learning_rate": 7.885823807242111e-05, + "loss": 0.6838, + "step": 16194 + }, + { + "epoch": 1.0972965648079138, + "grad_norm": 5.86225700378418, + "learning_rate": 7.88568690533233e-05, + "loss": 0.7547, + "step": 16195 + }, + { + "epoch": 1.0973643200758858, + "grad_norm": 5.816222667694092, + "learning_rate": 7.885550003422548e-05, + "loss": 0.5167, + "step": 16196 + }, + { + "epoch": 1.097432075343858, + "grad_norm": 4.933455467224121, + "learning_rate": 7.885413101512766e-05, + "loss": 0.6946, + "step": 16197 + }, + { + "epoch": 1.09749983061183, + "grad_norm": 4.601310729980469, + "learning_rate": 7.885276199602984e-05, + "loss": 0.6676, + "step": 16198 + }, + { + "epoch": 1.0975675858798022, + "grad_norm": 6.967305660247803, + "learning_rate": 7.885139297693204e-05, + "loss": 0.7414, + "step": 16199 + }, + { + "epoch": 1.0976353411477742, + "grad_norm": 5.442778587341309, + "learning_rate": 7.885002395783422e-05, + "loss": 0.5816, + "step": 16200 + }, + { + "epoch": 1.0977030964157464, + "grad_norm": 6.803948879241943, + "learning_rate": 7.88486549387364e-05, + "loss": 0.6741, + "step": 16201 + }, + { + "epoch": 1.0977708516837184, + "grad_norm": 4.722171783447266, + "learning_rate": 7.884728591963858e-05, + "loss": 0.7071, + "step": 16202 + }, + { + "epoch": 1.0978386069516906, + "grad_norm": 6.338668346405029, + "learning_rate": 7.884591690054076e-05, + "loss": 0.7145, + "step": 16203 + }, + { + "epoch": 1.0979063622196625, + "grad_norm": 6.372475624084473, + "learning_rate": 7.884454788144295e-05, + "loss": 0.8761, + "step": 16204 + }, + { + "epoch": 1.0979741174876347, + "grad_norm": 5.98175048828125, + "learning_rate": 7.884317886234513e-05, + "loss": 0.7052, + "step": 16205 + }, + { + "epoch": 1.0980418727556067, + "grad_norm": 7.220608711242676, + "learning_rate": 7.884180984324731e-05, + "loss": 0.7702, + "step": 16206 + }, + { + "epoch": 1.098109628023579, + "grad_norm": 4.378786563873291, + "learning_rate": 7.884044082414949e-05, + "loss": 0.5409, + "step": 16207 + }, + { + "epoch": 1.098177383291551, + "grad_norm": 4.2132978439331055, + "learning_rate": 7.883907180505169e-05, + "loss": 0.6008, + "step": 16208 + }, + { + "epoch": 1.098245138559523, + "grad_norm": 5.9434590339660645, + "learning_rate": 7.883770278595387e-05, + "loss": 0.6274, + "step": 16209 + }, + { + "epoch": 1.098312893827495, + "grad_norm": 4.109735012054443, + "learning_rate": 7.883633376685605e-05, + "loss": 0.7516, + "step": 16210 + }, + { + "epoch": 1.0983806490954673, + "grad_norm": 5.5988006591796875, + "learning_rate": 7.883496474775823e-05, + "loss": 0.6469, + "step": 16211 + }, + { + "epoch": 1.0984484043634393, + "grad_norm": 5.006146430969238, + "learning_rate": 7.883359572866041e-05, + "loss": 0.7183, + "step": 16212 + }, + { + "epoch": 1.0985161596314112, + "grad_norm": 5.305348873138428, + "learning_rate": 7.88322267095626e-05, + "loss": 0.6361, + "step": 16213 + }, + { + "epoch": 1.0985839148993835, + "grad_norm": 5.728710174560547, + "learning_rate": 7.883085769046478e-05, + "loss": 0.8569, + "step": 16214 + }, + { + "epoch": 1.0986516701673554, + "grad_norm": 11.328938484191895, + "learning_rate": 7.882948867136696e-05, + "loss": 0.5037, + "step": 16215 + }, + { + "epoch": 1.0987194254353276, + "grad_norm": 4.791298866271973, + "learning_rate": 7.882811965226916e-05, + "loss": 0.6317, + "step": 16216 + }, + { + "epoch": 1.0987871807032996, + "grad_norm": 4.028521537780762, + "learning_rate": 7.882675063317134e-05, + "loss": 0.5422, + "step": 16217 + }, + { + "epoch": 1.0988549359712718, + "grad_norm": 5.723876953125, + "learning_rate": 7.882538161407352e-05, + "loss": 0.8359, + "step": 16218 + }, + { + "epoch": 1.0989226912392438, + "grad_norm": 4.189463138580322, + "learning_rate": 7.882401259497571e-05, + "loss": 0.5966, + "step": 16219 + }, + { + "epoch": 1.098990446507216, + "grad_norm": 5.877222061157227, + "learning_rate": 7.882264357587789e-05, + "loss": 0.5115, + "step": 16220 + }, + { + "epoch": 1.099058201775188, + "grad_norm": 6.062419891357422, + "learning_rate": 7.882127455678007e-05, + "loss": 0.6709, + "step": 16221 + }, + { + "epoch": 1.0991259570431602, + "grad_norm": 8.064435958862305, + "learning_rate": 7.881990553768227e-05, + "loss": 0.8518, + "step": 16222 + }, + { + "epoch": 1.0991937123111322, + "grad_norm": 5.81643533706665, + "learning_rate": 7.881853651858445e-05, + "loss": 0.7286, + "step": 16223 + }, + { + "epoch": 1.0992614675791044, + "grad_norm": 5.727938652038574, + "learning_rate": 7.881716749948663e-05, + "loss": 0.8333, + "step": 16224 + }, + { + "epoch": 1.0993292228470763, + "grad_norm": 8.810190200805664, + "learning_rate": 7.88157984803888e-05, + "loss": 0.9136, + "step": 16225 + }, + { + "epoch": 1.0993969781150486, + "grad_norm": 6.392600059509277, + "learning_rate": 7.881442946129099e-05, + "loss": 0.8322, + "step": 16226 + }, + { + "epoch": 1.0994647333830205, + "grad_norm": 6.082714080810547, + "learning_rate": 7.881306044219318e-05, + "loss": 0.649, + "step": 16227 + }, + { + "epoch": 1.0995324886509925, + "grad_norm": 6.840407848358154, + "learning_rate": 7.881169142309536e-05, + "loss": 0.5934, + "step": 16228 + }, + { + "epoch": 1.0996002439189647, + "grad_norm": 6.920519828796387, + "learning_rate": 7.881032240399754e-05, + "loss": 0.9076, + "step": 16229 + }, + { + "epoch": 1.0996679991869367, + "grad_norm": 5.431446075439453, + "learning_rate": 7.880895338489972e-05, + "loss": 0.6394, + "step": 16230 + }, + { + "epoch": 1.099735754454909, + "grad_norm": 7.608739376068115, + "learning_rate": 7.880758436580192e-05, + "loss": 0.7753, + "step": 16231 + }, + { + "epoch": 1.0998035097228809, + "grad_norm": 6.12460470199585, + "learning_rate": 7.88062153467041e-05, + "loss": 0.7162, + "step": 16232 + }, + { + "epoch": 1.099871264990853, + "grad_norm": 6.770415782928467, + "learning_rate": 7.880484632760628e-05, + "loss": 0.842, + "step": 16233 + }, + { + "epoch": 1.099939020258825, + "grad_norm": 5.251020908355713, + "learning_rate": 7.880347730850846e-05, + "loss": 0.5031, + "step": 16234 + }, + { + "epoch": 1.1000067755267973, + "grad_norm": 6.684656620025635, + "learning_rate": 7.880210828941064e-05, + "loss": 0.6369, + "step": 16235 + }, + { + "epoch": 1.1000745307947692, + "grad_norm": 6.140908241271973, + "learning_rate": 7.880073927031283e-05, + "loss": 0.6889, + "step": 16236 + }, + { + "epoch": 1.1001422860627414, + "grad_norm": 6.55711555480957, + "learning_rate": 7.879937025121501e-05, + "loss": 0.6736, + "step": 16237 + }, + { + "epoch": 1.1002100413307134, + "grad_norm": 5.948589324951172, + "learning_rate": 7.879800123211719e-05, + "loss": 0.6249, + "step": 16238 + }, + { + "epoch": 1.1002777965986856, + "grad_norm": 5.505045413970947, + "learning_rate": 7.879663221301937e-05, + "loss": 0.7244, + "step": 16239 + }, + { + "epoch": 1.1003455518666576, + "grad_norm": 4.607968330383301, + "learning_rate": 7.879526319392155e-05, + "loss": 0.9175, + "step": 16240 + }, + { + "epoch": 1.1004133071346298, + "grad_norm": 6.939621448516846, + "learning_rate": 7.879389417482375e-05, + "loss": 0.7586, + "step": 16241 + }, + { + "epoch": 1.1004810624026018, + "grad_norm": 6.131583213806152, + "learning_rate": 7.879252515572593e-05, + "loss": 0.6007, + "step": 16242 + }, + { + "epoch": 1.1005488176705738, + "grad_norm": 5.63719367980957, + "learning_rate": 7.879115613662811e-05, + "loss": 0.9402, + "step": 16243 + }, + { + "epoch": 1.100616572938546, + "grad_norm": 6.878543376922607, + "learning_rate": 7.878978711753029e-05, + "loss": 0.5446, + "step": 16244 + }, + { + "epoch": 1.100684328206518, + "grad_norm": 5.816869735717773, + "learning_rate": 7.878841809843248e-05, + "loss": 0.5093, + "step": 16245 + }, + { + "epoch": 1.1007520834744902, + "grad_norm": 7.57546329498291, + "learning_rate": 7.878704907933466e-05, + "loss": 0.8181, + "step": 16246 + }, + { + "epoch": 1.1008198387424621, + "grad_norm": 6.603987216949463, + "learning_rate": 7.878568006023684e-05, + "loss": 0.9014, + "step": 16247 + }, + { + "epoch": 1.1008875940104343, + "grad_norm": 5.182919025421143, + "learning_rate": 7.878431104113902e-05, + "loss": 0.5691, + "step": 16248 + }, + { + "epoch": 1.1009553492784063, + "grad_norm": 4.985978126525879, + "learning_rate": 7.87829420220412e-05, + "loss": 0.6697, + "step": 16249 + }, + { + "epoch": 1.1010231045463785, + "grad_norm": 5.756229400634766, + "learning_rate": 7.87815730029434e-05, + "loss": 0.7705, + "step": 16250 + }, + { + "epoch": 1.1010908598143505, + "grad_norm": 5.34890079498291, + "learning_rate": 7.878020398384558e-05, + "loss": 0.8372, + "step": 16251 + }, + { + "epoch": 1.1011586150823227, + "grad_norm": 5.433262825012207, + "learning_rate": 7.877883496474776e-05, + "loss": 0.8024, + "step": 16252 + }, + { + "epoch": 1.1012263703502947, + "grad_norm": 6.8542256355285645, + "learning_rate": 7.877746594564994e-05, + "loss": 0.6715, + "step": 16253 + }, + { + "epoch": 1.101294125618267, + "grad_norm": 6.2984843254089355, + "learning_rate": 7.877609692655213e-05, + "loss": 0.9062, + "step": 16254 + }, + { + "epoch": 1.1013618808862389, + "grad_norm": 3.8605449199676514, + "learning_rate": 7.877472790745431e-05, + "loss": 0.4797, + "step": 16255 + }, + { + "epoch": 1.101429636154211, + "grad_norm": 6.800119400024414, + "learning_rate": 7.877335888835649e-05, + "loss": 0.7336, + "step": 16256 + }, + { + "epoch": 1.101497391422183, + "grad_norm": 5.705935478210449, + "learning_rate": 7.877198986925867e-05, + "loss": 0.7437, + "step": 16257 + }, + { + "epoch": 1.1015651466901553, + "grad_norm": 5.928062915802002, + "learning_rate": 7.877062085016085e-05, + "loss": 0.7036, + "step": 16258 + }, + { + "epoch": 1.1016329019581272, + "grad_norm": 5.317257881164551, + "learning_rate": 7.876925183106305e-05, + "loss": 0.6604, + "step": 16259 + }, + { + "epoch": 1.1017006572260994, + "grad_norm": 7.342348575592041, + "learning_rate": 7.876788281196523e-05, + "loss": 0.8121, + "step": 16260 + }, + { + "epoch": 1.1017684124940714, + "grad_norm": 6.198090553283691, + "learning_rate": 7.876651379286741e-05, + "loss": 0.6446, + "step": 16261 + }, + { + "epoch": 1.1018361677620434, + "grad_norm": 6.285484790802002, + "learning_rate": 7.87651447737696e-05, + "loss": 0.6862, + "step": 16262 + }, + { + "epoch": 1.1019039230300156, + "grad_norm": 6.3264994621276855, + "learning_rate": 7.876377575467178e-05, + "loss": 0.5654, + "step": 16263 + }, + { + "epoch": 1.1019716782979876, + "grad_norm": 4.603121280670166, + "learning_rate": 7.876240673557396e-05, + "loss": 0.799, + "step": 16264 + }, + { + "epoch": 1.1020394335659598, + "grad_norm": 8.007383346557617, + "learning_rate": 7.876103771647616e-05, + "loss": 0.8962, + "step": 16265 + }, + { + "epoch": 1.1021071888339318, + "grad_norm": 5.806975841522217, + "learning_rate": 7.875966869737834e-05, + "loss": 0.7357, + "step": 16266 + }, + { + "epoch": 1.102174944101904, + "grad_norm": 5.703726291656494, + "learning_rate": 7.875829967828052e-05, + "loss": 0.7392, + "step": 16267 + }, + { + "epoch": 1.102242699369876, + "grad_norm": 5.793273448944092, + "learning_rate": 7.875693065918271e-05, + "loss": 0.7173, + "step": 16268 + }, + { + "epoch": 1.1023104546378482, + "grad_norm": 6.515778541564941, + "learning_rate": 7.875556164008489e-05, + "loss": 0.6591, + "step": 16269 + }, + { + "epoch": 1.1023782099058201, + "grad_norm": 7.7042059898376465, + "learning_rate": 7.875419262098707e-05, + "loss": 0.9228, + "step": 16270 + }, + { + "epoch": 1.1024459651737923, + "grad_norm": 4.594401836395264, + "learning_rate": 7.875282360188925e-05, + "loss": 0.5748, + "step": 16271 + }, + { + "epoch": 1.1025137204417643, + "grad_norm": 8.685628890991211, + "learning_rate": 7.875145458279143e-05, + "loss": 0.7174, + "step": 16272 + }, + { + "epoch": 1.1025814757097365, + "grad_norm": 5.300126075744629, + "learning_rate": 7.875008556369363e-05, + "loss": 0.6349, + "step": 16273 + }, + { + "epoch": 1.1026492309777085, + "grad_norm": 5.215007305145264, + "learning_rate": 7.87487165445958e-05, + "loss": 0.5969, + "step": 16274 + }, + { + "epoch": 1.1027169862456807, + "grad_norm": 6.813204765319824, + "learning_rate": 7.874734752549799e-05, + "loss": 0.7632, + "step": 16275 + }, + { + "epoch": 1.1027847415136527, + "grad_norm": 5.5026631355285645, + "learning_rate": 7.874597850640017e-05, + "loss": 0.6924, + "step": 16276 + }, + { + "epoch": 1.1028524967816247, + "grad_norm": 7.050729274749756, + "learning_rate": 7.874460948730236e-05, + "loss": 0.8821, + "step": 16277 + }, + { + "epoch": 1.1029202520495969, + "grad_norm": 6.8287811279296875, + "learning_rate": 7.874324046820454e-05, + "loss": 0.7282, + "step": 16278 + }, + { + "epoch": 1.1029880073175689, + "grad_norm": 5.6110687255859375, + "learning_rate": 7.874187144910672e-05, + "loss": 0.5387, + "step": 16279 + }, + { + "epoch": 1.103055762585541, + "grad_norm": 6.597656726837158, + "learning_rate": 7.87405024300089e-05, + "loss": 0.9386, + "step": 16280 + }, + { + "epoch": 1.103123517853513, + "grad_norm": 6.492720603942871, + "learning_rate": 7.873913341091108e-05, + "loss": 0.5659, + "step": 16281 + }, + { + "epoch": 1.1031912731214852, + "grad_norm": 5.881611347198486, + "learning_rate": 7.873776439181328e-05, + "loss": 0.5639, + "step": 16282 + }, + { + "epoch": 1.1032590283894572, + "grad_norm": 5.611779689788818, + "learning_rate": 7.873639537271546e-05, + "loss": 0.6953, + "step": 16283 + }, + { + "epoch": 1.1033267836574294, + "grad_norm": 7.153480529785156, + "learning_rate": 7.873502635361764e-05, + "loss": 0.9283, + "step": 16284 + }, + { + "epoch": 1.1033945389254014, + "grad_norm": 6.058135986328125, + "learning_rate": 7.873365733451982e-05, + "loss": 0.7362, + "step": 16285 + }, + { + "epoch": 1.1034622941933736, + "grad_norm": 5.711484432220459, + "learning_rate": 7.873228831542201e-05, + "loss": 0.7727, + "step": 16286 + }, + { + "epoch": 1.1035300494613456, + "grad_norm": 4.450558662414551, + "learning_rate": 7.873091929632419e-05, + "loss": 0.687, + "step": 16287 + }, + { + "epoch": 1.1035978047293178, + "grad_norm": 6.392914772033691, + "learning_rate": 7.872955027722637e-05, + "loss": 0.7404, + "step": 16288 + }, + { + "epoch": 1.1036655599972898, + "grad_norm": 7.236051082611084, + "learning_rate": 7.872818125812855e-05, + "loss": 0.8772, + "step": 16289 + }, + { + "epoch": 1.103733315265262, + "grad_norm": 5.396810054779053, + "learning_rate": 7.872681223903073e-05, + "loss": 0.7059, + "step": 16290 + }, + { + "epoch": 1.103801070533234, + "grad_norm": 6.475565433502197, + "learning_rate": 7.872544321993293e-05, + "loss": 0.7454, + "step": 16291 + }, + { + "epoch": 1.103868825801206, + "grad_norm": 7.763959884643555, + "learning_rate": 7.87240742008351e-05, + "loss": 0.8434, + "step": 16292 + }, + { + "epoch": 1.1039365810691781, + "grad_norm": 5.478339195251465, + "learning_rate": 7.872270518173729e-05, + "loss": 0.665, + "step": 16293 + }, + { + "epoch": 1.1040043363371501, + "grad_norm": 6.564233779907227, + "learning_rate": 7.872133616263947e-05, + "loss": 0.8856, + "step": 16294 + }, + { + "epoch": 1.1040720916051223, + "grad_norm": 5.313308238983154, + "learning_rate": 7.871996714354165e-05, + "loss": 0.5264, + "step": 16295 + }, + { + "epoch": 1.1041398468730943, + "grad_norm": 5.785020351409912, + "learning_rate": 7.871859812444384e-05, + "loss": 0.5262, + "step": 16296 + }, + { + "epoch": 1.1042076021410665, + "grad_norm": 4.397270679473877, + "learning_rate": 7.871722910534602e-05, + "loss": 0.5755, + "step": 16297 + }, + { + "epoch": 1.1042753574090385, + "grad_norm": 5.507979393005371, + "learning_rate": 7.87158600862482e-05, + "loss": 0.6667, + "step": 16298 + }, + { + "epoch": 1.1043431126770107, + "grad_norm": 6.59631872177124, + "learning_rate": 7.871449106715038e-05, + "loss": 0.6135, + "step": 16299 + }, + { + "epoch": 1.1044108679449827, + "grad_norm": 5.889465808868408, + "learning_rate": 7.871312204805258e-05, + "loss": 0.6328, + "step": 16300 + }, + { + "epoch": 1.1044786232129549, + "grad_norm": 5.357670307159424, + "learning_rate": 7.871175302895476e-05, + "loss": 0.8888, + "step": 16301 + }, + { + "epoch": 1.1045463784809268, + "grad_norm": 5.813202381134033, + "learning_rate": 7.871038400985694e-05, + "loss": 0.8759, + "step": 16302 + }, + { + "epoch": 1.104614133748899, + "grad_norm": 5.886719226837158, + "learning_rate": 7.870901499075912e-05, + "loss": 0.8037, + "step": 16303 + }, + { + "epoch": 1.104681889016871, + "grad_norm": 5.701767444610596, + "learning_rate": 7.87076459716613e-05, + "loss": 0.7458, + "step": 16304 + }, + { + "epoch": 1.1047496442848432, + "grad_norm": 4.6818413734436035, + "learning_rate": 7.870627695256349e-05, + "loss": 0.5317, + "step": 16305 + }, + { + "epoch": 1.1048173995528152, + "grad_norm": 5.039698123931885, + "learning_rate": 7.870490793346567e-05, + "loss": 0.6784, + "step": 16306 + }, + { + "epoch": 1.1048851548207874, + "grad_norm": 8.11133861541748, + "learning_rate": 7.870353891436785e-05, + "loss": 0.5449, + "step": 16307 + }, + { + "epoch": 1.1049529100887594, + "grad_norm": 6.868621826171875, + "learning_rate": 7.870216989527005e-05, + "loss": 0.6633, + "step": 16308 + }, + { + "epoch": 1.1050206653567316, + "grad_norm": 5.920335292816162, + "learning_rate": 7.870080087617223e-05, + "loss": 0.7819, + "step": 16309 + }, + { + "epoch": 1.1050884206247036, + "grad_norm": 5.8382978439331055, + "learning_rate": 7.86994318570744e-05, + "loss": 0.7254, + "step": 16310 + }, + { + "epoch": 1.1051561758926756, + "grad_norm": 6.541285514831543, + "learning_rate": 7.86980628379766e-05, + "loss": 0.6517, + "step": 16311 + }, + { + "epoch": 1.1052239311606478, + "grad_norm": 6.541675567626953, + "learning_rate": 7.869669381887878e-05, + "loss": 0.5173, + "step": 16312 + }, + { + "epoch": 1.1052916864286197, + "grad_norm": 8.50218677520752, + "learning_rate": 7.869532479978096e-05, + "loss": 0.6266, + "step": 16313 + }, + { + "epoch": 1.105359441696592, + "grad_norm": 4.665585517883301, + "learning_rate": 7.869395578068315e-05, + "loss": 0.6517, + "step": 16314 + }, + { + "epoch": 1.105427196964564, + "grad_norm": 6.082878112792969, + "learning_rate": 7.869258676158534e-05, + "loss": 0.9384, + "step": 16315 + }, + { + "epoch": 1.1054949522325361, + "grad_norm": 6.00462532043457, + "learning_rate": 7.869121774248752e-05, + "loss": 0.7451, + "step": 16316 + }, + { + "epoch": 1.1055627075005081, + "grad_norm": 6.826976776123047, + "learning_rate": 7.86898487233897e-05, + "loss": 0.8636, + "step": 16317 + }, + { + "epoch": 1.1056304627684803, + "grad_norm": 6.036766052246094, + "learning_rate": 7.868847970429188e-05, + "loss": 0.6864, + "step": 16318 + }, + { + "epoch": 1.1056982180364523, + "grad_norm": 5.662924766540527, + "learning_rate": 7.868711068519407e-05, + "loss": 0.6582, + "step": 16319 + }, + { + "epoch": 1.1057659733044245, + "grad_norm": 5.307290077209473, + "learning_rate": 7.868574166609625e-05, + "loss": 0.6985, + "step": 16320 + }, + { + "epoch": 1.1058337285723965, + "grad_norm": 6.92588472366333, + "learning_rate": 7.868437264699843e-05, + "loss": 0.799, + "step": 16321 + }, + { + "epoch": 1.1059014838403687, + "grad_norm": 7.620028972625732, + "learning_rate": 7.868300362790061e-05, + "loss": 0.7356, + "step": 16322 + }, + { + "epoch": 1.1059692391083407, + "grad_norm": 8.529465675354004, + "learning_rate": 7.86816346088028e-05, + "loss": 0.5915, + "step": 16323 + }, + { + "epoch": 1.1060369943763129, + "grad_norm": 5.126700401306152, + "learning_rate": 7.868026558970499e-05, + "loss": 0.66, + "step": 16324 + }, + { + "epoch": 1.1061047496442848, + "grad_norm": 5.512331962585449, + "learning_rate": 7.867889657060717e-05, + "loss": 0.804, + "step": 16325 + }, + { + "epoch": 1.1061725049122568, + "grad_norm": 5.728443145751953, + "learning_rate": 7.867752755150935e-05, + "loss": 0.6654, + "step": 16326 + }, + { + "epoch": 1.106240260180229, + "grad_norm": 4.493624210357666, + "learning_rate": 7.867615853241153e-05, + "loss": 0.5286, + "step": 16327 + }, + { + "epoch": 1.106308015448201, + "grad_norm": 5.167482376098633, + "learning_rate": 7.867478951331372e-05, + "loss": 0.6084, + "step": 16328 + }, + { + "epoch": 1.1063757707161732, + "grad_norm": 5.0305657386779785, + "learning_rate": 7.86734204942159e-05, + "loss": 0.8362, + "step": 16329 + }, + { + "epoch": 1.1064435259841452, + "grad_norm": 5.162072658538818, + "learning_rate": 7.867205147511808e-05, + "loss": 0.8681, + "step": 16330 + }, + { + "epoch": 1.1065112812521174, + "grad_norm": 6.101891040802002, + "learning_rate": 7.867068245602026e-05, + "loss": 0.7332, + "step": 16331 + }, + { + "epoch": 1.1065790365200894, + "grad_norm": 6.320982456207275, + "learning_rate": 7.866931343692246e-05, + "loss": 0.9075, + "step": 16332 + }, + { + "epoch": 1.1066467917880616, + "grad_norm": 5.854201316833496, + "learning_rate": 7.866794441782464e-05, + "loss": 0.5638, + "step": 16333 + }, + { + "epoch": 1.1067145470560336, + "grad_norm": 5.276032447814941, + "learning_rate": 7.866657539872682e-05, + "loss": 0.6531, + "step": 16334 + }, + { + "epoch": 1.1067823023240058, + "grad_norm": 8.056017875671387, + "learning_rate": 7.8665206379629e-05, + "loss": 0.5859, + "step": 16335 + }, + { + "epoch": 1.1068500575919777, + "grad_norm": 5.445096492767334, + "learning_rate": 7.866383736053118e-05, + "loss": 0.8072, + "step": 16336 + }, + { + "epoch": 1.10691781285995, + "grad_norm": 9.058344841003418, + "learning_rate": 7.866246834143337e-05, + "loss": 0.9842, + "step": 16337 + }, + { + "epoch": 1.106985568127922, + "grad_norm": 5.209854602813721, + "learning_rate": 7.866109932233555e-05, + "loss": 0.6562, + "step": 16338 + }, + { + "epoch": 1.1070533233958941, + "grad_norm": 4.583771705627441, + "learning_rate": 7.865973030323773e-05, + "loss": 0.6274, + "step": 16339 + }, + { + "epoch": 1.107121078663866, + "grad_norm": 5.5320940017700195, + "learning_rate": 7.865836128413991e-05, + "loss": 0.6254, + "step": 16340 + }, + { + "epoch": 1.107188833931838, + "grad_norm": 5.773438930511475, + "learning_rate": 7.86569922650421e-05, + "loss": 0.8155, + "step": 16341 + }, + { + "epoch": 1.1072565891998103, + "grad_norm": 5.824255466461182, + "learning_rate": 7.865562324594429e-05, + "loss": 0.6921, + "step": 16342 + }, + { + "epoch": 1.1073243444677823, + "grad_norm": 7.974730014801025, + "learning_rate": 7.865425422684647e-05, + "loss": 0.7548, + "step": 16343 + }, + { + "epoch": 1.1073920997357545, + "grad_norm": 8.403234481811523, + "learning_rate": 7.865288520774865e-05, + "loss": 0.6378, + "step": 16344 + }, + { + "epoch": 1.1074598550037265, + "grad_norm": 6.035255432128906, + "learning_rate": 7.865151618865083e-05, + "loss": 0.9632, + "step": 16345 + }, + { + "epoch": 1.1075276102716987, + "grad_norm": 4.658270835876465, + "learning_rate": 7.865014716955302e-05, + "loss": 0.7039, + "step": 16346 + }, + { + "epoch": 1.1075953655396706, + "grad_norm": 4.916011333465576, + "learning_rate": 7.86487781504552e-05, + "loss": 0.5733, + "step": 16347 + }, + { + "epoch": 1.1076631208076428, + "grad_norm": 5.327943801879883, + "learning_rate": 7.864740913135738e-05, + "loss": 0.6256, + "step": 16348 + }, + { + "epoch": 1.1077308760756148, + "grad_norm": 5.153133392333984, + "learning_rate": 7.864604011225956e-05, + "loss": 0.5463, + "step": 16349 + }, + { + "epoch": 1.107798631343587, + "grad_norm": 4.027024745941162, + "learning_rate": 7.864467109316174e-05, + "loss": 0.5824, + "step": 16350 + }, + { + "epoch": 1.107866386611559, + "grad_norm": 5.886280536651611, + "learning_rate": 7.864330207406394e-05, + "loss": 0.758, + "step": 16351 + }, + { + "epoch": 1.1079341418795312, + "grad_norm": 10.420184135437012, + "learning_rate": 7.864193305496612e-05, + "loss": 0.7657, + "step": 16352 + }, + { + "epoch": 1.1080018971475032, + "grad_norm": 6.934298515319824, + "learning_rate": 7.86405640358683e-05, + "loss": 0.579, + "step": 16353 + }, + { + "epoch": 1.1080696524154754, + "grad_norm": 5.0468573570251465, + "learning_rate": 7.863919501677049e-05, + "loss": 0.6553, + "step": 16354 + }, + { + "epoch": 1.1081374076834474, + "grad_norm": 4.991286754608154, + "learning_rate": 7.863782599767267e-05, + "loss": 0.6482, + "step": 16355 + }, + { + "epoch": 1.1082051629514196, + "grad_norm": 5.812962055206299, + "learning_rate": 7.863645697857485e-05, + "loss": 0.7708, + "step": 16356 + }, + { + "epoch": 1.1082729182193916, + "grad_norm": 6.6987104415893555, + "learning_rate": 7.863508795947705e-05, + "loss": 0.807, + "step": 16357 + }, + { + "epoch": 1.1083406734873638, + "grad_norm": 4.859643459320068, + "learning_rate": 7.863371894037923e-05, + "loss": 0.7066, + "step": 16358 + }, + { + "epoch": 1.1084084287553357, + "grad_norm": 8.146595001220703, + "learning_rate": 7.86323499212814e-05, + "loss": 0.7213, + "step": 16359 + }, + { + "epoch": 1.1084761840233077, + "grad_norm": 5.962488174438477, + "learning_rate": 7.86309809021836e-05, + "loss": 0.9372, + "step": 16360 + }, + { + "epoch": 1.10854393929128, + "grad_norm": 6.733558177947998, + "learning_rate": 7.862961188308578e-05, + "loss": 0.7216, + "step": 16361 + }, + { + "epoch": 1.108611694559252, + "grad_norm": 6.295005798339844, + "learning_rate": 7.862824286398796e-05, + "loss": 0.6771, + "step": 16362 + }, + { + "epoch": 1.108679449827224, + "grad_norm": 5.711909770965576, + "learning_rate": 7.862687384489014e-05, + "loss": 0.6979, + "step": 16363 + }, + { + "epoch": 1.108747205095196, + "grad_norm": 7.529008865356445, + "learning_rate": 7.862550482579233e-05, + "loss": 0.8168, + "step": 16364 + }, + { + "epoch": 1.1088149603631683, + "grad_norm": 6.682563304901123, + "learning_rate": 7.862413580669451e-05, + "loss": 0.7012, + "step": 16365 + }, + { + "epoch": 1.1088827156311403, + "grad_norm": 5.381031513214111, + "learning_rate": 7.86227667875967e-05, + "loss": 0.6594, + "step": 16366 + }, + { + "epoch": 1.1089504708991125, + "grad_norm": 5.263582229614258, + "learning_rate": 7.862139776849888e-05, + "loss": 0.6086, + "step": 16367 + }, + { + "epoch": 1.1090182261670845, + "grad_norm": 4.777389049530029, + "learning_rate": 7.862002874940106e-05, + "loss": 0.563, + "step": 16368 + }, + { + "epoch": 1.1090859814350567, + "grad_norm": 4.717031478881836, + "learning_rate": 7.861865973030325e-05, + "loss": 0.7422, + "step": 16369 + }, + { + "epoch": 1.1091537367030286, + "grad_norm": 6.94821310043335, + "learning_rate": 7.861729071120543e-05, + "loss": 0.8252, + "step": 16370 + }, + { + "epoch": 1.1092214919710008, + "grad_norm": 6.443717956542969, + "learning_rate": 7.861592169210761e-05, + "loss": 0.616, + "step": 16371 + }, + { + "epoch": 1.1092892472389728, + "grad_norm": 5.591299533843994, + "learning_rate": 7.861455267300979e-05, + "loss": 0.7008, + "step": 16372 + }, + { + "epoch": 1.109357002506945, + "grad_norm": 6.745190143585205, + "learning_rate": 7.861318365391197e-05, + "loss": 0.4351, + "step": 16373 + }, + { + "epoch": 1.109424757774917, + "grad_norm": 4.188071250915527, + "learning_rate": 7.861181463481417e-05, + "loss": 0.644, + "step": 16374 + }, + { + "epoch": 1.109492513042889, + "grad_norm": 9.427136421203613, + "learning_rate": 7.861044561571635e-05, + "loss": 0.6456, + "step": 16375 + }, + { + "epoch": 1.1095602683108612, + "grad_norm": 6.370525360107422, + "learning_rate": 7.860907659661853e-05, + "loss": 0.8236, + "step": 16376 + }, + { + "epoch": 1.1096280235788332, + "grad_norm": 5.648697376251221, + "learning_rate": 7.86077075775207e-05, + "loss": 0.7623, + "step": 16377 + }, + { + "epoch": 1.1096957788468054, + "grad_norm": 4.926682472229004, + "learning_rate": 7.86063385584229e-05, + "loss": 0.5091, + "step": 16378 + }, + { + "epoch": 1.1097635341147774, + "grad_norm": 5.184731483459473, + "learning_rate": 7.860496953932508e-05, + "loss": 0.6228, + "step": 16379 + }, + { + "epoch": 1.1098312893827496, + "grad_norm": 4.412238597869873, + "learning_rate": 7.860360052022726e-05, + "loss": 0.4859, + "step": 16380 + }, + { + "epoch": 1.1098990446507215, + "grad_norm": 5.148406505584717, + "learning_rate": 7.860223150112944e-05, + "loss": 0.6026, + "step": 16381 + }, + { + "epoch": 1.1099667999186937, + "grad_norm": 6.479032039642334, + "learning_rate": 7.860086248203162e-05, + "loss": 0.6879, + "step": 16382 + }, + { + "epoch": 1.1100345551866657, + "grad_norm": 5.107529163360596, + "learning_rate": 7.859949346293382e-05, + "loss": 0.8316, + "step": 16383 + }, + { + "epoch": 1.110102310454638, + "grad_norm": 6.092916011810303, + "learning_rate": 7.8598124443836e-05, + "loss": 0.5405, + "step": 16384 + }, + { + "epoch": 1.11017006572261, + "grad_norm": 6.275728702545166, + "learning_rate": 7.859675542473818e-05, + "loss": 0.6238, + "step": 16385 + }, + { + "epoch": 1.110237820990582, + "grad_norm": 5.813026428222656, + "learning_rate": 7.859538640564036e-05, + "loss": 0.7023, + "step": 16386 + }, + { + "epoch": 1.110305576258554, + "grad_norm": 5.524173259735107, + "learning_rate": 7.859401738654255e-05, + "loss": 0.6626, + "step": 16387 + }, + { + "epoch": 1.1103733315265263, + "grad_norm": 7.234399318695068, + "learning_rate": 7.859264836744473e-05, + "loss": 0.5818, + "step": 16388 + }, + { + "epoch": 1.1104410867944983, + "grad_norm": 5.309559345245361, + "learning_rate": 7.859127934834691e-05, + "loss": 0.7264, + "step": 16389 + }, + { + "epoch": 1.1105088420624702, + "grad_norm": 6.291361331939697, + "learning_rate": 7.858991032924909e-05, + "loss": 0.8169, + "step": 16390 + }, + { + "epoch": 1.1105765973304424, + "grad_norm": 5.197052955627441, + "learning_rate": 7.858854131015127e-05, + "loss": 0.7933, + "step": 16391 + }, + { + "epoch": 1.1106443525984144, + "grad_norm": 6.224545955657959, + "learning_rate": 7.858717229105347e-05, + "loss": 0.9598, + "step": 16392 + }, + { + "epoch": 1.1107121078663866, + "grad_norm": 7.769322395324707, + "learning_rate": 7.858580327195565e-05, + "loss": 0.8685, + "step": 16393 + }, + { + "epoch": 1.1107798631343586, + "grad_norm": 6.402470588684082, + "learning_rate": 7.858443425285783e-05, + "loss": 0.8052, + "step": 16394 + }, + { + "epoch": 1.1108476184023308, + "grad_norm": 5.41104793548584, + "learning_rate": 7.858306523376e-05, + "loss": 0.6345, + "step": 16395 + }, + { + "epoch": 1.1109153736703028, + "grad_norm": 6.050509452819824, + "learning_rate": 7.85816962146622e-05, + "loss": 0.5625, + "step": 16396 + }, + { + "epoch": 1.110983128938275, + "grad_norm": 5.332095146179199, + "learning_rate": 7.858032719556438e-05, + "loss": 0.5443, + "step": 16397 + }, + { + "epoch": 1.111050884206247, + "grad_norm": 5.569375991821289, + "learning_rate": 7.857895817646656e-05, + "loss": 0.5195, + "step": 16398 + }, + { + "epoch": 1.1111186394742192, + "grad_norm": 5.020506381988525, + "learning_rate": 7.857758915736874e-05, + "loss": 0.6129, + "step": 16399 + }, + { + "epoch": 1.1111863947421912, + "grad_norm": 4.502058029174805, + "learning_rate": 7.857622013827092e-05, + "loss": 0.6652, + "step": 16400 + }, + { + "epoch": 1.1112541500101634, + "grad_norm": 5.6798481941223145, + "learning_rate": 7.857485111917312e-05, + "loss": 0.7755, + "step": 16401 + }, + { + "epoch": 1.1113219052781353, + "grad_norm": 9.84770679473877, + "learning_rate": 7.85734821000753e-05, + "loss": 0.786, + "step": 16402 + }, + { + "epoch": 1.1113896605461075, + "grad_norm": 6.1381378173828125, + "learning_rate": 7.857211308097748e-05, + "loss": 0.8187, + "step": 16403 + }, + { + "epoch": 1.1114574158140795, + "grad_norm": 5.588286399841309, + "learning_rate": 7.857074406187967e-05, + "loss": 0.6508, + "step": 16404 + }, + { + "epoch": 1.1115251710820517, + "grad_norm": 5.41074275970459, + "learning_rate": 7.856937504278185e-05, + "loss": 0.6348, + "step": 16405 + }, + { + "epoch": 1.1115929263500237, + "grad_norm": 6.4471235275268555, + "learning_rate": 7.856800602368403e-05, + "loss": 0.9603, + "step": 16406 + }, + { + "epoch": 1.111660681617996, + "grad_norm": 6.701603412628174, + "learning_rate": 7.856663700458622e-05, + "loss": 0.7937, + "step": 16407 + }, + { + "epoch": 1.111728436885968, + "grad_norm": 7.06520414352417, + "learning_rate": 7.85652679854884e-05, + "loss": 0.6763, + "step": 16408 + }, + { + "epoch": 1.1117961921539399, + "grad_norm": 4.8184404373168945, + "learning_rate": 7.856389896639059e-05, + "loss": 0.7191, + "step": 16409 + }, + { + "epoch": 1.111863947421912, + "grad_norm": 5.840653896331787, + "learning_rate": 7.856252994729278e-05, + "loss": 0.7294, + "step": 16410 + }, + { + "epoch": 1.111931702689884, + "grad_norm": 5.681267261505127, + "learning_rate": 7.856116092819496e-05, + "loss": 0.8226, + "step": 16411 + }, + { + "epoch": 1.1119994579578563, + "grad_norm": 5.182020664215088, + "learning_rate": 7.855979190909714e-05, + "loss": 0.6697, + "step": 16412 + }, + { + "epoch": 1.1120672132258282, + "grad_norm": 7.743770599365234, + "learning_rate": 7.855842288999932e-05, + "loss": 0.8648, + "step": 16413 + }, + { + "epoch": 1.1121349684938004, + "grad_norm": 4.801437854766846, + "learning_rate": 7.85570538709015e-05, + "loss": 0.611, + "step": 16414 + }, + { + "epoch": 1.1122027237617724, + "grad_norm": 4.749513626098633, + "learning_rate": 7.85556848518037e-05, + "loss": 0.696, + "step": 16415 + }, + { + "epoch": 1.1122704790297446, + "grad_norm": 6.828696250915527, + "learning_rate": 7.855431583270587e-05, + "loss": 0.5749, + "step": 16416 + }, + { + "epoch": 1.1123382342977166, + "grad_norm": 5.405619144439697, + "learning_rate": 7.855294681360806e-05, + "loss": 0.5387, + "step": 16417 + }, + { + "epoch": 1.1124059895656888, + "grad_norm": 5.546106338500977, + "learning_rate": 7.855157779451024e-05, + "loss": 0.6858, + "step": 16418 + }, + { + "epoch": 1.1124737448336608, + "grad_norm": 5.9285712242126465, + "learning_rate": 7.855020877541243e-05, + "loss": 0.8244, + "step": 16419 + }, + { + "epoch": 1.112541500101633, + "grad_norm": 5.90974760055542, + "learning_rate": 7.854883975631461e-05, + "loss": 0.6853, + "step": 16420 + }, + { + "epoch": 1.112609255369605, + "grad_norm": 11.681938171386719, + "learning_rate": 7.854747073721679e-05, + "loss": 0.7719, + "step": 16421 + }, + { + "epoch": 1.1126770106375772, + "grad_norm": 5.424148082733154, + "learning_rate": 7.854610171811897e-05, + "loss": 0.6133, + "step": 16422 + }, + { + "epoch": 1.1127447659055492, + "grad_norm": 6.452503204345703, + "learning_rate": 7.854473269902115e-05, + "loss": 0.523, + "step": 16423 + }, + { + "epoch": 1.1128125211735211, + "grad_norm": 11.88731861114502, + "learning_rate": 7.854336367992334e-05, + "loss": 0.9297, + "step": 16424 + }, + { + "epoch": 1.1128802764414933, + "grad_norm": 6.110743522644043, + "learning_rate": 7.854199466082553e-05, + "loss": 0.5357, + "step": 16425 + }, + { + "epoch": 1.1129480317094653, + "grad_norm": 5.067276477813721, + "learning_rate": 7.85406256417277e-05, + "loss": 0.5576, + "step": 16426 + }, + { + "epoch": 1.1130157869774375, + "grad_norm": 6.3760552406311035, + "learning_rate": 7.853925662262989e-05, + "loss": 0.6453, + "step": 16427 + }, + { + "epoch": 1.1130835422454095, + "grad_norm": 7.936148166656494, + "learning_rate": 7.853788760353207e-05, + "loss": 0.8744, + "step": 16428 + }, + { + "epoch": 1.1131512975133817, + "grad_norm": 4.137428283691406, + "learning_rate": 7.853651858443426e-05, + "loss": 0.5353, + "step": 16429 + }, + { + "epoch": 1.1132190527813537, + "grad_norm": 6.208136081695557, + "learning_rate": 7.853514956533644e-05, + "loss": 0.7925, + "step": 16430 + }, + { + "epoch": 1.113286808049326, + "grad_norm": 7.687192440032959, + "learning_rate": 7.853378054623862e-05, + "loss": 0.9704, + "step": 16431 + }, + { + "epoch": 1.1133545633172979, + "grad_norm": 4.639840602874756, + "learning_rate": 7.85324115271408e-05, + "loss": 0.6534, + "step": 16432 + }, + { + "epoch": 1.11342231858527, + "grad_norm": 6.294159889221191, + "learning_rate": 7.8531042508043e-05, + "loss": 0.8825, + "step": 16433 + }, + { + "epoch": 1.113490073853242, + "grad_norm": 7.234892845153809, + "learning_rate": 7.852967348894518e-05, + "loss": 0.6695, + "step": 16434 + }, + { + "epoch": 1.1135578291212143, + "grad_norm": 7.041029453277588, + "learning_rate": 7.852830446984736e-05, + "loss": 0.8467, + "step": 16435 + }, + { + "epoch": 1.1136255843891862, + "grad_norm": 8.180865287780762, + "learning_rate": 7.852693545074954e-05, + "loss": 0.5544, + "step": 16436 + }, + { + "epoch": 1.1136933396571584, + "grad_norm": 5.615832328796387, + "learning_rate": 7.852556643165172e-05, + "loss": 0.788, + "step": 16437 + }, + { + "epoch": 1.1137610949251304, + "grad_norm": 5.674948692321777, + "learning_rate": 7.852419741255391e-05, + "loss": 0.6344, + "step": 16438 + }, + { + "epoch": 1.1138288501931024, + "grad_norm": 5.536764621734619, + "learning_rate": 7.852282839345609e-05, + "loss": 0.6614, + "step": 16439 + }, + { + "epoch": 1.1138966054610746, + "grad_norm": 5.49966287612915, + "learning_rate": 7.852145937435827e-05, + "loss": 0.5425, + "step": 16440 + }, + { + "epoch": 1.1139643607290466, + "grad_norm": 5.57595157623291, + "learning_rate": 7.852009035526045e-05, + "loss": 0.5386, + "step": 16441 + }, + { + "epoch": 1.1140321159970188, + "grad_norm": 4.964458465576172, + "learning_rate": 7.851872133616265e-05, + "loss": 0.5351, + "step": 16442 + }, + { + "epoch": 1.1140998712649908, + "grad_norm": 6.3265509605407715, + "learning_rate": 7.851735231706483e-05, + "loss": 0.6693, + "step": 16443 + }, + { + "epoch": 1.114167626532963, + "grad_norm": 5.302865505218506, + "learning_rate": 7.8515983297967e-05, + "loss": 0.6336, + "step": 16444 + }, + { + "epoch": 1.114235381800935, + "grad_norm": 4.544825553894043, + "learning_rate": 7.851461427886919e-05, + "loss": 0.5943, + "step": 16445 + }, + { + "epoch": 1.1143031370689072, + "grad_norm": 5.0601630210876465, + "learning_rate": 7.851324525977137e-05, + "loss": 0.714, + "step": 16446 + }, + { + "epoch": 1.1143708923368791, + "grad_norm": 4.90870475769043, + "learning_rate": 7.851187624067356e-05, + "loss": 0.6078, + "step": 16447 + }, + { + "epoch": 1.1144386476048513, + "grad_norm": 5.746411323547363, + "learning_rate": 7.851050722157574e-05, + "loss": 0.5527, + "step": 16448 + }, + { + "epoch": 1.1145064028728233, + "grad_norm": 5.825742244720459, + "learning_rate": 7.850913820247792e-05, + "loss": 0.5043, + "step": 16449 + }, + { + "epoch": 1.1145741581407955, + "grad_norm": 6.97534704208374, + "learning_rate": 7.850776918338011e-05, + "loss": 0.8069, + "step": 16450 + }, + { + "epoch": 1.1146419134087675, + "grad_norm": 6.477380752563477, + "learning_rate": 7.85064001642823e-05, + "loss": 0.6987, + "step": 16451 + }, + { + "epoch": 1.1147096686767397, + "grad_norm": 5.29969596862793, + "learning_rate": 7.850503114518448e-05, + "loss": 0.7252, + "step": 16452 + }, + { + "epoch": 1.1147774239447117, + "grad_norm": 5.611807823181152, + "learning_rate": 7.850366212608667e-05, + "loss": 0.6814, + "step": 16453 + }, + { + "epoch": 1.114845179212684, + "grad_norm": 6.4802350997924805, + "learning_rate": 7.850229310698885e-05, + "loss": 0.4992, + "step": 16454 + }, + { + "epoch": 1.1149129344806559, + "grad_norm": 5.166659832000732, + "learning_rate": 7.850092408789103e-05, + "loss": 0.708, + "step": 16455 + }, + { + "epoch": 1.114980689748628, + "grad_norm": 4.782804012298584, + "learning_rate": 7.849955506879322e-05, + "loss": 0.7891, + "step": 16456 + }, + { + "epoch": 1.1150484450166, + "grad_norm": 6.390671730041504, + "learning_rate": 7.84981860496954e-05, + "loss": 0.731, + "step": 16457 + }, + { + "epoch": 1.115116200284572, + "grad_norm": 5.91715145111084, + "learning_rate": 7.849681703059758e-05, + "loss": 0.6395, + "step": 16458 + }, + { + "epoch": 1.1151839555525442, + "grad_norm": 6.513552665710449, + "learning_rate": 7.849544801149977e-05, + "loss": 0.809, + "step": 16459 + }, + { + "epoch": 1.1152517108205162, + "grad_norm": 6.119845390319824, + "learning_rate": 7.849407899240195e-05, + "loss": 0.6678, + "step": 16460 + }, + { + "epoch": 1.1153194660884884, + "grad_norm": 4.332753658294678, + "learning_rate": 7.849270997330414e-05, + "loss": 0.6528, + "step": 16461 + }, + { + "epoch": 1.1153872213564604, + "grad_norm": 5.871253490447998, + "learning_rate": 7.849134095420632e-05, + "loss": 0.6437, + "step": 16462 + }, + { + "epoch": 1.1154549766244326, + "grad_norm": 4.9431047439575195, + "learning_rate": 7.84899719351085e-05, + "loss": 0.4917, + "step": 16463 + }, + { + "epoch": 1.1155227318924046, + "grad_norm": 5.625153064727783, + "learning_rate": 7.848860291601068e-05, + "loss": 0.7017, + "step": 16464 + }, + { + "epoch": 1.1155904871603768, + "grad_norm": 6.678959369659424, + "learning_rate": 7.848723389691287e-05, + "loss": 0.7541, + "step": 16465 + }, + { + "epoch": 1.1156582424283488, + "grad_norm": 5.871247291564941, + "learning_rate": 7.848586487781505e-05, + "loss": 0.564, + "step": 16466 + }, + { + "epoch": 1.115725997696321, + "grad_norm": 4.651440143585205, + "learning_rate": 7.848449585871723e-05, + "loss": 0.5407, + "step": 16467 + }, + { + "epoch": 1.115793752964293, + "grad_norm": 6.9464521408081055, + "learning_rate": 7.848312683961942e-05, + "loss": 0.7581, + "step": 16468 + }, + { + "epoch": 1.1158615082322652, + "grad_norm": 6.172132968902588, + "learning_rate": 7.84817578205216e-05, + "loss": 0.6855, + "step": 16469 + }, + { + "epoch": 1.1159292635002371, + "grad_norm": 6.238826274871826, + "learning_rate": 7.848038880142379e-05, + "loss": 0.5436, + "step": 16470 + }, + { + "epoch": 1.1159970187682093, + "grad_norm": 7.121433734893799, + "learning_rate": 7.847901978232597e-05, + "loss": 0.962, + "step": 16471 + }, + { + "epoch": 1.1160647740361813, + "grad_norm": 6.923501014709473, + "learning_rate": 7.847765076322815e-05, + "loss": 0.6176, + "step": 16472 + }, + { + "epoch": 1.1161325293041533, + "grad_norm": 5.844583511352539, + "learning_rate": 7.847628174413033e-05, + "loss": 0.8946, + "step": 16473 + }, + { + "epoch": 1.1162002845721255, + "grad_norm": 6.436974048614502, + "learning_rate": 7.847491272503252e-05, + "loss": 0.7809, + "step": 16474 + }, + { + "epoch": 1.1162680398400975, + "grad_norm": 5.668829441070557, + "learning_rate": 7.84735437059347e-05, + "loss": 0.7343, + "step": 16475 + }, + { + "epoch": 1.1163357951080697, + "grad_norm": 4.371751308441162, + "learning_rate": 7.847217468683689e-05, + "loss": 0.5913, + "step": 16476 + }, + { + "epoch": 1.1164035503760417, + "grad_norm": 5.785346031188965, + "learning_rate": 7.847080566773907e-05, + "loss": 0.5453, + "step": 16477 + }, + { + "epoch": 1.1164713056440139, + "grad_norm": 6.657867431640625, + "learning_rate": 7.846943664864125e-05, + "loss": 0.9195, + "step": 16478 + }, + { + "epoch": 1.1165390609119858, + "grad_norm": 7.955598831176758, + "learning_rate": 7.846806762954344e-05, + "loss": 0.7671, + "step": 16479 + }, + { + "epoch": 1.116606816179958, + "grad_norm": 6.41574764251709, + "learning_rate": 7.846669861044562e-05, + "loss": 0.6941, + "step": 16480 + }, + { + "epoch": 1.11667457144793, + "grad_norm": 5.509103775024414, + "learning_rate": 7.84653295913478e-05, + "loss": 0.6792, + "step": 16481 + }, + { + "epoch": 1.1167423267159022, + "grad_norm": 7.486603260040283, + "learning_rate": 7.846396057224998e-05, + "loss": 0.6783, + "step": 16482 + }, + { + "epoch": 1.1168100819838742, + "grad_norm": 5.092047214508057, + "learning_rate": 7.846259155315216e-05, + "loss": 0.5798, + "step": 16483 + }, + { + "epoch": 1.1168778372518464, + "grad_norm": 6.867835998535156, + "learning_rate": 7.846122253405435e-05, + "loss": 0.6381, + "step": 16484 + }, + { + "epoch": 1.1169455925198184, + "grad_norm": 4.400048732757568, + "learning_rate": 7.845985351495654e-05, + "loss": 0.5484, + "step": 16485 + }, + { + "epoch": 1.1170133477877906, + "grad_norm": 5.648205757141113, + "learning_rate": 7.845848449585872e-05, + "loss": 0.7163, + "step": 16486 + }, + { + "epoch": 1.1170811030557626, + "grad_norm": 5.113688945770264, + "learning_rate": 7.84571154767609e-05, + "loss": 0.6432, + "step": 16487 + }, + { + "epoch": 1.1171488583237346, + "grad_norm": 9.245594024658203, + "learning_rate": 7.845574645766309e-05, + "loss": 1.042, + "step": 16488 + }, + { + "epoch": 1.1172166135917068, + "grad_norm": 5.310211181640625, + "learning_rate": 7.845437743856527e-05, + "loss": 0.7199, + "step": 16489 + }, + { + "epoch": 1.1172843688596787, + "grad_norm": 5.872876167297363, + "learning_rate": 7.845300841946745e-05, + "loss": 0.6653, + "step": 16490 + }, + { + "epoch": 1.117352124127651, + "grad_norm": 6.042059421539307, + "learning_rate": 7.845163940036963e-05, + "loss": 0.7398, + "step": 16491 + }, + { + "epoch": 1.117419879395623, + "grad_norm": 6.278406143188477, + "learning_rate": 7.845027038127181e-05, + "loss": 0.6938, + "step": 16492 + }, + { + "epoch": 1.1174876346635951, + "grad_norm": 4.715973854064941, + "learning_rate": 7.8448901362174e-05, + "loss": 0.6145, + "step": 16493 + }, + { + "epoch": 1.1175553899315671, + "grad_norm": 5.362488269805908, + "learning_rate": 7.844753234307619e-05, + "loss": 0.7129, + "step": 16494 + }, + { + "epoch": 1.1176231451995393, + "grad_norm": 5.5275187492370605, + "learning_rate": 7.844616332397837e-05, + "loss": 0.519, + "step": 16495 + }, + { + "epoch": 1.1176909004675113, + "grad_norm": 4.727498531341553, + "learning_rate": 7.844479430488056e-05, + "loss": 0.6799, + "step": 16496 + }, + { + "epoch": 1.1177586557354835, + "grad_norm": 5.0060200691223145, + "learning_rate": 7.844342528578274e-05, + "loss": 0.6559, + "step": 16497 + }, + { + "epoch": 1.1178264110034555, + "grad_norm": 8.323198318481445, + "learning_rate": 7.844205626668492e-05, + "loss": 0.574, + "step": 16498 + }, + { + "epoch": 1.1178941662714277, + "grad_norm": 5.619205951690674, + "learning_rate": 7.844068724758711e-05, + "loss": 0.7148, + "step": 16499 + }, + { + "epoch": 1.1179619215393997, + "grad_norm": 6.951564788818359, + "learning_rate": 7.84393182284893e-05, + "loss": 0.6269, + "step": 16500 + }, + { + "epoch": 1.1180296768073719, + "grad_norm": 5.056802272796631, + "learning_rate": 7.843794920939147e-05, + "loss": 0.8339, + "step": 16501 + }, + { + "epoch": 1.1180974320753438, + "grad_norm": 6.4941792488098145, + "learning_rate": 7.843658019029367e-05, + "loss": 0.7636, + "step": 16502 + }, + { + "epoch": 1.1181651873433158, + "grad_norm": 6.141055583953857, + "learning_rate": 7.843521117119585e-05, + "loss": 0.7537, + "step": 16503 + }, + { + "epoch": 1.118232942611288, + "grad_norm": 7.453089237213135, + "learning_rate": 7.843384215209803e-05, + "loss": 0.8046, + "step": 16504 + }, + { + "epoch": 1.1183006978792602, + "grad_norm": 7.412580490112305, + "learning_rate": 7.843247313300021e-05, + "loss": 0.6542, + "step": 16505 + }, + { + "epoch": 1.1183684531472322, + "grad_norm": 6.80420446395874, + "learning_rate": 7.843110411390239e-05, + "loss": 0.6983, + "step": 16506 + }, + { + "epoch": 1.1184362084152042, + "grad_norm": 5.158776760101318, + "learning_rate": 7.842973509480458e-05, + "loss": 0.5782, + "step": 16507 + }, + { + "epoch": 1.1185039636831764, + "grad_norm": 6.552058219909668, + "learning_rate": 7.842836607570676e-05, + "loss": 0.9792, + "step": 16508 + }, + { + "epoch": 1.1185717189511484, + "grad_norm": 4.8856024742126465, + "learning_rate": 7.842699705660894e-05, + "loss": 0.6559, + "step": 16509 + }, + { + "epoch": 1.1186394742191206, + "grad_norm": 5.895238399505615, + "learning_rate": 7.842562803751113e-05, + "loss": 0.6988, + "step": 16510 + }, + { + "epoch": 1.1187072294870926, + "grad_norm": 10.207904815673828, + "learning_rate": 7.842425901841332e-05, + "loss": 0.9168, + "step": 16511 + }, + { + "epoch": 1.1187749847550648, + "grad_norm": 4.816634654998779, + "learning_rate": 7.84228899993155e-05, + "loss": 0.7268, + "step": 16512 + }, + { + "epoch": 1.1188427400230367, + "grad_norm": 7.246109485626221, + "learning_rate": 7.842152098021768e-05, + "loss": 0.8063, + "step": 16513 + }, + { + "epoch": 1.118910495291009, + "grad_norm": 7.619381427764893, + "learning_rate": 7.842015196111986e-05, + "loss": 0.8026, + "step": 16514 + }, + { + "epoch": 1.118978250558981, + "grad_norm": 8.938972473144531, + "learning_rate": 7.841878294202204e-05, + "loss": 1.2743, + "step": 16515 + }, + { + "epoch": 1.1190460058269531, + "grad_norm": 7.479329586029053, + "learning_rate": 7.841741392292423e-05, + "loss": 0.6163, + "step": 16516 + }, + { + "epoch": 1.119113761094925, + "grad_norm": 5.318511009216309, + "learning_rate": 7.841604490382641e-05, + "loss": 0.7075, + "step": 16517 + }, + { + "epoch": 1.1191815163628973, + "grad_norm": 6.804056644439697, + "learning_rate": 7.84146758847286e-05, + "loss": 0.81, + "step": 16518 + }, + { + "epoch": 1.1192492716308693, + "grad_norm": 5.385700702667236, + "learning_rate": 7.841330686563078e-05, + "loss": 0.7378, + "step": 16519 + }, + { + "epoch": 1.1193170268988415, + "grad_norm": 7.53215217590332, + "learning_rate": 7.841193784653297e-05, + "loss": 0.6082, + "step": 16520 + }, + { + "epoch": 1.1193847821668135, + "grad_norm": 8.614946365356445, + "learning_rate": 7.841056882743515e-05, + "loss": 0.6866, + "step": 16521 + }, + { + "epoch": 1.1194525374347855, + "grad_norm": 6.260163307189941, + "learning_rate": 7.840919980833733e-05, + "loss": 0.6287, + "step": 16522 + }, + { + "epoch": 1.1195202927027577, + "grad_norm": 9.336182594299316, + "learning_rate": 7.840783078923951e-05, + "loss": 0.7944, + "step": 16523 + }, + { + "epoch": 1.1195880479707296, + "grad_norm": 10.182073593139648, + "learning_rate": 7.840646177014169e-05, + "loss": 0.6368, + "step": 16524 + }, + { + "epoch": 1.1196558032387018, + "grad_norm": 5.3863420486450195, + "learning_rate": 7.840509275104388e-05, + "loss": 0.7848, + "step": 16525 + }, + { + "epoch": 1.1197235585066738, + "grad_norm": 4.769381999969482, + "learning_rate": 7.840372373194606e-05, + "loss": 0.5335, + "step": 16526 + }, + { + "epoch": 1.119791313774646, + "grad_norm": 8.063030242919922, + "learning_rate": 7.840235471284825e-05, + "loss": 0.7534, + "step": 16527 + }, + { + "epoch": 1.119859069042618, + "grad_norm": 6.1403045654296875, + "learning_rate": 7.840098569375043e-05, + "loss": 0.8089, + "step": 16528 + }, + { + "epoch": 1.1199268243105902, + "grad_norm": 7.606939792633057, + "learning_rate": 7.839961667465262e-05, + "loss": 0.7578, + "step": 16529 + }, + { + "epoch": 1.1199945795785622, + "grad_norm": 10.877296447753906, + "learning_rate": 7.83982476555548e-05, + "loss": 0.54, + "step": 16530 + }, + { + "epoch": 1.1200623348465344, + "grad_norm": 5.781667709350586, + "learning_rate": 7.839687863645698e-05, + "loss": 0.8624, + "step": 16531 + }, + { + "epoch": 1.1201300901145064, + "grad_norm": 4.3829827308654785, + "learning_rate": 7.839550961735916e-05, + "loss": 0.4979, + "step": 16532 + }, + { + "epoch": 1.1201978453824786, + "grad_norm": 4.735124111175537, + "learning_rate": 7.839414059826134e-05, + "loss": 0.6497, + "step": 16533 + }, + { + "epoch": 1.1202656006504506, + "grad_norm": 5.546881198883057, + "learning_rate": 7.839277157916353e-05, + "loss": 0.6842, + "step": 16534 + }, + { + "epoch": 1.1203333559184228, + "grad_norm": 4.9232177734375, + "learning_rate": 7.839140256006571e-05, + "loss": 0.5805, + "step": 16535 + }, + { + "epoch": 1.1204011111863947, + "grad_norm": 4.378359794616699, + "learning_rate": 7.83900335409679e-05, + "loss": 0.6249, + "step": 16536 + }, + { + "epoch": 1.1204688664543667, + "grad_norm": 5.407273769378662, + "learning_rate": 7.838866452187008e-05, + "loss": 0.8445, + "step": 16537 + }, + { + "epoch": 1.120536621722339, + "grad_norm": 4.610024452209473, + "learning_rate": 7.838729550277226e-05, + "loss": 0.518, + "step": 16538 + }, + { + "epoch": 1.120604376990311, + "grad_norm": 11.108832359313965, + "learning_rate": 7.838592648367445e-05, + "loss": 1.08, + "step": 16539 + }, + { + "epoch": 1.120672132258283, + "grad_norm": 4.610969066619873, + "learning_rate": 7.838455746457663e-05, + "loss": 0.5164, + "step": 16540 + }, + { + "epoch": 1.120739887526255, + "grad_norm": 8.567065238952637, + "learning_rate": 7.838318844547881e-05, + "loss": 1.0346, + "step": 16541 + }, + { + "epoch": 1.1208076427942273, + "grad_norm": 6.773820877075195, + "learning_rate": 7.8381819426381e-05, + "loss": 0.7441, + "step": 16542 + }, + { + "epoch": 1.1208753980621993, + "grad_norm": 9.476489067077637, + "learning_rate": 7.838045040728318e-05, + "loss": 0.5177, + "step": 16543 + }, + { + "epoch": 1.1209431533301715, + "grad_norm": 6.598116397857666, + "learning_rate": 7.837908138818537e-05, + "loss": 0.6661, + "step": 16544 + }, + { + "epoch": 1.1210109085981435, + "grad_norm": 6.180941581726074, + "learning_rate": 7.837771236908756e-05, + "loss": 0.8152, + "step": 16545 + }, + { + "epoch": 1.1210786638661157, + "grad_norm": 6.241885662078857, + "learning_rate": 7.837634334998974e-05, + "loss": 0.8264, + "step": 16546 + }, + { + "epoch": 1.1211464191340876, + "grad_norm": 5.314054489135742, + "learning_rate": 7.837497433089192e-05, + "loss": 0.6123, + "step": 16547 + }, + { + "epoch": 1.1212141744020598, + "grad_norm": 4.802424430847168, + "learning_rate": 7.837360531179411e-05, + "loss": 0.6321, + "step": 16548 + }, + { + "epoch": 1.1212819296700318, + "grad_norm": 5.131801605224609, + "learning_rate": 7.83722362926963e-05, + "loss": 0.5852, + "step": 16549 + }, + { + "epoch": 1.121349684938004, + "grad_norm": 4.5471391677856445, + "learning_rate": 7.837086727359847e-05, + "loss": 0.419, + "step": 16550 + }, + { + "epoch": 1.121417440205976, + "grad_norm": 6.3551225662231445, + "learning_rate": 7.836949825450065e-05, + "loss": 0.6822, + "step": 16551 + }, + { + "epoch": 1.121485195473948, + "grad_norm": 6.698050022125244, + "learning_rate": 7.836812923540285e-05, + "loss": 0.7381, + "step": 16552 + }, + { + "epoch": 1.1215529507419202, + "grad_norm": 5.779103755950928, + "learning_rate": 7.836676021630503e-05, + "loss": 0.7189, + "step": 16553 + }, + { + "epoch": 1.1216207060098922, + "grad_norm": 5.6808552742004395, + "learning_rate": 7.836539119720721e-05, + "loss": 0.6718, + "step": 16554 + }, + { + "epoch": 1.1216884612778644, + "grad_norm": 5.681619644165039, + "learning_rate": 7.836402217810939e-05, + "loss": 0.4688, + "step": 16555 + }, + { + "epoch": 1.1217562165458363, + "grad_norm": 5.486327171325684, + "learning_rate": 7.836265315901157e-05, + "loss": 0.7598, + "step": 16556 + }, + { + "epoch": 1.1218239718138086, + "grad_norm": 5.492034435272217, + "learning_rate": 7.836128413991376e-05, + "loss": 0.8031, + "step": 16557 + }, + { + "epoch": 1.1218917270817805, + "grad_norm": 5.560188293457031, + "learning_rate": 7.835991512081594e-05, + "loss": 0.7689, + "step": 16558 + }, + { + "epoch": 1.1219594823497527, + "grad_norm": 6.246037006378174, + "learning_rate": 7.835854610171812e-05, + "loss": 0.7454, + "step": 16559 + }, + { + "epoch": 1.1220272376177247, + "grad_norm": 4.9940409660339355, + "learning_rate": 7.83571770826203e-05, + "loss": 0.5191, + "step": 16560 + }, + { + "epoch": 1.122094992885697, + "grad_norm": 6.49337100982666, + "learning_rate": 7.835580806352249e-05, + "loss": 0.732, + "step": 16561 + }, + { + "epoch": 1.122162748153669, + "grad_norm": 12.41988468170166, + "learning_rate": 7.835443904442468e-05, + "loss": 0.7257, + "step": 16562 + }, + { + "epoch": 1.122230503421641, + "grad_norm": 6.870421886444092, + "learning_rate": 7.835307002532686e-05, + "loss": 0.6377, + "step": 16563 + }, + { + "epoch": 1.122298258689613, + "grad_norm": 6.671525478363037, + "learning_rate": 7.835170100622904e-05, + "loss": 0.6479, + "step": 16564 + }, + { + "epoch": 1.1223660139575853, + "grad_norm": 5.417186260223389, + "learning_rate": 7.835033198713122e-05, + "loss": 0.9996, + "step": 16565 + }, + { + "epoch": 1.1224337692255573, + "grad_norm": 8.18811321258545, + "learning_rate": 7.834896296803341e-05, + "loss": 0.8235, + "step": 16566 + }, + { + "epoch": 1.1225015244935295, + "grad_norm": 6.063117027282715, + "learning_rate": 7.83475939489356e-05, + "loss": 0.777, + "step": 16567 + }, + { + "epoch": 1.1225692797615014, + "grad_norm": 4.244666576385498, + "learning_rate": 7.834622492983777e-05, + "loss": 0.4565, + "step": 16568 + }, + { + "epoch": 1.1226370350294737, + "grad_norm": 5.733912467956543, + "learning_rate": 7.834485591073995e-05, + "loss": 0.6999, + "step": 16569 + }, + { + "epoch": 1.1227047902974456, + "grad_norm": 5.080024719238281, + "learning_rate": 7.834348689164214e-05, + "loss": 0.5927, + "step": 16570 + }, + { + "epoch": 1.1227725455654176, + "grad_norm": 5.610750198364258, + "learning_rate": 7.834211787254433e-05, + "loss": 0.6637, + "step": 16571 + }, + { + "epoch": 1.1228403008333898, + "grad_norm": 5.9720635414123535, + "learning_rate": 7.834074885344651e-05, + "loss": 0.8085, + "step": 16572 + }, + { + "epoch": 1.1229080561013618, + "grad_norm": 5.229776859283447, + "learning_rate": 7.833937983434869e-05, + "loss": 0.5649, + "step": 16573 + }, + { + "epoch": 1.122975811369334, + "grad_norm": 5.457815647125244, + "learning_rate": 7.833801081525087e-05, + "loss": 0.6228, + "step": 16574 + }, + { + "epoch": 1.123043566637306, + "grad_norm": 5.012820243835449, + "learning_rate": 7.833664179615306e-05, + "loss": 0.6635, + "step": 16575 + }, + { + "epoch": 1.1231113219052782, + "grad_norm": 5.759067535400391, + "learning_rate": 7.833527277705524e-05, + "loss": 0.634, + "step": 16576 + }, + { + "epoch": 1.1231790771732502, + "grad_norm": 9.28831958770752, + "learning_rate": 7.833390375795742e-05, + "loss": 0.7987, + "step": 16577 + }, + { + "epoch": 1.1232468324412224, + "grad_norm": 6.013178825378418, + "learning_rate": 7.83325347388596e-05, + "loss": 0.6997, + "step": 16578 + }, + { + "epoch": 1.1233145877091943, + "grad_norm": 5.971577167510986, + "learning_rate": 7.833116571976179e-05, + "loss": 0.6976, + "step": 16579 + }, + { + "epoch": 1.1233823429771665, + "grad_norm": 7.694915771484375, + "learning_rate": 7.832979670066398e-05, + "loss": 0.9614, + "step": 16580 + }, + { + "epoch": 1.1234500982451385, + "grad_norm": 5.692512512207031, + "learning_rate": 7.832842768156616e-05, + "loss": 0.7804, + "step": 16581 + }, + { + "epoch": 1.1235178535131107, + "grad_norm": 4.4728312492370605, + "learning_rate": 7.832705866246834e-05, + "loss": 0.5845, + "step": 16582 + }, + { + "epoch": 1.1235856087810827, + "grad_norm": 5.194689750671387, + "learning_rate": 7.832568964337052e-05, + "loss": 0.5488, + "step": 16583 + }, + { + "epoch": 1.123653364049055, + "grad_norm": 5.546979904174805, + "learning_rate": 7.832432062427271e-05, + "loss": 0.6886, + "step": 16584 + }, + { + "epoch": 1.123721119317027, + "grad_norm": 5.987720489501953, + "learning_rate": 7.83229516051749e-05, + "loss": 0.8281, + "step": 16585 + }, + { + "epoch": 1.1237888745849989, + "grad_norm": 6.79714822769165, + "learning_rate": 7.832158258607707e-05, + "loss": 0.7205, + "step": 16586 + }, + { + "epoch": 1.123856629852971, + "grad_norm": 5.197304725646973, + "learning_rate": 7.832021356697926e-05, + "loss": 0.5863, + "step": 16587 + }, + { + "epoch": 1.123924385120943, + "grad_norm": 5.661016941070557, + "learning_rate": 7.831884454788145e-05, + "loss": 0.7591, + "step": 16588 + }, + { + "epoch": 1.1239921403889153, + "grad_norm": 7.9939494132995605, + "learning_rate": 7.831747552878363e-05, + "loss": 0.5234, + "step": 16589 + }, + { + "epoch": 1.1240598956568872, + "grad_norm": 4.955355644226074, + "learning_rate": 7.831610650968581e-05, + "loss": 0.5613, + "step": 16590 + }, + { + "epoch": 1.1241276509248594, + "grad_norm": 6.7157883644104, + "learning_rate": 7.8314737490588e-05, + "loss": 0.9179, + "step": 16591 + }, + { + "epoch": 1.1241954061928314, + "grad_norm": 5.742237091064453, + "learning_rate": 7.831336847149018e-05, + "loss": 0.5604, + "step": 16592 + }, + { + "epoch": 1.1242631614608036, + "grad_norm": 5.890753746032715, + "learning_rate": 7.831199945239236e-05, + "loss": 0.6248, + "step": 16593 + }, + { + "epoch": 1.1243309167287756, + "grad_norm": 6.5256242752075195, + "learning_rate": 7.831063043329456e-05, + "loss": 0.9446, + "step": 16594 + }, + { + "epoch": 1.1243986719967478, + "grad_norm": 7.787291526794434, + "learning_rate": 7.830926141419674e-05, + "loss": 0.7736, + "step": 16595 + }, + { + "epoch": 1.1244664272647198, + "grad_norm": 7.017754077911377, + "learning_rate": 7.830789239509892e-05, + "loss": 0.4302, + "step": 16596 + }, + { + "epoch": 1.124534182532692, + "grad_norm": 5.535482406616211, + "learning_rate": 7.83065233760011e-05, + "loss": 0.8716, + "step": 16597 + }, + { + "epoch": 1.124601937800664, + "grad_norm": 4.967796802520752, + "learning_rate": 7.83051543569033e-05, + "loss": 0.639, + "step": 16598 + }, + { + "epoch": 1.1246696930686362, + "grad_norm": 11.574756622314453, + "learning_rate": 7.830378533780547e-05, + "loss": 0.6214, + "step": 16599 + }, + { + "epoch": 1.1247374483366082, + "grad_norm": 5.107860088348389, + "learning_rate": 7.830241631870765e-05, + "loss": 0.5664, + "step": 16600 + }, + { + "epoch": 1.1248052036045801, + "grad_norm": 6.146525859832764, + "learning_rate": 7.830104729960983e-05, + "loss": 0.7694, + "step": 16601 + }, + { + "epoch": 1.1248729588725523, + "grad_norm": 5.966607570648193, + "learning_rate": 7.829967828051201e-05, + "loss": 0.6571, + "step": 16602 + }, + { + "epoch": 1.1249407141405243, + "grad_norm": 6.187711238861084, + "learning_rate": 7.829830926141421e-05, + "loss": 1.0278, + "step": 16603 + }, + { + "epoch": 1.1250084694084965, + "grad_norm": 7.682037353515625, + "learning_rate": 7.829694024231639e-05, + "loss": 0.6538, + "step": 16604 + }, + { + "epoch": 1.1250762246764685, + "grad_norm": 7.3382415771484375, + "learning_rate": 7.829557122321857e-05, + "loss": 0.6901, + "step": 16605 + }, + { + "epoch": 1.1251439799444407, + "grad_norm": 6.714434623718262, + "learning_rate": 7.829420220412075e-05, + "loss": 0.5718, + "step": 16606 + }, + { + "epoch": 1.1252117352124127, + "grad_norm": 6.157127857208252, + "learning_rate": 7.829283318502294e-05, + "loss": 0.7647, + "step": 16607 + }, + { + "epoch": 1.125279490480385, + "grad_norm": 5.902122497558594, + "learning_rate": 7.829146416592512e-05, + "loss": 0.7683, + "step": 16608 + }, + { + "epoch": 1.1253472457483569, + "grad_norm": 5.676656723022461, + "learning_rate": 7.82900951468273e-05, + "loss": 0.6912, + "step": 16609 + }, + { + "epoch": 1.125415001016329, + "grad_norm": 5.85725736618042, + "learning_rate": 7.828872612772948e-05, + "loss": 0.5421, + "step": 16610 + }, + { + "epoch": 1.125482756284301, + "grad_norm": 6.861623287200928, + "learning_rate": 7.828735710863166e-05, + "loss": 0.5735, + "step": 16611 + }, + { + "epoch": 1.1255505115522733, + "grad_norm": 4.622976779937744, + "learning_rate": 7.828598808953386e-05, + "loss": 0.6974, + "step": 16612 + }, + { + "epoch": 1.1256182668202452, + "grad_norm": 5.978318691253662, + "learning_rate": 7.828461907043604e-05, + "loss": 0.7616, + "step": 16613 + }, + { + "epoch": 1.1256860220882174, + "grad_norm": 8.068818092346191, + "learning_rate": 7.828325005133822e-05, + "loss": 0.8741, + "step": 16614 + }, + { + "epoch": 1.1257537773561894, + "grad_norm": 5.705319404602051, + "learning_rate": 7.82818810322404e-05, + "loss": 0.604, + "step": 16615 + }, + { + "epoch": 1.1258215326241616, + "grad_norm": 6.627946853637695, + "learning_rate": 7.828051201314258e-05, + "loss": 0.8083, + "step": 16616 + }, + { + "epoch": 1.1258892878921336, + "grad_norm": 5.951031684875488, + "learning_rate": 7.827914299404477e-05, + "loss": 0.7051, + "step": 16617 + }, + { + "epoch": 1.1259570431601058, + "grad_norm": 5.129096031188965, + "learning_rate": 7.827777397494695e-05, + "loss": 0.7117, + "step": 16618 + }, + { + "epoch": 1.1260247984280778, + "grad_norm": 7.520353317260742, + "learning_rate": 7.827640495584913e-05, + "loss": 0.6925, + "step": 16619 + }, + { + "epoch": 1.1260925536960498, + "grad_norm": 6.071813106536865, + "learning_rate": 7.827503593675131e-05, + "loss": 0.6316, + "step": 16620 + }, + { + "epoch": 1.126160308964022, + "grad_norm": 5.8903679847717285, + "learning_rate": 7.827366691765351e-05, + "loss": 0.6882, + "step": 16621 + }, + { + "epoch": 1.126228064231994, + "grad_norm": 5.252291202545166, + "learning_rate": 7.827229789855569e-05, + "loss": 0.7473, + "step": 16622 + }, + { + "epoch": 1.1262958194999662, + "grad_norm": 6.164458274841309, + "learning_rate": 7.827092887945787e-05, + "loss": 0.7696, + "step": 16623 + }, + { + "epoch": 1.1263635747679381, + "grad_norm": 5.674738883972168, + "learning_rate": 7.826955986036005e-05, + "loss": 0.6838, + "step": 16624 + }, + { + "epoch": 1.1264313300359103, + "grad_norm": 6.459506511688232, + "learning_rate": 7.826819084126223e-05, + "loss": 0.5691, + "step": 16625 + }, + { + "epoch": 1.1264990853038823, + "grad_norm": 8.24375057220459, + "learning_rate": 7.826682182216442e-05, + "loss": 0.5813, + "step": 16626 + }, + { + "epoch": 1.1265668405718545, + "grad_norm": 6.915633201599121, + "learning_rate": 7.82654528030666e-05, + "loss": 0.6045, + "step": 16627 + }, + { + "epoch": 1.1266345958398265, + "grad_norm": 7.0347394943237305, + "learning_rate": 7.826408378396878e-05, + "loss": 1.0053, + "step": 16628 + }, + { + "epoch": 1.1267023511077987, + "grad_norm": 9.013284683227539, + "learning_rate": 7.826271476487097e-05, + "loss": 0.5846, + "step": 16629 + }, + { + "epoch": 1.1267701063757707, + "grad_norm": 5.217116832733154, + "learning_rate": 7.826134574577316e-05, + "loss": 0.5151, + "step": 16630 + }, + { + "epoch": 1.1268378616437429, + "grad_norm": 4.9722466468811035, + "learning_rate": 7.825997672667534e-05, + "loss": 0.6434, + "step": 16631 + }, + { + "epoch": 1.1269056169117149, + "grad_norm": 4.9107666015625, + "learning_rate": 7.825860770757752e-05, + "loss": 0.5564, + "step": 16632 + }, + { + "epoch": 1.126973372179687, + "grad_norm": 5.44745397567749, + "learning_rate": 7.82572386884797e-05, + "loss": 0.5663, + "step": 16633 + }, + { + "epoch": 1.127041127447659, + "grad_norm": 5.827592849731445, + "learning_rate": 7.825586966938188e-05, + "loss": 0.5999, + "step": 16634 + }, + { + "epoch": 1.127108882715631, + "grad_norm": 6.206888198852539, + "learning_rate": 7.825450065028407e-05, + "loss": 0.6755, + "step": 16635 + }, + { + "epoch": 1.1271766379836032, + "grad_norm": 6.200521469116211, + "learning_rate": 7.825313163118625e-05, + "loss": 0.5529, + "step": 16636 + }, + { + "epoch": 1.1272443932515752, + "grad_norm": 5.36111307144165, + "learning_rate": 7.825176261208845e-05, + "loss": 0.762, + "step": 16637 + }, + { + "epoch": 1.1273121485195474, + "grad_norm": 4.9850664138793945, + "learning_rate": 7.825039359299063e-05, + "loss": 0.818, + "step": 16638 + }, + { + "epoch": 1.1273799037875194, + "grad_norm": 5.960310459136963, + "learning_rate": 7.824902457389281e-05, + "loss": 0.8029, + "step": 16639 + }, + { + "epoch": 1.1274476590554916, + "grad_norm": 4.687396049499512, + "learning_rate": 7.8247655554795e-05, + "loss": 0.5797, + "step": 16640 + }, + { + "epoch": 1.1275154143234636, + "grad_norm": 4.676320552825928, + "learning_rate": 7.824628653569718e-05, + "loss": 0.6945, + "step": 16641 + }, + { + "epoch": 1.1275831695914358, + "grad_norm": 6.536602020263672, + "learning_rate": 7.824491751659936e-05, + "loss": 0.7977, + "step": 16642 + }, + { + "epoch": 1.1276509248594078, + "grad_norm": 6.302074432373047, + "learning_rate": 7.824354849750154e-05, + "loss": 0.9523, + "step": 16643 + }, + { + "epoch": 1.12771868012738, + "grad_norm": 7.156180381774902, + "learning_rate": 7.824217947840374e-05, + "loss": 0.6981, + "step": 16644 + }, + { + "epoch": 1.127786435395352, + "grad_norm": 7.256894588470459, + "learning_rate": 7.824081045930592e-05, + "loss": 0.8173, + "step": 16645 + }, + { + "epoch": 1.1278541906633242, + "grad_norm": 9.1759033203125, + "learning_rate": 7.82394414402081e-05, + "loss": 0.6992, + "step": 16646 + }, + { + "epoch": 1.1279219459312961, + "grad_norm": 6.0825886726379395, + "learning_rate": 7.823807242111028e-05, + "loss": 0.837, + "step": 16647 + }, + { + "epoch": 1.1279897011992683, + "grad_norm": 7.240145683288574, + "learning_rate": 7.823670340201246e-05, + "loss": 0.6456, + "step": 16648 + }, + { + "epoch": 1.1280574564672403, + "grad_norm": 4.927544116973877, + "learning_rate": 7.823533438291465e-05, + "loss": 0.7218, + "step": 16649 + }, + { + "epoch": 1.1281252117352123, + "grad_norm": 9.792190551757812, + "learning_rate": 7.823396536381683e-05, + "loss": 0.7131, + "step": 16650 + }, + { + "epoch": 1.1281929670031845, + "grad_norm": 6.260951042175293, + "learning_rate": 7.823259634471901e-05, + "loss": 0.6938, + "step": 16651 + }, + { + "epoch": 1.1282607222711567, + "grad_norm": 5.710566997528076, + "learning_rate": 7.82312273256212e-05, + "loss": 0.6371, + "step": 16652 + }, + { + "epoch": 1.1283284775391287, + "grad_norm": 5.180080413818359, + "learning_rate": 7.822985830652339e-05, + "loss": 0.7467, + "step": 16653 + }, + { + "epoch": 1.1283962328071007, + "grad_norm": 5.3070149421691895, + "learning_rate": 7.822848928742557e-05, + "loss": 0.5804, + "step": 16654 + }, + { + "epoch": 1.1284639880750729, + "grad_norm": 5.685241222381592, + "learning_rate": 7.822712026832775e-05, + "loss": 0.74, + "step": 16655 + }, + { + "epoch": 1.1285317433430448, + "grad_norm": 5.124477386474609, + "learning_rate": 7.822575124922993e-05, + "loss": 0.56, + "step": 16656 + }, + { + "epoch": 1.128599498611017, + "grad_norm": 4.9993577003479, + "learning_rate": 7.822438223013211e-05, + "loss": 0.628, + "step": 16657 + }, + { + "epoch": 1.128667253878989, + "grad_norm": 5.491091251373291, + "learning_rate": 7.82230132110343e-05, + "loss": 0.7368, + "step": 16658 + }, + { + "epoch": 1.1287350091469612, + "grad_norm": 5.759190559387207, + "learning_rate": 7.822164419193648e-05, + "loss": 0.6428, + "step": 16659 + }, + { + "epoch": 1.1288027644149332, + "grad_norm": 6.618605613708496, + "learning_rate": 7.822027517283866e-05, + "loss": 0.695, + "step": 16660 + }, + { + "epoch": 1.1288705196829054, + "grad_norm": 5.8363118171691895, + "learning_rate": 7.821890615374084e-05, + "loss": 0.6375, + "step": 16661 + }, + { + "epoch": 1.1289382749508774, + "grad_norm": 6.037831783294678, + "learning_rate": 7.821753713464304e-05, + "loss": 0.6392, + "step": 16662 + }, + { + "epoch": 1.1290060302188496, + "grad_norm": 6.488530158996582, + "learning_rate": 7.821616811554522e-05, + "loss": 0.6329, + "step": 16663 + }, + { + "epoch": 1.1290737854868216, + "grad_norm": 6.927813529968262, + "learning_rate": 7.82147990964474e-05, + "loss": 0.7017, + "step": 16664 + }, + { + "epoch": 1.1291415407547938, + "grad_norm": 7.420223712921143, + "learning_rate": 7.821343007734958e-05, + "loss": 0.9947, + "step": 16665 + }, + { + "epoch": 1.1292092960227658, + "grad_norm": 6.57767915725708, + "learning_rate": 7.821206105825176e-05, + "loss": 0.7923, + "step": 16666 + }, + { + "epoch": 1.129277051290738, + "grad_norm": 5.805924892425537, + "learning_rate": 7.821069203915395e-05, + "loss": 0.7343, + "step": 16667 + }, + { + "epoch": 1.12934480655871, + "grad_norm": 5.582193374633789, + "learning_rate": 7.820932302005613e-05, + "loss": 0.647, + "step": 16668 + }, + { + "epoch": 1.129412561826682, + "grad_norm": 5.6104607582092285, + "learning_rate": 7.820795400095831e-05, + "loss": 0.673, + "step": 16669 + }, + { + "epoch": 1.1294803170946541, + "grad_norm": 6.4317522048950195, + "learning_rate": 7.82065849818605e-05, + "loss": 0.8305, + "step": 16670 + }, + { + "epoch": 1.129548072362626, + "grad_norm": 6.044923305511475, + "learning_rate": 7.820521596276267e-05, + "loss": 0.6202, + "step": 16671 + }, + { + "epoch": 1.1296158276305983, + "grad_norm": 5.044763088226318, + "learning_rate": 7.820384694366487e-05, + "loss": 0.644, + "step": 16672 + }, + { + "epoch": 1.1296835828985703, + "grad_norm": 7.138949871063232, + "learning_rate": 7.820247792456705e-05, + "loss": 0.8183, + "step": 16673 + }, + { + "epoch": 1.1297513381665425, + "grad_norm": 6.46617317199707, + "learning_rate": 7.820110890546923e-05, + "loss": 0.6277, + "step": 16674 + }, + { + "epoch": 1.1298190934345145, + "grad_norm": 7.636822700500488, + "learning_rate": 7.819973988637141e-05, + "loss": 0.6938, + "step": 16675 + }, + { + "epoch": 1.1298868487024867, + "grad_norm": 7.69808292388916, + "learning_rate": 7.81983708672736e-05, + "loss": 0.7331, + "step": 16676 + }, + { + "epoch": 1.1299546039704587, + "grad_norm": 6.143553733825684, + "learning_rate": 7.819700184817578e-05, + "loss": 0.8705, + "step": 16677 + }, + { + "epoch": 1.1300223592384309, + "grad_norm": 6.685749530792236, + "learning_rate": 7.819563282907796e-05, + "loss": 0.7717, + "step": 16678 + }, + { + "epoch": 1.1300901145064028, + "grad_norm": 6.317489147186279, + "learning_rate": 7.819426380998014e-05, + "loss": 0.6593, + "step": 16679 + }, + { + "epoch": 1.130157869774375, + "grad_norm": 5.497630596160889, + "learning_rate": 7.819289479088233e-05, + "loss": 0.7693, + "step": 16680 + }, + { + "epoch": 1.130225625042347, + "grad_norm": 7.087355136871338, + "learning_rate": 7.819152577178452e-05, + "loss": 0.5709, + "step": 16681 + }, + { + "epoch": 1.1302933803103192, + "grad_norm": 9.138280868530273, + "learning_rate": 7.81901567526867e-05, + "loss": 0.9426, + "step": 16682 + }, + { + "epoch": 1.1303611355782912, + "grad_norm": 8.925271987915039, + "learning_rate": 7.818878773358888e-05, + "loss": 0.7462, + "step": 16683 + }, + { + "epoch": 1.1304288908462632, + "grad_norm": 5.355180263519287, + "learning_rate": 7.818741871449107e-05, + "loss": 0.6806, + "step": 16684 + }, + { + "epoch": 1.1304966461142354, + "grad_norm": 4.724582195281982, + "learning_rate": 7.818604969539325e-05, + "loss": 0.6157, + "step": 16685 + }, + { + "epoch": 1.1305644013822074, + "grad_norm": 7.584688186645508, + "learning_rate": 7.818468067629543e-05, + "loss": 0.6655, + "step": 16686 + }, + { + "epoch": 1.1306321566501796, + "grad_norm": 5.311779022216797, + "learning_rate": 7.818331165719763e-05, + "loss": 0.6513, + "step": 16687 + }, + { + "epoch": 1.1306999119181516, + "grad_norm": 5.313599586486816, + "learning_rate": 7.818194263809981e-05, + "loss": 0.6251, + "step": 16688 + }, + { + "epoch": 1.1307676671861238, + "grad_norm": 4.748965740203857, + "learning_rate": 7.818057361900199e-05, + "loss": 0.5648, + "step": 16689 + }, + { + "epoch": 1.1308354224540957, + "grad_norm": 5.075737476348877, + "learning_rate": 7.817920459990418e-05, + "loss": 0.6007, + "step": 16690 + }, + { + "epoch": 1.130903177722068, + "grad_norm": 6.251148700714111, + "learning_rate": 7.817783558080636e-05, + "loss": 0.635, + "step": 16691 + }, + { + "epoch": 1.13097093299004, + "grad_norm": 8.291831970214844, + "learning_rate": 7.817646656170854e-05, + "loss": 0.7627, + "step": 16692 + }, + { + "epoch": 1.1310386882580121, + "grad_norm": 6.881071090698242, + "learning_rate": 7.817509754261072e-05, + "loss": 0.7503, + "step": 16693 + }, + { + "epoch": 1.131106443525984, + "grad_norm": 7.16835355758667, + "learning_rate": 7.81737285235129e-05, + "loss": 0.7823, + "step": 16694 + }, + { + "epoch": 1.1311741987939563, + "grad_norm": 4.8891096115112305, + "learning_rate": 7.81723595044151e-05, + "loss": 0.6679, + "step": 16695 + }, + { + "epoch": 1.1312419540619283, + "grad_norm": 5.029726505279541, + "learning_rate": 7.817099048531728e-05, + "loss": 0.6262, + "step": 16696 + }, + { + "epoch": 1.1313097093299005, + "grad_norm": 6.097971439361572, + "learning_rate": 7.816962146621946e-05, + "loss": 0.6416, + "step": 16697 + }, + { + "epoch": 1.1313774645978725, + "grad_norm": 6.032967567443848, + "learning_rate": 7.816825244712164e-05, + "loss": 0.7701, + "step": 16698 + }, + { + "epoch": 1.1314452198658445, + "grad_norm": 7.488419055938721, + "learning_rate": 7.816688342802383e-05, + "loss": 0.8387, + "step": 16699 + }, + { + "epoch": 1.1315129751338167, + "grad_norm": 6.614134788513184, + "learning_rate": 7.816551440892601e-05, + "loss": 0.7412, + "step": 16700 + }, + { + "epoch": 1.1315807304017889, + "grad_norm": 4.813358306884766, + "learning_rate": 7.81641453898282e-05, + "loss": 0.7146, + "step": 16701 + }, + { + "epoch": 1.1316484856697608, + "grad_norm": 5.225235939025879, + "learning_rate": 7.816277637073037e-05, + "loss": 0.704, + "step": 16702 + }, + { + "epoch": 1.1317162409377328, + "grad_norm": 6.127220630645752, + "learning_rate": 7.816140735163255e-05, + "loss": 0.7934, + "step": 16703 + }, + { + "epoch": 1.131783996205705, + "grad_norm": 5.334741592407227, + "learning_rate": 7.816003833253475e-05, + "loss": 0.5111, + "step": 16704 + }, + { + "epoch": 1.131851751473677, + "grad_norm": 5.337317943572998, + "learning_rate": 7.815866931343693e-05, + "loss": 0.908, + "step": 16705 + }, + { + "epoch": 1.1319195067416492, + "grad_norm": 7.072608470916748, + "learning_rate": 7.815730029433911e-05, + "loss": 0.884, + "step": 16706 + }, + { + "epoch": 1.1319872620096212, + "grad_norm": 6.734743595123291, + "learning_rate": 7.815593127524129e-05, + "loss": 0.6501, + "step": 16707 + }, + { + "epoch": 1.1320550172775934, + "grad_norm": 6.5684814453125, + "learning_rate": 7.815456225614348e-05, + "loss": 0.9087, + "step": 16708 + }, + { + "epoch": 1.1321227725455654, + "grad_norm": 5.775663375854492, + "learning_rate": 7.815319323704566e-05, + "loss": 0.6729, + "step": 16709 + }, + { + "epoch": 1.1321905278135376, + "grad_norm": 4.196002960205078, + "learning_rate": 7.815182421794784e-05, + "loss": 0.6439, + "step": 16710 + }, + { + "epoch": 1.1322582830815096, + "grad_norm": 9.43163776397705, + "learning_rate": 7.815045519885002e-05, + "loss": 0.72, + "step": 16711 + }, + { + "epoch": 1.1323260383494818, + "grad_norm": 5.663300514221191, + "learning_rate": 7.81490861797522e-05, + "loss": 0.6189, + "step": 16712 + }, + { + "epoch": 1.1323937936174537, + "grad_norm": 5.103667736053467, + "learning_rate": 7.81477171606544e-05, + "loss": 0.9228, + "step": 16713 + }, + { + "epoch": 1.1324615488854257, + "grad_norm": 11.563446044921875, + "learning_rate": 7.814634814155658e-05, + "loss": 0.6651, + "step": 16714 + }, + { + "epoch": 1.132529304153398, + "grad_norm": 6.818861961364746, + "learning_rate": 7.814497912245876e-05, + "loss": 0.6716, + "step": 16715 + }, + { + "epoch": 1.1325970594213701, + "grad_norm": 8.50759506225586, + "learning_rate": 7.814361010336094e-05, + "loss": 0.5372, + "step": 16716 + }, + { + "epoch": 1.132664814689342, + "grad_norm": 5.753479957580566, + "learning_rate": 7.814224108426313e-05, + "loss": 0.7041, + "step": 16717 + }, + { + "epoch": 1.132732569957314, + "grad_norm": 6.033994197845459, + "learning_rate": 7.814087206516531e-05, + "loss": 0.6911, + "step": 16718 + }, + { + "epoch": 1.1328003252252863, + "grad_norm": 5.7770771980285645, + "learning_rate": 7.81395030460675e-05, + "loss": 0.6144, + "step": 16719 + }, + { + "epoch": 1.1328680804932583, + "grad_norm": 6.466103553771973, + "learning_rate": 7.813813402696967e-05, + "loss": 0.822, + "step": 16720 + }, + { + "epoch": 1.1329358357612305, + "grad_norm": 6.284421920776367, + "learning_rate": 7.813676500787185e-05, + "loss": 0.715, + "step": 16721 + }, + { + "epoch": 1.1330035910292025, + "grad_norm": 5.6262407302856445, + "learning_rate": 7.813539598877405e-05, + "loss": 0.7364, + "step": 16722 + }, + { + "epoch": 1.1330713462971747, + "grad_norm": 4.806675910949707, + "learning_rate": 7.813402696967623e-05, + "loss": 0.4433, + "step": 16723 + }, + { + "epoch": 1.1331391015651466, + "grad_norm": 5.149235248565674, + "learning_rate": 7.813265795057841e-05, + "loss": 0.4811, + "step": 16724 + }, + { + "epoch": 1.1332068568331188, + "grad_norm": 5.818775653839111, + "learning_rate": 7.813128893148059e-05, + "loss": 0.5279, + "step": 16725 + }, + { + "epoch": 1.1332746121010908, + "grad_norm": 5.656182289123535, + "learning_rate": 7.812991991238277e-05, + "loss": 0.6816, + "step": 16726 + }, + { + "epoch": 1.133342367369063, + "grad_norm": 4.287816524505615, + "learning_rate": 7.812855089328496e-05, + "loss": 0.6402, + "step": 16727 + }, + { + "epoch": 1.133410122637035, + "grad_norm": 6.147930145263672, + "learning_rate": 7.812718187418714e-05, + "loss": 0.6251, + "step": 16728 + }, + { + "epoch": 1.1334778779050072, + "grad_norm": 5.128780364990234, + "learning_rate": 7.812581285508932e-05, + "loss": 0.7653, + "step": 16729 + }, + { + "epoch": 1.1335456331729792, + "grad_norm": 4.769107341766357, + "learning_rate": 7.812444383599152e-05, + "loss": 0.5929, + "step": 16730 + }, + { + "epoch": 1.1336133884409514, + "grad_norm": 4.782686233520508, + "learning_rate": 7.81230748168937e-05, + "loss": 0.5916, + "step": 16731 + }, + { + "epoch": 1.1336811437089234, + "grad_norm": 5.834915637969971, + "learning_rate": 7.812170579779588e-05, + "loss": 0.6614, + "step": 16732 + }, + { + "epoch": 1.1337488989768953, + "grad_norm": 5.408773899078369, + "learning_rate": 7.812033677869807e-05, + "loss": 0.7269, + "step": 16733 + }, + { + "epoch": 1.1338166542448676, + "grad_norm": 6.599340438842773, + "learning_rate": 7.811896775960025e-05, + "loss": 0.4876, + "step": 16734 + }, + { + "epoch": 1.1338844095128395, + "grad_norm": 4.747675895690918, + "learning_rate": 7.811759874050243e-05, + "loss": 0.674, + "step": 16735 + }, + { + "epoch": 1.1339521647808117, + "grad_norm": 5.954026222229004, + "learning_rate": 7.811622972140463e-05, + "loss": 0.6632, + "step": 16736 + }, + { + "epoch": 1.1340199200487837, + "grad_norm": 4.728879928588867, + "learning_rate": 7.811486070230681e-05, + "loss": 0.6869, + "step": 16737 + }, + { + "epoch": 1.134087675316756, + "grad_norm": 5.576781749725342, + "learning_rate": 7.811349168320899e-05, + "loss": 0.6638, + "step": 16738 + }, + { + "epoch": 1.134155430584728, + "grad_norm": 5.9168782234191895, + "learning_rate": 7.811212266411117e-05, + "loss": 0.759, + "step": 16739 + }, + { + "epoch": 1.1342231858527, + "grad_norm": 5.378051280975342, + "learning_rate": 7.811075364501336e-05, + "loss": 0.5783, + "step": 16740 + }, + { + "epoch": 1.134290941120672, + "grad_norm": 7.849323749542236, + "learning_rate": 7.810938462591554e-05, + "loss": 0.7713, + "step": 16741 + }, + { + "epoch": 1.1343586963886443, + "grad_norm": 5.6644415855407715, + "learning_rate": 7.810801560681772e-05, + "loss": 0.5597, + "step": 16742 + }, + { + "epoch": 1.1344264516566163, + "grad_norm": 6.990406513214111, + "learning_rate": 7.81066465877199e-05, + "loss": 0.5516, + "step": 16743 + }, + { + "epoch": 1.1344942069245885, + "grad_norm": 6.040196418762207, + "learning_rate": 7.810527756862208e-05, + "loss": 0.7269, + "step": 16744 + }, + { + "epoch": 1.1345619621925604, + "grad_norm": 6.549674034118652, + "learning_rate": 7.810390854952428e-05, + "loss": 0.797, + "step": 16745 + }, + { + "epoch": 1.1346297174605326, + "grad_norm": 5.927361011505127, + "learning_rate": 7.810253953042646e-05, + "loss": 0.6387, + "step": 16746 + }, + { + "epoch": 1.1346974727285046, + "grad_norm": 5.803552150726318, + "learning_rate": 7.810117051132864e-05, + "loss": 0.7569, + "step": 16747 + }, + { + "epoch": 1.1347652279964766, + "grad_norm": 6.477768898010254, + "learning_rate": 7.809980149223082e-05, + "loss": 0.8403, + "step": 16748 + }, + { + "epoch": 1.1348329832644488, + "grad_norm": 6.080210208892822, + "learning_rate": 7.8098432473133e-05, + "loss": 0.8066, + "step": 16749 + }, + { + "epoch": 1.134900738532421, + "grad_norm": 7.755611896514893, + "learning_rate": 7.809706345403519e-05, + "loss": 0.7821, + "step": 16750 + }, + { + "epoch": 1.134968493800393, + "grad_norm": 5.807285308837891, + "learning_rate": 7.809569443493737e-05, + "loss": 0.7686, + "step": 16751 + }, + { + "epoch": 1.135036249068365, + "grad_norm": 4.9453020095825195, + "learning_rate": 7.809432541583955e-05, + "loss": 0.5902, + "step": 16752 + }, + { + "epoch": 1.1351040043363372, + "grad_norm": 5.776493072509766, + "learning_rate": 7.809295639674173e-05, + "loss": 0.568, + "step": 16753 + }, + { + "epoch": 1.1351717596043092, + "grad_norm": 8.138739585876465, + "learning_rate": 7.809158737764393e-05, + "loss": 0.9117, + "step": 16754 + }, + { + "epoch": 1.1352395148722814, + "grad_norm": 4.630499839782715, + "learning_rate": 7.809021835854611e-05, + "loss": 0.7508, + "step": 16755 + }, + { + "epoch": 1.1353072701402533, + "grad_norm": 4.743904113769531, + "learning_rate": 7.808884933944829e-05, + "loss": 0.6519, + "step": 16756 + }, + { + "epoch": 1.1353750254082255, + "grad_norm": 6.640937328338623, + "learning_rate": 7.808748032035047e-05, + "loss": 0.6931, + "step": 16757 + }, + { + "epoch": 1.1354427806761975, + "grad_norm": 6.060274600982666, + "learning_rate": 7.808611130125265e-05, + "loss": 0.7295, + "step": 16758 + }, + { + "epoch": 1.1355105359441697, + "grad_norm": 5.2093000411987305, + "learning_rate": 7.808474228215484e-05, + "loss": 0.629, + "step": 16759 + }, + { + "epoch": 1.1355782912121417, + "grad_norm": 5.407078266143799, + "learning_rate": 7.808337326305702e-05, + "loss": 0.6372, + "step": 16760 + }, + { + "epoch": 1.135646046480114, + "grad_norm": 6.483882904052734, + "learning_rate": 7.80820042439592e-05, + "loss": 0.6638, + "step": 16761 + }, + { + "epoch": 1.135713801748086, + "grad_norm": 5.735899925231934, + "learning_rate": 7.808063522486138e-05, + "loss": 0.9567, + "step": 16762 + }, + { + "epoch": 1.1357815570160579, + "grad_norm": 6.978238105773926, + "learning_rate": 7.807926620576358e-05, + "loss": 0.6355, + "step": 16763 + }, + { + "epoch": 1.13584931228403, + "grad_norm": 6.08992338180542, + "learning_rate": 7.807789718666576e-05, + "loss": 0.8777, + "step": 16764 + }, + { + "epoch": 1.1359170675520023, + "grad_norm": 6.668683052062988, + "learning_rate": 7.807652816756794e-05, + "loss": 0.5351, + "step": 16765 + }, + { + "epoch": 1.1359848228199743, + "grad_norm": 5.498349666595459, + "learning_rate": 7.807515914847012e-05, + "loss": 0.7362, + "step": 16766 + }, + { + "epoch": 1.1360525780879462, + "grad_norm": 5.9772868156433105, + "learning_rate": 7.80737901293723e-05, + "loss": 0.7146, + "step": 16767 + }, + { + "epoch": 1.1361203333559184, + "grad_norm": 6.178906440734863, + "learning_rate": 7.80724211102745e-05, + "loss": 0.6685, + "step": 16768 + }, + { + "epoch": 1.1361880886238904, + "grad_norm": 5.166858196258545, + "learning_rate": 7.807105209117667e-05, + "loss": 0.7369, + "step": 16769 + }, + { + "epoch": 1.1362558438918626, + "grad_norm": 5.280497074127197, + "learning_rate": 7.806968307207885e-05, + "loss": 0.6309, + "step": 16770 + }, + { + "epoch": 1.1363235991598346, + "grad_norm": 4.654086112976074, + "learning_rate": 7.806831405298103e-05, + "loss": 0.6645, + "step": 16771 + }, + { + "epoch": 1.1363913544278068, + "grad_norm": 5.552098751068115, + "learning_rate": 7.806694503388321e-05, + "loss": 0.6494, + "step": 16772 + }, + { + "epoch": 1.1364591096957788, + "grad_norm": 5.359954833984375, + "learning_rate": 7.806557601478541e-05, + "loss": 0.8581, + "step": 16773 + }, + { + "epoch": 1.136526864963751, + "grad_norm": 5.112419128417969, + "learning_rate": 7.806420699568759e-05, + "loss": 0.6707, + "step": 16774 + }, + { + "epoch": 1.136594620231723, + "grad_norm": 5.130734443664551, + "learning_rate": 7.806283797658977e-05, + "loss": 0.5441, + "step": 16775 + }, + { + "epoch": 1.1366623754996952, + "grad_norm": 4.156247615814209, + "learning_rate": 7.806146895749196e-05, + "loss": 0.6177, + "step": 16776 + }, + { + "epoch": 1.1367301307676672, + "grad_norm": 7.025331020355225, + "learning_rate": 7.806009993839414e-05, + "loss": 0.7531, + "step": 16777 + }, + { + "epoch": 1.1367978860356394, + "grad_norm": 6.272400856018066, + "learning_rate": 7.805873091929632e-05, + "loss": 0.5415, + "step": 16778 + }, + { + "epoch": 1.1368656413036113, + "grad_norm": 5.994542121887207, + "learning_rate": 7.805736190019852e-05, + "loss": 0.5739, + "step": 16779 + }, + { + "epoch": 1.1369333965715835, + "grad_norm": 7.8840718269348145, + "learning_rate": 7.80559928811007e-05, + "loss": 0.7447, + "step": 16780 + }, + { + "epoch": 1.1370011518395555, + "grad_norm": 5.8447651863098145, + "learning_rate": 7.805462386200288e-05, + "loss": 0.5919, + "step": 16781 + }, + { + "epoch": 1.1370689071075275, + "grad_norm": 7.363594055175781, + "learning_rate": 7.805325484290507e-05, + "loss": 0.7014, + "step": 16782 + }, + { + "epoch": 1.1371366623754997, + "grad_norm": 5.474704265594482, + "learning_rate": 7.805188582380725e-05, + "loss": 0.6021, + "step": 16783 + }, + { + "epoch": 1.1372044176434717, + "grad_norm": 6.830411434173584, + "learning_rate": 7.805051680470943e-05, + "loss": 0.6739, + "step": 16784 + }, + { + "epoch": 1.137272172911444, + "grad_norm": 5.505611896514893, + "learning_rate": 7.804914778561161e-05, + "loss": 0.5898, + "step": 16785 + }, + { + "epoch": 1.1373399281794159, + "grad_norm": 9.52569580078125, + "learning_rate": 7.804777876651381e-05, + "loss": 0.6024, + "step": 16786 + }, + { + "epoch": 1.137407683447388, + "grad_norm": 6.93997049331665, + "learning_rate": 7.804640974741599e-05, + "loss": 0.6355, + "step": 16787 + }, + { + "epoch": 1.13747543871536, + "grad_norm": 5.036192893981934, + "learning_rate": 7.804504072831817e-05, + "loss": 0.7398, + "step": 16788 + }, + { + "epoch": 1.1375431939833323, + "grad_norm": 5.485294818878174, + "learning_rate": 7.804367170922035e-05, + "loss": 0.7825, + "step": 16789 + }, + { + "epoch": 1.1376109492513042, + "grad_norm": 5.600368976593018, + "learning_rate": 7.804230269012253e-05, + "loss": 0.7205, + "step": 16790 + }, + { + "epoch": 1.1376787045192764, + "grad_norm": 5.522429943084717, + "learning_rate": 7.804093367102472e-05, + "loss": 0.7872, + "step": 16791 + }, + { + "epoch": 1.1377464597872484, + "grad_norm": 5.987887382507324, + "learning_rate": 7.80395646519269e-05, + "loss": 0.6782, + "step": 16792 + }, + { + "epoch": 1.1378142150552206, + "grad_norm": 5.354010105133057, + "learning_rate": 7.803819563282908e-05, + "loss": 0.6363, + "step": 16793 + }, + { + "epoch": 1.1378819703231926, + "grad_norm": 6.39069938659668, + "learning_rate": 7.803682661373126e-05, + "loss": 0.7089, + "step": 16794 + }, + { + "epoch": 1.1379497255911648, + "grad_norm": 5.727047443389893, + "learning_rate": 7.803545759463346e-05, + "loss": 0.7008, + "step": 16795 + }, + { + "epoch": 1.1380174808591368, + "grad_norm": 7.178287506103516, + "learning_rate": 7.803408857553564e-05, + "loss": 0.7488, + "step": 16796 + }, + { + "epoch": 1.1380852361271088, + "grad_norm": 6.201870441436768, + "learning_rate": 7.803271955643782e-05, + "loss": 0.6254, + "step": 16797 + }, + { + "epoch": 1.138152991395081, + "grad_norm": 6.881004810333252, + "learning_rate": 7.803135053734e-05, + "loss": 0.6558, + "step": 16798 + }, + { + "epoch": 1.1382207466630532, + "grad_norm": 6.2866291999816895, + "learning_rate": 7.802998151824218e-05, + "loss": 0.6537, + "step": 16799 + }, + { + "epoch": 1.1382885019310252, + "grad_norm": 5.333223342895508, + "learning_rate": 7.802861249914437e-05, + "loss": 0.6693, + "step": 16800 + }, + { + "epoch": 1.1383562571989971, + "grad_norm": 6.103747367858887, + "learning_rate": 7.802724348004655e-05, + "loss": 0.6822, + "step": 16801 + }, + { + "epoch": 1.1384240124669693, + "grad_norm": 10.445125579833984, + "learning_rate": 7.802587446094873e-05, + "loss": 0.7734, + "step": 16802 + }, + { + "epoch": 1.1384917677349413, + "grad_norm": 5.043206691741943, + "learning_rate": 7.802450544185091e-05, + "loss": 0.6898, + "step": 16803 + }, + { + "epoch": 1.1385595230029135, + "grad_norm": 7.022032260894775, + "learning_rate": 7.80231364227531e-05, + "loss": 0.6542, + "step": 16804 + }, + { + "epoch": 1.1386272782708855, + "grad_norm": 7.176365852355957, + "learning_rate": 7.802176740365529e-05, + "loss": 0.7367, + "step": 16805 + }, + { + "epoch": 1.1386950335388577, + "grad_norm": 4.644754409790039, + "learning_rate": 7.802039838455747e-05, + "loss": 0.6602, + "step": 16806 + }, + { + "epoch": 1.1387627888068297, + "grad_norm": 5.330687522888184, + "learning_rate": 7.801902936545965e-05, + "loss": 0.5775, + "step": 16807 + }, + { + "epoch": 1.1388305440748019, + "grad_norm": 5.8319501876831055, + "learning_rate": 7.801766034636183e-05, + "loss": 0.599, + "step": 16808 + }, + { + "epoch": 1.1388982993427739, + "grad_norm": 6.8199286460876465, + "learning_rate": 7.801629132726402e-05, + "loss": 0.8001, + "step": 16809 + }, + { + "epoch": 1.138966054610746, + "grad_norm": 6.324723720550537, + "learning_rate": 7.80149223081662e-05, + "loss": 0.773, + "step": 16810 + }, + { + "epoch": 1.139033809878718, + "grad_norm": 8.192002296447754, + "learning_rate": 7.801355328906838e-05, + "loss": 0.647, + "step": 16811 + }, + { + "epoch": 1.13910156514669, + "grad_norm": 7.2895050048828125, + "learning_rate": 7.801218426997056e-05, + "loss": 0.8682, + "step": 16812 + }, + { + "epoch": 1.1391693204146622, + "grad_norm": 6.973208904266357, + "learning_rate": 7.801081525087274e-05, + "loss": 0.667, + "step": 16813 + }, + { + "epoch": 1.1392370756826344, + "grad_norm": 6.551448345184326, + "learning_rate": 7.800944623177494e-05, + "loss": 0.7502, + "step": 16814 + }, + { + "epoch": 1.1393048309506064, + "grad_norm": 5.396692752838135, + "learning_rate": 7.800807721267712e-05, + "loss": 0.9984, + "step": 16815 + }, + { + "epoch": 1.1393725862185784, + "grad_norm": 4.447607040405273, + "learning_rate": 7.80067081935793e-05, + "loss": 0.6526, + "step": 16816 + }, + { + "epoch": 1.1394403414865506, + "grad_norm": 8.116009712219238, + "learning_rate": 7.800533917448148e-05, + "loss": 0.6774, + "step": 16817 + }, + { + "epoch": 1.1395080967545226, + "grad_norm": 5.555187225341797, + "learning_rate": 7.800397015538367e-05, + "loss": 0.7358, + "step": 16818 + }, + { + "epoch": 1.1395758520224948, + "grad_norm": 7.893671035766602, + "learning_rate": 7.800260113628585e-05, + "loss": 0.4427, + "step": 16819 + }, + { + "epoch": 1.1396436072904668, + "grad_norm": 6.3167853355407715, + "learning_rate": 7.800123211718803e-05, + "loss": 0.7659, + "step": 16820 + }, + { + "epoch": 1.139711362558439, + "grad_norm": 7.062374591827393, + "learning_rate": 7.799986309809021e-05, + "loss": 0.8105, + "step": 16821 + }, + { + "epoch": 1.139779117826411, + "grad_norm": 6.286994457244873, + "learning_rate": 7.799849407899241e-05, + "loss": 0.6914, + "step": 16822 + }, + { + "epoch": 1.1398468730943832, + "grad_norm": 5.267313480377197, + "learning_rate": 7.799712505989459e-05, + "loss": 0.6803, + "step": 16823 + }, + { + "epoch": 1.1399146283623551, + "grad_norm": 5.873160362243652, + "learning_rate": 7.799575604079677e-05, + "loss": 0.8037, + "step": 16824 + }, + { + "epoch": 1.1399823836303273, + "grad_norm": 6.283346176147461, + "learning_rate": 7.799438702169896e-05, + "loss": 0.6624, + "step": 16825 + }, + { + "epoch": 1.1400501388982993, + "grad_norm": 6.988799571990967, + "learning_rate": 7.799301800260114e-05, + "loss": 0.7537, + "step": 16826 + }, + { + "epoch": 1.1401178941662715, + "grad_norm": 6.114986419677734, + "learning_rate": 7.799164898350332e-05, + "loss": 0.7707, + "step": 16827 + }, + { + "epoch": 1.1401856494342435, + "grad_norm": 5.79031229019165, + "learning_rate": 7.799027996440552e-05, + "loss": 0.5132, + "step": 16828 + }, + { + "epoch": 1.1402534047022157, + "grad_norm": 6.05492639541626, + "learning_rate": 7.79889109453077e-05, + "loss": 0.604, + "step": 16829 + }, + { + "epoch": 1.1403211599701877, + "grad_norm": 5.041617393493652, + "learning_rate": 7.798754192620988e-05, + "loss": 0.7133, + "step": 16830 + }, + { + "epoch": 1.1403889152381597, + "grad_norm": 6.444592475891113, + "learning_rate": 7.798617290711206e-05, + "loss": 0.653, + "step": 16831 + }, + { + "epoch": 1.1404566705061319, + "grad_norm": 6.009650230407715, + "learning_rate": 7.798480388801425e-05, + "loss": 0.6469, + "step": 16832 + }, + { + "epoch": 1.1405244257741038, + "grad_norm": 8.08381462097168, + "learning_rate": 7.798343486891643e-05, + "loss": 0.7966, + "step": 16833 + }, + { + "epoch": 1.140592181042076, + "grad_norm": 5.3437676429748535, + "learning_rate": 7.798206584981861e-05, + "loss": 0.7883, + "step": 16834 + }, + { + "epoch": 1.140659936310048, + "grad_norm": 5.706287860870361, + "learning_rate": 7.798069683072079e-05, + "loss": 0.8088, + "step": 16835 + }, + { + "epoch": 1.1407276915780202, + "grad_norm": 5.616438865661621, + "learning_rate": 7.797932781162297e-05, + "loss": 0.5782, + "step": 16836 + }, + { + "epoch": 1.1407954468459922, + "grad_norm": 5.1152777671813965, + "learning_rate": 7.797795879252517e-05, + "loss": 0.8015, + "step": 16837 + }, + { + "epoch": 1.1408632021139644, + "grad_norm": 5.392358303070068, + "learning_rate": 7.797658977342735e-05, + "loss": 0.6961, + "step": 16838 + }, + { + "epoch": 1.1409309573819364, + "grad_norm": 5.752062797546387, + "learning_rate": 7.797522075432953e-05, + "loss": 0.7793, + "step": 16839 + }, + { + "epoch": 1.1409987126499086, + "grad_norm": 6.018402576446533, + "learning_rate": 7.797385173523171e-05, + "loss": 0.6414, + "step": 16840 + }, + { + "epoch": 1.1410664679178806, + "grad_norm": 4.536129951477051, + "learning_rate": 7.79724827161339e-05, + "loss": 0.4998, + "step": 16841 + }, + { + "epoch": 1.1411342231858528, + "grad_norm": 5.575126647949219, + "learning_rate": 7.797111369703608e-05, + "loss": 0.6466, + "step": 16842 + }, + { + "epoch": 1.1412019784538248, + "grad_norm": 8.243531227111816, + "learning_rate": 7.796974467793826e-05, + "loss": 0.9981, + "step": 16843 + }, + { + "epoch": 1.141269733721797, + "grad_norm": 6.683679580688477, + "learning_rate": 7.796837565884044e-05, + "loss": 0.8015, + "step": 16844 + }, + { + "epoch": 1.141337488989769, + "grad_norm": 7.662330627441406, + "learning_rate": 7.796700663974262e-05, + "loss": 0.6639, + "step": 16845 + }, + { + "epoch": 1.141405244257741, + "grad_norm": 4.948780536651611, + "learning_rate": 7.796563762064482e-05, + "loss": 0.6174, + "step": 16846 + }, + { + "epoch": 1.1414729995257131, + "grad_norm": 6.792630195617676, + "learning_rate": 7.7964268601547e-05, + "loss": 0.7958, + "step": 16847 + }, + { + "epoch": 1.1415407547936853, + "grad_norm": 5.603211879730225, + "learning_rate": 7.796289958244918e-05, + "loss": 0.8995, + "step": 16848 + }, + { + "epoch": 1.1416085100616573, + "grad_norm": 7.316423416137695, + "learning_rate": 7.796153056335136e-05, + "loss": 0.592, + "step": 16849 + }, + { + "epoch": 1.1416762653296293, + "grad_norm": 5.690333843231201, + "learning_rate": 7.796016154425355e-05, + "loss": 0.568, + "step": 16850 + }, + { + "epoch": 1.1417440205976015, + "grad_norm": 5.481085777282715, + "learning_rate": 7.795879252515573e-05, + "loss": 0.8911, + "step": 16851 + }, + { + "epoch": 1.1418117758655735, + "grad_norm": 5.878414154052734, + "learning_rate": 7.795742350605791e-05, + "loss": 0.5673, + "step": 16852 + }, + { + "epoch": 1.1418795311335457, + "grad_norm": 6.474081993103027, + "learning_rate": 7.79560544869601e-05, + "loss": 0.6036, + "step": 16853 + }, + { + "epoch": 1.1419472864015177, + "grad_norm": 5.981825351715088, + "learning_rate": 7.795468546786227e-05, + "loss": 0.7612, + "step": 16854 + }, + { + "epoch": 1.1420150416694899, + "grad_norm": 5.728933811187744, + "learning_rate": 7.795331644876447e-05, + "loss": 0.7179, + "step": 16855 + }, + { + "epoch": 1.1420827969374618, + "grad_norm": 5.504831790924072, + "learning_rate": 7.795194742966665e-05, + "loss": 0.9218, + "step": 16856 + }, + { + "epoch": 1.142150552205434, + "grad_norm": 6.834359169006348, + "learning_rate": 7.795057841056883e-05, + "loss": 0.7309, + "step": 16857 + }, + { + "epoch": 1.142218307473406, + "grad_norm": 6.295196056365967, + "learning_rate": 7.794920939147101e-05, + "loss": 0.8664, + "step": 16858 + }, + { + "epoch": 1.1422860627413782, + "grad_norm": 6.097269058227539, + "learning_rate": 7.794784037237319e-05, + "loss": 0.6478, + "step": 16859 + }, + { + "epoch": 1.1423538180093502, + "grad_norm": 6.314675331115723, + "learning_rate": 7.794647135327538e-05, + "loss": 0.8395, + "step": 16860 + }, + { + "epoch": 1.1424215732773222, + "grad_norm": 4.988530158996582, + "learning_rate": 7.794510233417756e-05, + "loss": 0.5958, + "step": 16861 + }, + { + "epoch": 1.1424893285452944, + "grad_norm": 5.622345924377441, + "learning_rate": 7.794373331507974e-05, + "loss": 0.4794, + "step": 16862 + }, + { + "epoch": 1.1425570838132666, + "grad_norm": 5.737193584442139, + "learning_rate": 7.794236429598192e-05, + "loss": 0.7704, + "step": 16863 + }, + { + "epoch": 1.1426248390812386, + "grad_norm": 6.432234764099121, + "learning_rate": 7.794099527688412e-05, + "loss": 0.5991, + "step": 16864 + }, + { + "epoch": 1.1426925943492106, + "grad_norm": 6.44136381149292, + "learning_rate": 7.79396262577863e-05, + "loss": 0.7507, + "step": 16865 + }, + { + "epoch": 1.1427603496171828, + "grad_norm": 4.948369979858398, + "learning_rate": 7.793825723868848e-05, + "loss": 0.7673, + "step": 16866 + }, + { + "epoch": 1.1428281048851547, + "grad_norm": 5.5636162757873535, + "learning_rate": 7.793688821959066e-05, + "loss": 0.4776, + "step": 16867 + }, + { + "epoch": 1.142895860153127, + "grad_norm": 7.1905741691589355, + "learning_rate": 7.793551920049285e-05, + "loss": 0.8011, + "step": 16868 + }, + { + "epoch": 1.142963615421099, + "grad_norm": 6.490988254547119, + "learning_rate": 7.793415018139503e-05, + "loss": 0.8343, + "step": 16869 + }, + { + "epoch": 1.1430313706890711, + "grad_norm": 5.666626453399658, + "learning_rate": 7.793278116229721e-05, + "loss": 0.6194, + "step": 16870 + }, + { + "epoch": 1.143099125957043, + "grad_norm": 7.139801025390625, + "learning_rate": 7.793141214319941e-05, + "loss": 0.6305, + "step": 16871 + }, + { + "epoch": 1.1431668812250153, + "grad_norm": 5.280755043029785, + "learning_rate": 7.793004312410159e-05, + "loss": 0.69, + "step": 16872 + }, + { + "epoch": 1.1432346364929873, + "grad_norm": 6.890513896942139, + "learning_rate": 7.792867410500377e-05, + "loss": 0.9433, + "step": 16873 + }, + { + "epoch": 1.1433023917609595, + "grad_norm": 7.743908882141113, + "learning_rate": 7.792730508590596e-05, + "loss": 0.6906, + "step": 16874 + }, + { + "epoch": 1.1433701470289315, + "grad_norm": 9.331748962402344, + "learning_rate": 7.792593606680814e-05, + "loss": 0.581, + "step": 16875 + }, + { + "epoch": 1.1434379022969037, + "grad_norm": 5.105838775634766, + "learning_rate": 7.792456704771032e-05, + "loss": 0.6817, + "step": 16876 + }, + { + "epoch": 1.1435056575648757, + "grad_norm": 7.092813014984131, + "learning_rate": 7.79231980286125e-05, + "loss": 0.5524, + "step": 16877 + }, + { + "epoch": 1.1435734128328479, + "grad_norm": 5.3407673835754395, + "learning_rate": 7.79218290095147e-05, + "loss": 0.6377, + "step": 16878 + }, + { + "epoch": 1.1436411681008198, + "grad_norm": 8.154608726501465, + "learning_rate": 7.792045999041688e-05, + "loss": 0.7605, + "step": 16879 + }, + { + "epoch": 1.1437089233687918, + "grad_norm": 6.083822250366211, + "learning_rate": 7.791909097131906e-05, + "loss": 0.6825, + "step": 16880 + }, + { + "epoch": 1.143776678636764, + "grad_norm": 5.946974754333496, + "learning_rate": 7.791772195222124e-05, + "loss": 0.6463, + "step": 16881 + }, + { + "epoch": 1.143844433904736, + "grad_norm": 4.58415412902832, + "learning_rate": 7.791635293312342e-05, + "loss": 0.5366, + "step": 16882 + }, + { + "epoch": 1.1439121891727082, + "grad_norm": 5.008115768432617, + "learning_rate": 7.791498391402561e-05, + "loss": 0.6225, + "step": 16883 + }, + { + "epoch": 1.1439799444406802, + "grad_norm": 6.249969005584717, + "learning_rate": 7.791361489492779e-05, + "loss": 0.6601, + "step": 16884 + }, + { + "epoch": 1.1440476997086524, + "grad_norm": 5.142440319061279, + "learning_rate": 7.791224587582997e-05, + "loss": 0.5248, + "step": 16885 + }, + { + "epoch": 1.1441154549766244, + "grad_norm": 4.58303165435791, + "learning_rate": 7.791087685673215e-05, + "loss": 0.6253, + "step": 16886 + }, + { + "epoch": 1.1441832102445966, + "grad_norm": 5.830782413482666, + "learning_rate": 7.790950783763435e-05, + "loss": 0.7958, + "step": 16887 + }, + { + "epoch": 1.1442509655125686, + "grad_norm": 6.463974952697754, + "learning_rate": 7.790813881853653e-05, + "loss": 0.7663, + "step": 16888 + }, + { + "epoch": 1.1443187207805408, + "grad_norm": 7.233205318450928, + "learning_rate": 7.790676979943871e-05, + "loss": 0.7953, + "step": 16889 + }, + { + "epoch": 1.1443864760485127, + "grad_norm": 5.574187278747559, + "learning_rate": 7.790540078034089e-05, + "loss": 0.5744, + "step": 16890 + }, + { + "epoch": 1.144454231316485, + "grad_norm": 6.045961856842041, + "learning_rate": 7.790403176124307e-05, + "loss": 0.6247, + "step": 16891 + }, + { + "epoch": 1.144521986584457, + "grad_norm": 5.329028606414795, + "learning_rate": 7.790266274214526e-05, + "loss": 0.6026, + "step": 16892 + }, + { + "epoch": 1.1445897418524291, + "grad_norm": 6.763497829437256, + "learning_rate": 7.790129372304744e-05, + "loss": 0.6519, + "step": 16893 + }, + { + "epoch": 1.144657497120401, + "grad_norm": 5.743830680847168, + "learning_rate": 7.789992470394962e-05, + "loss": 0.6935, + "step": 16894 + }, + { + "epoch": 1.144725252388373, + "grad_norm": 4.693161487579346, + "learning_rate": 7.78985556848518e-05, + "loss": 0.5889, + "step": 16895 + }, + { + "epoch": 1.1447930076563453, + "grad_norm": 6.895607948303223, + "learning_rate": 7.7897186665754e-05, + "loss": 0.561, + "step": 16896 + }, + { + "epoch": 1.1448607629243175, + "grad_norm": 5.308940410614014, + "learning_rate": 7.789581764665618e-05, + "loss": 0.8614, + "step": 16897 + }, + { + "epoch": 1.1449285181922895, + "grad_norm": 6.690733909606934, + "learning_rate": 7.789444862755836e-05, + "loss": 0.5136, + "step": 16898 + }, + { + "epoch": 1.1449962734602614, + "grad_norm": 4.7946624755859375, + "learning_rate": 7.789307960846054e-05, + "loss": 0.5016, + "step": 16899 + }, + { + "epoch": 1.1450640287282337, + "grad_norm": 5.833794593811035, + "learning_rate": 7.789171058936272e-05, + "loss": 0.5225, + "step": 16900 + }, + { + "epoch": 1.1451317839962056, + "grad_norm": 6.518613338470459, + "learning_rate": 7.789034157026491e-05, + "loss": 0.7277, + "step": 16901 + }, + { + "epoch": 1.1451995392641778, + "grad_norm": 5.891839981079102, + "learning_rate": 7.788897255116709e-05, + "loss": 0.5277, + "step": 16902 + }, + { + "epoch": 1.1452672945321498, + "grad_norm": 7.279749393463135, + "learning_rate": 7.788760353206927e-05, + "loss": 0.6505, + "step": 16903 + }, + { + "epoch": 1.145335049800122, + "grad_norm": 5.246143341064453, + "learning_rate": 7.788623451297145e-05, + "loss": 0.6035, + "step": 16904 + }, + { + "epoch": 1.145402805068094, + "grad_norm": 6.899950981140137, + "learning_rate": 7.788486549387363e-05, + "loss": 0.6674, + "step": 16905 + }, + { + "epoch": 1.1454705603360662, + "grad_norm": 5.364004611968994, + "learning_rate": 7.788349647477583e-05, + "loss": 0.6255, + "step": 16906 + }, + { + "epoch": 1.1455383156040382, + "grad_norm": 6.842263698577881, + "learning_rate": 7.788212745567801e-05, + "loss": 0.6159, + "step": 16907 + }, + { + "epoch": 1.1456060708720104, + "grad_norm": 6.661463260650635, + "learning_rate": 7.788075843658019e-05, + "loss": 0.6893, + "step": 16908 + }, + { + "epoch": 1.1456738261399824, + "grad_norm": 7.597821235656738, + "learning_rate": 7.787938941748237e-05, + "loss": 0.666, + "step": 16909 + }, + { + "epoch": 1.1457415814079543, + "grad_norm": 6.1869354248046875, + "learning_rate": 7.787802039838456e-05, + "loss": 0.5579, + "step": 16910 + }, + { + "epoch": 1.1458093366759265, + "grad_norm": 5.620526313781738, + "learning_rate": 7.787665137928674e-05, + "loss": 0.6303, + "step": 16911 + }, + { + "epoch": 1.1458770919438988, + "grad_norm": 6.049895286560059, + "learning_rate": 7.787528236018892e-05, + "loss": 0.5657, + "step": 16912 + }, + { + "epoch": 1.1459448472118707, + "grad_norm": 5.7843756675720215, + "learning_rate": 7.78739133410911e-05, + "loss": 0.6183, + "step": 16913 + }, + { + "epoch": 1.1460126024798427, + "grad_norm": 7.550663471221924, + "learning_rate": 7.787254432199328e-05, + "loss": 0.8373, + "step": 16914 + }, + { + "epoch": 1.146080357747815, + "grad_norm": 6.411697864532471, + "learning_rate": 7.787117530289548e-05, + "loss": 0.7271, + "step": 16915 + }, + { + "epoch": 1.146148113015787, + "grad_norm": 5.198284149169922, + "learning_rate": 7.786980628379766e-05, + "loss": 0.7704, + "step": 16916 + }, + { + "epoch": 1.146215868283759, + "grad_norm": 7.581902980804443, + "learning_rate": 7.786843726469984e-05, + "loss": 0.8091, + "step": 16917 + }, + { + "epoch": 1.146283623551731, + "grad_norm": 6.926025390625, + "learning_rate": 7.786706824560203e-05, + "loss": 0.5879, + "step": 16918 + }, + { + "epoch": 1.1463513788197033, + "grad_norm": 6.271979808807373, + "learning_rate": 7.786569922650421e-05, + "loss": 0.8076, + "step": 16919 + }, + { + "epoch": 1.1464191340876753, + "grad_norm": 5.187648296356201, + "learning_rate": 7.786433020740639e-05, + "loss": 0.6618, + "step": 16920 + }, + { + "epoch": 1.1464868893556475, + "grad_norm": 5.481020450592041, + "learning_rate": 7.786296118830859e-05, + "loss": 0.6813, + "step": 16921 + }, + { + "epoch": 1.1465546446236194, + "grad_norm": 5.956170082092285, + "learning_rate": 7.786159216921077e-05, + "loss": 0.6529, + "step": 16922 + }, + { + "epoch": 1.1466223998915916, + "grad_norm": 6.283894062042236, + "learning_rate": 7.786022315011295e-05, + "loss": 0.6642, + "step": 16923 + }, + { + "epoch": 1.1466901551595636, + "grad_norm": 6.540280342102051, + "learning_rate": 7.785885413101514e-05, + "loss": 0.9651, + "step": 16924 + }, + { + "epoch": 1.1467579104275358, + "grad_norm": 5.8760528564453125, + "learning_rate": 7.785748511191732e-05, + "loss": 0.5935, + "step": 16925 + }, + { + "epoch": 1.1468256656955078, + "grad_norm": 6.100462436676025, + "learning_rate": 7.78561160928195e-05, + "loss": 0.6853, + "step": 16926 + }, + { + "epoch": 1.14689342096348, + "grad_norm": 8.684874534606934, + "learning_rate": 7.785474707372168e-05, + "loss": 0.8358, + "step": 16927 + }, + { + "epoch": 1.146961176231452, + "grad_norm": 9.009882926940918, + "learning_rate": 7.785337805462388e-05, + "loss": 0.6265, + "step": 16928 + }, + { + "epoch": 1.147028931499424, + "grad_norm": 5.696036338806152, + "learning_rate": 7.785200903552606e-05, + "loss": 0.6748, + "step": 16929 + }, + { + "epoch": 1.1470966867673962, + "grad_norm": 5.4167866706848145, + "learning_rate": 7.785064001642824e-05, + "loss": 0.7013, + "step": 16930 + }, + { + "epoch": 1.1471644420353682, + "grad_norm": 9.469526290893555, + "learning_rate": 7.784927099733042e-05, + "loss": 0.9617, + "step": 16931 + }, + { + "epoch": 1.1472321973033404, + "grad_norm": 7.8615593910217285, + "learning_rate": 7.78479019782326e-05, + "loss": 0.6682, + "step": 16932 + }, + { + "epoch": 1.1472999525713123, + "grad_norm": 5.507148742675781, + "learning_rate": 7.784653295913479e-05, + "loss": 0.5723, + "step": 16933 + }, + { + "epoch": 1.1473677078392845, + "grad_norm": 7.456667423248291, + "learning_rate": 7.784516394003697e-05, + "loss": 0.8836, + "step": 16934 + }, + { + "epoch": 1.1474354631072565, + "grad_norm": 6.641378402709961, + "learning_rate": 7.784379492093915e-05, + "loss": 0.8819, + "step": 16935 + }, + { + "epoch": 1.1475032183752287, + "grad_norm": 7.157406330108643, + "learning_rate": 7.784242590184133e-05, + "loss": 0.6635, + "step": 16936 + }, + { + "epoch": 1.1475709736432007, + "grad_norm": 5.309220314025879, + "learning_rate": 7.784105688274351e-05, + "loss": 0.5876, + "step": 16937 + }, + { + "epoch": 1.147638728911173, + "grad_norm": 6.855804920196533, + "learning_rate": 7.783968786364571e-05, + "loss": 0.7916, + "step": 16938 + }, + { + "epoch": 1.147706484179145, + "grad_norm": 6.300415992736816, + "learning_rate": 7.783831884454789e-05, + "loss": 0.7407, + "step": 16939 + }, + { + "epoch": 1.147774239447117, + "grad_norm": 5.024459362030029, + "learning_rate": 7.783694982545007e-05, + "loss": 0.669, + "step": 16940 + }, + { + "epoch": 1.147841994715089, + "grad_norm": 5.556980609893799, + "learning_rate": 7.783558080635225e-05, + "loss": 0.6507, + "step": 16941 + }, + { + "epoch": 1.1479097499830613, + "grad_norm": 6.7080535888671875, + "learning_rate": 7.783421178725444e-05, + "loss": 0.8388, + "step": 16942 + }, + { + "epoch": 1.1479775052510333, + "grad_norm": 5.859105110168457, + "learning_rate": 7.783284276815662e-05, + "loss": 0.7661, + "step": 16943 + }, + { + "epoch": 1.1480452605190052, + "grad_norm": 5.6181159019470215, + "learning_rate": 7.78314737490588e-05, + "loss": 0.7629, + "step": 16944 + }, + { + "epoch": 1.1481130157869774, + "grad_norm": 5.8782854080200195, + "learning_rate": 7.783010472996098e-05, + "loss": 0.7253, + "step": 16945 + }, + { + "epoch": 1.1481807710549496, + "grad_norm": 4.898889064788818, + "learning_rate": 7.782873571086316e-05, + "loss": 0.6607, + "step": 16946 + }, + { + "epoch": 1.1482485263229216, + "grad_norm": 6.8231329917907715, + "learning_rate": 7.782736669176536e-05, + "loss": 0.6647, + "step": 16947 + }, + { + "epoch": 1.1483162815908936, + "grad_norm": 4.969480514526367, + "learning_rate": 7.782599767266754e-05, + "loss": 0.5449, + "step": 16948 + }, + { + "epoch": 1.1483840368588658, + "grad_norm": 7.149383544921875, + "learning_rate": 7.782462865356972e-05, + "loss": 0.6182, + "step": 16949 + }, + { + "epoch": 1.1484517921268378, + "grad_norm": 6.851714134216309, + "learning_rate": 7.78232596344719e-05, + "loss": 0.7021, + "step": 16950 + }, + { + "epoch": 1.14851954739481, + "grad_norm": 6.9529924392700195, + "learning_rate": 7.782189061537409e-05, + "loss": 0.7937, + "step": 16951 + }, + { + "epoch": 1.148587302662782, + "grad_norm": 5.88994026184082, + "learning_rate": 7.782052159627627e-05, + "loss": 0.9483, + "step": 16952 + }, + { + "epoch": 1.1486550579307542, + "grad_norm": 5.6566643714904785, + "learning_rate": 7.781915257717845e-05, + "loss": 0.5805, + "step": 16953 + }, + { + "epoch": 1.1487228131987262, + "grad_norm": 6.150807857513428, + "learning_rate": 7.781778355808063e-05, + "loss": 0.7529, + "step": 16954 + }, + { + "epoch": 1.1487905684666984, + "grad_norm": 5.5446038246154785, + "learning_rate": 7.781641453898281e-05, + "loss": 0.6351, + "step": 16955 + }, + { + "epoch": 1.1488583237346703, + "grad_norm": 5.691783428192139, + "learning_rate": 7.781504551988501e-05, + "loss": 0.5886, + "step": 16956 + }, + { + "epoch": 1.1489260790026425, + "grad_norm": 5.392401218414307, + "learning_rate": 7.781367650078719e-05, + "loss": 0.5915, + "step": 16957 + }, + { + "epoch": 1.1489938342706145, + "grad_norm": 7.51052713394165, + "learning_rate": 7.781230748168937e-05, + "loss": 0.713, + "step": 16958 + }, + { + "epoch": 1.1490615895385865, + "grad_norm": 6.7278523445129395, + "learning_rate": 7.781093846259155e-05, + "loss": 0.6796, + "step": 16959 + }, + { + "epoch": 1.1491293448065587, + "grad_norm": 8.327152252197266, + "learning_rate": 7.780956944349373e-05, + "loss": 0.6551, + "step": 16960 + }, + { + "epoch": 1.149197100074531, + "grad_norm": 7.18127965927124, + "learning_rate": 7.780820042439592e-05, + "loss": 0.6148, + "step": 16961 + }, + { + "epoch": 1.149264855342503, + "grad_norm": 4.836153030395508, + "learning_rate": 7.78068314052981e-05, + "loss": 0.5284, + "step": 16962 + }, + { + "epoch": 1.1493326106104749, + "grad_norm": 5.184370994567871, + "learning_rate": 7.780546238620028e-05, + "loss": 0.5887, + "step": 16963 + }, + { + "epoch": 1.149400365878447, + "grad_norm": 7.310936450958252, + "learning_rate": 7.780409336710248e-05, + "loss": 0.6862, + "step": 16964 + }, + { + "epoch": 1.149468121146419, + "grad_norm": 5.159470081329346, + "learning_rate": 7.780272434800466e-05, + "loss": 0.6848, + "step": 16965 + }, + { + "epoch": 1.1495358764143913, + "grad_norm": 5.7770161628723145, + "learning_rate": 7.780135532890684e-05, + "loss": 0.9589, + "step": 16966 + }, + { + "epoch": 1.1496036316823632, + "grad_norm": 6.089867115020752, + "learning_rate": 7.779998630980903e-05, + "loss": 0.6904, + "step": 16967 + }, + { + "epoch": 1.1496713869503354, + "grad_norm": 4.882379531860352, + "learning_rate": 7.779861729071121e-05, + "loss": 0.5668, + "step": 16968 + }, + { + "epoch": 1.1497391422183074, + "grad_norm": 6.222308158874512, + "learning_rate": 7.779724827161339e-05, + "loss": 0.8463, + "step": 16969 + }, + { + "epoch": 1.1498068974862796, + "grad_norm": 7.721385478973389, + "learning_rate": 7.779587925251559e-05, + "loss": 0.869, + "step": 16970 + }, + { + "epoch": 1.1498746527542516, + "grad_norm": 4.819281101226807, + "learning_rate": 7.779451023341777e-05, + "loss": 0.5424, + "step": 16971 + }, + { + "epoch": 1.1499424080222238, + "grad_norm": 5.347133636474609, + "learning_rate": 7.779314121431995e-05, + "loss": 0.5816, + "step": 16972 + }, + { + "epoch": 1.1500101632901958, + "grad_norm": 6.357661724090576, + "learning_rate": 7.779177219522213e-05, + "loss": 0.7384, + "step": 16973 + }, + { + "epoch": 1.150077918558168, + "grad_norm": 6.177036762237549, + "learning_rate": 7.779040317612432e-05, + "loss": 0.6792, + "step": 16974 + }, + { + "epoch": 1.15014567382614, + "grad_norm": 5.559142589569092, + "learning_rate": 7.77890341570265e-05, + "loss": 0.6939, + "step": 16975 + }, + { + "epoch": 1.1502134290941122, + "grad_norm": 4.8017473220825195, + "learning_rate": 7.778766513792868e-05, + "loss": 0.7028, + "step": 16976 + }, + { + "epoch": 1.1502811843620842, + "grad_norm": 5.330672264099121, + "learning_rate": 7.778629611883086e-05, + "loss": 0.7333, + "step": 16977 + }, + { + "epoch": 1.1503489396300561, + "grad_norm": 7.376459121704102, + "learning_rate": 7.778492709973304e-05, + "loss": 0.6522, + "step": 16978 + }, + { + "epoch": 1.1504166948980283, + "grad_norm": 4.467447280883789, + "learning_rate": 7.778355808063524e-05, + "loss": 0.5219, + "step": 16979 + }, + { + "epoch": 1.1504844501660003, + "grad_norm": 5.761083126068115, + "learning_rate": 7.778218906153742e-05, + "loss": 0.6823, + "step": 16980 + }, + { + "epoch": 1.1505522054339725, + "grad_norm": 7.13974666595459, + "learning_rate": 7.77808200424396e-05, + "loss": 0.6415, + "step": 16981 + }, + { + "epoch": 1.1506199607019445, + "grad_norm": 5.323427200317383, + "learning_rate": 7.777945102334178e-05, + "loss": 0.7416, + "step": 16982 + }, + { + "epoch": 1.1506877159699167, + "grad_norm": 8.04574203491211, + "learning_rate": 7.777808200424397e-05, + "loss": 0.7871, + "step": 16983 + }, + { + "epoch": 1.1507554712378887, + "grad_norm": 7.360369682312012, + "learning_rate": 7.777671298514615e-05, + "loss": 0.7206, + "step": 16984 + }, + { + "epoch": 1.1508232265058609, + "grad_norm": 6.681160926818848, + "learning_rate": 7.777534396604833e-05, + "loss": 0.7463, + "step": 16985 + }, + { + "epoch": 1.1508909817738329, + "grad_norm": 7.055415630340576, + "learning_rate": 7.777397494695051e-05, + "loss": 0.8558, + "step": 16986 + }, + { + "epoch": 1.150958737041805, + "grad_norm": 6.905728340148926, + "learning_rate": 7.777260592785269e-05, + "loss": 0.5685, + "step": 16987 + }, + { + "epoch": 1.151026492309777, + "grad_norm": 5.242982387542725, + "learning_rate": 7.777123690875489e-05, + "loss": 0.4737, + "step": 16988 + }, + { + "epoch": 1.1510942475777493, + "grad_norm": 4.060085773468018, + "learning_rate": 7.776986788965707e-05, + "loss": 0.6109, + "step": 16989 + }, + { + "epoch": 1.1511620028457212, + "grad_norm": 6.4642229080200195, + "learning_rate": 7.776849887055925e-05, + "loss": 0.9386, + "step": 16990 + }, + { + "epoch": 1.1512297581136934, + "grad_norm": 4.4571123123168945, + "learning_rate": 7.776712985146143e-05, + "loss": 0.6162, + "step": 16991 + }, + { + "epoch": 1.1512975133816654, + "grad_norm": 4.490910530090332, + "learning_rate": 7.776576083236361e-05, + "loss": 0.6082, + "step": 16992 + }, + { + "epoch": 1.1513652686496374, + "grad_norm": 7.802062034606934, + "learning_rate": 7.77643918132658e-05, + "loss": 0.7308, + "step": 16993 + }, + { + "epoch": 1.1514330239176096, + "grad_norm": 4.726348876953125, + "learning_rate": 7.776302279416798e-05, + "loss": 0.7596, + "step": 16994 + }, + { + "epoch": 1.1515007791855818, + "grad_norm": 7.226054668426514, + "learning_rate": 7.776165377507016e-05, + "loss": 0.8948, + "step": 16995 + }, + { + "epoch": 1.1515685344535538, + "grad_norm": 4.567721843719482, + "learning_rate": 7.776028475597234e-05, + "loss": 0.5063, + "step": 16996 + }, + { + "epoch": 1.1516362897215258, + "grad_norm": 8.410115242004395, + "learning_rate": 7.775891573687454e-05, + "loss": 0.6893, + "step": 16997 + }, + { + "epoch": 1.151704044989498, + "grad_norm": 5.118034839630127, + "learning_rate": 7.775754671777672e-05, + "loss": 0.6749, + "step": 16998 + }, + { + "epoch": 1.15177180025747, + "grad_norm": 6.0118184089660645, + "learning_rate": 7.77561776986789e-05, + "loss": 0.6228, + "step": 16999 + }, + { + "epoch": 1.1518395555254421, + "grad_norm": 4.889934062957764, + "learning_rate": 7.775480867958108e-05, + "loss": 0.7567, + "step": 17000 + }, + { + "epoch": 1.1519073107934141, + "grad_norm": 4.716844081878662, + "learning_rate": 7.775343966048326e-05, + "loss": 0.6083, + "step": 17001 + }, + { + "epoch": 1.1519750660613863, + "grad_norm": 6.07877254486084, + "learning_rate": 7.775207064138545e-05, + "loss": 0.7686, + "step": 17002 + }, + { + "epoch": 1.1520428213293583, + "grad_norm": 6.810544013977051, + "learning_rate": 7.775070162228763e-05, + "loss": 0.5769, + "step": 17003 + }, + { + "epoch": 1.1521105765973305, + "grad_norm": 6.12678337097168, + "learning_rate": 7.774933260318981e-05, + "loss": 0.5156, + "step": 17004 + }, + { + "epoch": 1.1521783318653025, + "grad_norm": 5.999094009399414, + "learning_rate": 7.774796358409199e-05, + "loss": 0.5038, + "step": 17005 + }, + { + "epoch": 1.1522460871332747, + "grad_norm": 5.4058332443237305, + "learning_rate": 7.774659456499419e-05, + "loss": 0.7177, + "step": 17006 + }, + { + "epoch": 1.1523138424012467, + "grad_norm": 4.426954746246338, + "learning_rate": 7.774522554589637e-05, + "loss": 0.4639, + "step": 17007 + }, + { + "epoch": 1.1523815976692187, + "grad_norm": 5.5401530265808105, + "learning_rate": 7.774385652679855e-05, + "loss": 0.818, + "step": 17008 + }, + { + "epoch": 1.1524493529371909, + "grad_norm": 5.004909038543701, + "learning_rate": 7.774248750770073e-05, + "loss": 0.8341, + "step": 17009 + }, + { + "epoch": 1.152517108205163, + "grad_norm": 8.242889404296875, + "learning_rate": 7.774111848860292e-05, + "loss": 0.8211, + "step": 17010 + }, + { + "epoch": 1.152584863473135, + "grad_norm": 5.827528476715088, + "learning_rate": 7.77397494695051e-05, + "loss": 0.6921, + "step": 17011 + }, + { + "epoch": 1.152652618741107, + "grad_norm": 8.284795761108398, + "learning_rate": 7.773838045040728e-05, + "loss": 0.7018, + "step": 17012 + }, + { + "epoch": 1.1527203740090792, + "grad_norm": 8.282230377197266, + "learning_rate": 7.773701143130948e-05, + "loss": 0.7349, + "step": 17013 + }, + { + "epoch": 1.1527881292770512, + "grad_norm": 6.315917015075684, + "learning_rate": 7.773564241221166e-05, + "loss": 0.7272, + "step": 17014 + }, + { + "epoch": 1.1528558845450234, + "grad_norm": 5.585415363311768, + "learning_rate": 7.773427339311384e-05, + "loss": 0.567, + "step": 17015 + }, + { + "epoch": 1.1529236398129954, + "grad_norm": 4.947295665740967, + "learning_rate": 7.773290437401603e-05, + "loss": 0.5206, + "step": 17016 + }, + { + "epoch": 1.1529913950809676, + "grad_norm": 5.741033554077148, + "learning_rate": 7.773153535491821e-05, + "loss": 0.651, + "step": 17017 + }, + { + "epoch": 1.1530591503489396, + "grad_norm": 5.564809322357178, + "learning_rate": 7.773016633582039e-05, + "loss": 0.7332, + "step": 17018 + }, + { + "epoch": 1.1531269056169118, + "grad_norm": 6.336185932159424, + "learning_rate": 7.772879731672257e-05, + "loss": 0.8473, + "step": 17019 + }, + { + "epoch": 1.1531946608848838, + "grad_norm": 10.212484359741211, + "learning_rate": 7.772742829762477e-05, + "loss": 0.598, + "step": 17020 + }, + { + "epoch": 1.153262416152856, + "grad_norm": 5.718522548675537, + "learning_rate": 7.772605927852695e-05, + "loss": 0.7474, + "step": 17021 + }, + { + "epoch": 1.153330171420828, + "grad_norm": 5.485384464263916, + "learning_rate": 7.772469025942913e-05, + "loss": 0.8378, + "step": 17022 + }, + { + "epoch": 1.1533979266888001, + "grad_norm": 4.341707229614258, + "learning_rate": 7.77233212403313e-05, + "loss": 0.6496, + "step": 17023 + }, + { + "epoch": 1.1534656819567721, + "grad_norm": 5.866021633148193, + "learning_rate": 7.772195222123349e-05, + "loss": 0.611, + "step": 17024 + }, + { + "epoch": 1.1535334372247443, + "grad_norm": 5.35130500793457, + "learning_rate": 7.772058320213568e-05, + "loss": 0.7405, + "step": 17025 + }, + { + "epoch": 1.1536011924927163, + "grad_norm": 5.8092780113220215, + "learning_rate": 7.771921418303786e-05, + "loss": 0.7732, + "step": 17026 + }, + { + "epoch": 1.1536689477606883, + "grad_norm": 4.408275604248047, + "learning_rate": 7.771784516394004e-05, + "loss": 0.7354, + "step": 17027 + }, + { + "epoch": 1.1537367030286605, + "grad_norm": 5.520874500274658, + "learning_rate": 7.771647614484222e-05, + "loss": 0.57, + "step": 17028 + }, + { + "epoch": 1.1538044582966325, + "grad_norm": 6.835943698883057, + "learning_rate": 7.771510712574442e-05, + "loss": 0.8913, + "step": 17029 + }, + { + "epoch": 1.1538722135646047, + "grad_norm": 6.106658458709717, + "learning_rate": 7.77137381066466e-05, + "loss": 0.7279, + "step": 17030 + }, + { + "epoch": 1.1539399688325767, + "grad_norm": 4.718100547790527, + "learning_rate": 7.771236908754878e-05, + "loss": 0.5556, + "step": 17031 + }, + { + "epoch": 1.1540077241005489, + "grad_norm": 6.6284871101379395, + "learning_rate": 7.771100006845096e-05, + "loss": 0.8279, + "step": 17032 + }, + { + "epoch": 1.1540754793685208, + "grad_norm": 6.730345726013184, + "learning_rate": 7.770963104935314e-05, + "loss": 0.5271, + "step": 17033 + }, + { + "epoch": 1.154143234636493, + "grad_norm": 8.18601131439209, + "learning_rate": 7.770826203025533e-05, + "loss": 0.7659, + "step": 17034 + }, + { + "epoch": 1.154210989904465, + "grad_norm": 5.387197017669678, + "learning_rate": 7.770689301115751e-05, + "loss": 0.5248, + "step": 17035 + }, + { + "epoch": 1.1542787451724372, + "grad_norm": 7.745261192321777, + "learning_rate": 7.770552399205969e-05, + "loss": 0.9322, + "step": 17036 + }, + { + "epoch": 1.1543465004404092, + "grad_norm": 5.017677307128906, + "learning_rate": 7.770415497296187e-05, + "loss": 0.6844, + "step": 17037 + }, + { + "epoch": 1.1544142557083814, + "grad_norm": 5.272484302520752, + "learning_rate": 7.770278595386407e-05, + "loss": 0.9975, + "step": 17038 + }, + { + "epoch": 1.1544820109763534, + "grad_norm": 6.785526752471924, + "learning_rate": 7.770141693476625e-05, + "loss": 0.8657, + "step": 17039 + }, + { + "epoch": 1.1545497662443256, + "grad_norm": 6.357969284057617, + "learning_rate": 7.770004791566843e-05, + "loss": 0.9176, + "step": 17040 + }, + { + "epoch": 1.1546175215122976, + "grad_norm": 5.590163707733154, + "learning_rate": 7.769867889657061e-05, + "loss": 0.6711, + "step": 17041 + }, + { + "epoch": 1.1546852767802696, + "grad_norm": 6.2367377281188965, + "learning_rate": 7.769730987747279e-05, + "loss": 0.767, + "step": 17042 + }, + { + "epoch": 1.1547530320482418, + "grad_norm": 5.25771951675415, + "learning_rate": 7.769594085837498e-05, + "loss": 0.7303, + "step": 17043 + }, + { + "epoch": 1.154820787316214, + "grad_norm": 5.044614791870117, + "learning_rate": 7.769457183927716e-05, + "loss": 0.5922, + "step": 17044 + }, + { + "epoch": 1.154888542584186, + "grad_norm": 7.982863426208496, + "learning_rate": 7.769320282017934e-05, + "loss": 1.0373, + "step": 17045 + }, + { + "epoch": 1.154956297852158, + "grad_norm": 5.957779884338379, + "learning_rate": 7.769183380108152e-05, + "loss": 0.6403, + "step": 17046 + }, + { + "epoch": 1.1550240531201301, + "grad_norm": 5.288189888000488, + "learning_rate": 7.76904647819837e-05, + "loss": 0.6481, + "step": 17047 + }, + { + "epoch": 1.155091808388102, + "grad_norm": 6.193086624145508, + "learning_rate": 7.76890957628859e-05, + "loss": 0.9038, + "step": 17048 + }, + { + "epoch": 1.1551595636560743, + "grad_norm": 8.397180557250977, + "learning_rate": 7.768772674378808e-05, + "loss": 0.689, + "step": 17049 + }, + { + "epoch": 1.1552273189240463, + "grad_norm": 5.024189472198486, + "learning_rate": 7.768635772469026e-05, + "loss": 0.7721, + "step": 17050 + }, + { + "epoch": 1.1552950741920185, + "grad_norm": 5.071702480316162, + "learning_rate": 7.768498870559244e-05, + "loss": 0.9724, + "step": 17051 + }, + { + "epoch": 1.1553628294599905, + "grad_norm": 6.766203880310059, + "learning_rate": 7.768361968649463e-05, + "loss": 0.8014, + "step": 17052 + }, + { + "epoch": 1.1554305847279627, + "grad_norm": 7.111300945281982, + "learning_rate": 7.768225066739681e-05, + "loss": 0.6352, + "step": 17053 + }, + { + "epoch": 1.1554983399959347, + "grad_norm": 7.011511325836182, + "learning_rate": 7.768088164829899e-05, + "loss": 0.8808, + "step": 17054 + }, + { + "epoch": 1.1555660952639069, + "grad_norm": 5.766463756561279, + "learning_rate": 7.767951262920117e-05, + "loss": 0.7257, + "step": 17055 + }, + { + "epoch": 1.1556338505318788, + "grad_norm": 5.269811153411865, + "learning_rate": 7.767814361010337e-05, + "loss": 0.557, + "step": 17056 + }, + { + "epoch": 1.1557016057998508, + "grad_norm": 5.217252254486084, + "learning_rate": 7.767677459100555e-05, + "loss": 0.6047, + "step": 17057 + }, + { + "epoch": 1.155769361067823, + "grad_norm": 5.220236778259277, + "learning_rate": 7.767540557190773e-05, + "loss": 0.6139, + "step": 17058 + }, + { + "epoch": 1.1558371163357952, + "grad_norm": 5.2868499755859375, + "learning_rate": 7.767403655280992e-05, + "loss": 0.5907, + "step": 17059 + }, + { + "epoch": 1.1559048716037672, + "grad_norm": 5.082064151763916, + "learning_rate": 7.76726675337121e-05, + "loss": 0.6459, + "step": 17060 + }, + { + "epoch": 1.1559726268717392, + "grad_norm": 5.143309116363525, + "learning_rate": 7.767129851461428e-05, + "loss": 0.7064, + "step": 17061 + }, + { + "epoch": 1.1560403821397114, + "grad_norm": 5.849390029907227, + "learning_rate": 7.766992949551648e-05, + "loss": 0.632, + "step": 17062 + }, + { + "epoch": 1.1561081374076834, + "grad_norm": 5.502965927124023, + "learning_rate": 7.766856047641866e-05, + "loss": 0.5645, + "step": 17063 + }, + { + "epoch": 1.1561758926756556, + "grad_norm": 7.527248382568359, + "learning_rate": 7.766719145732084e-05, + "loss": 0.707, + "step": 17064 + }, + { + "epoch": 1.1562436479436276, + "grad_norm": 11.085021018981934, + "learning_rate": 7.766582243822302e-05, + "loss": 0.5026, + "step": 17065 + }, + { + "epoch": 1.1563114032115998, + "grad_norm": 4.332429885864258, + "learning_rate": 7.766445341912521e-05, + "loss": 0.4451, + "step": 17066 + }, + { + "epoch": 1.1563791584795717, + "grad_norm": 6.877386569976807, + "learning_rate": 7.766308440002739e-05, + "loss": 0.6717, + "step": 17067 + }, + { + "epoch": 1.156446913747544, + "grad_norm": 8.853668212890625, + "learning_rate": 7.766171538092957e-05, + "loss": 0.7166, + "step": 17068 + }, + { + "epoch": 1.156514669015516, + "grad_norm": 6.745375156402588, + "learning_rate": 7.766034636183175e-05, + "loss": 0.7458, + "step": 17069 + }, + { + "epoch": 1.1565824242834881, + "grad_norm": 4.851019859313965, + "learning_rate": 7.765897734273393e-05, + "loss": 0.5425, + "step": 17070 + }, + { + "epoch": 1.15665017955146, + "grad_norm": 8.129114151000977, + "learning_rate": 7.765760832363613e-05, + "loss": 0.7745, + "step": 17071 + }, + { + "epoch": 1.1567179348194323, + "grad_norm": 7.2463603019714355, + "learning_rate": 7.76562393045383e-05, + "loss": 0.7822, + "step": 17072 + }, + { + "epoch": 1.1567856900874043, + "grad_norm": 6.648437023162842, + "learning_rate": 7.765487028544049e-05, + "loss": 0.7371, + "step": 17073 + }, + { + "epoch": 1.1568534453553765, + "grad_norm": 4.990374565124512, + "learning_rate": 7.765350126634267e-05, + "loss": 0.6751, + "step": 17074 + }, + { + "epoch": 1.1569212006233485, + "grad_norm": 11.30855655670166, + "learning_rate": 7.765213224724486e-05, + "loss": 0.5861, + "step": 17075 + }, + { + "epoch": 1.1569889558913204, + "grad_norm": 6.271945953369141, + "learning_rate": 7.765076322814704e-05, + "loss": 0.6633, + "step": 17076 + }, + { + "epoch": 1.1570567111592927, + "grad_norm": 6.8997392654418945, + "learning_rate": 7.764939420904922e-05, + "loss": 0.6447, + "step": 17077 + }, + { + "epoch": 1.1571244664272646, + "grad_norm": 7.4019389152526855, + "learning_rate": 7.76480251899514e-05, + "loss": 0.7981, + "step": 17078 + }, + { + "epoch": 1.1571922216952368, + "grad_norm": 8.452839851379395, + "learning_rate": 7.764665617085358e-05, + "loss": 0.9539, + "step": 17079 + }, + { + "epoch": 1.1572599769632088, + "grad_norm": 5.080465793609619, + "learning_rate": 7.764528715175578e-05, + "loss": 0.6904, + "step": 17080 + }, + { + "epoch": 1.157327732231181, + "grad_norm": 5.674568176269531, + "learning_rate": 7.764391813265796e-05, + "loss": 0.5814, + "step": 17081 + }, + { + "epoch": 1.157395487499153, + "grad_norm": 5.936946392059326, + "learning_rate": 7.764254911356014e-05, + "loss": 0.7214, + "step": 17082 + }, + { + "epoch": 1.1574632427671252, + "grad_norm": 5.97898530960083, + "learning_rate": 7.764118009446232e-05, + "loss": 0.8698, + "step": 17083 + }, + { + "epoch": 1.1575309980350972, + "grad_norm": 6.230330467224121, + "learning_rate": 7.763981107536451e-05, + "loss": 0.7555, + "step": 17084 + }, + { + "epoch": 1.1575987533030694, + "grad_norm": 4.197784900665283, + "learning_rate": 7.763844205626669e-05, + "loss": 0.5187, + "step": 17085 + }, + { + "epoch": 1.1576665085710414, + "grad_norm": 7.4203643798828125, + "learning_rate": 7.763707303716887e-05, + "loss": 0.9264, + "step": 17086 + }, + { + "epoch": 1.1577342638390136, + "grad_norm": 8.698983192443848, + "learning_rate": 7.763570401807105e-05, + "loss": 0.605, + "step": 17087 + }, + { + "epoch": 1.1578020191069855, + "grad_norm": 6.924211502075195, + "learning_rate": 7.763433499897323e-05, + "loss": 0.7919, + "step": 17088 + }, + { + "epoch": 1.1578697743749577, + "grad_norm": 4.469345569610596, + "learning_rate": 7.763296597987543e-05, + "loss": 0.4836, + "step": 17089 + }, + { + "epoch": 1.1579375296429297, + "grad_norm": 6.712977409362793, + "learning_rate": 7.76315969607776e-05, + "loss": 0.7123, + "step": 17090 + }, + { + "epoch": 1.1580052849109017, + "grad_norm": 5.67714786529541, + "learning_rate": 7.763022794167979e-05, + "loss": 0.7613, + "step": 17091 + }, + { + "epoch": 1.158073040178874, + "grad_norm": 6.554959297180176, + "learning_rate": 7.762885892258197e-05, + "loss": 0.7535, + "step": 17092 + }, + { + "epoch": 1.1581407954468461, + "grad_norm": 6.625911235809326, + "learning_rate": 7.762748990348415e-05, + "loss": 0.8759, + "step": 17093 + }, + { + "epoch": 1.158208550714818, + "grad_norm": 6.199068069458008, + "learning_rate": 7.762612088438634e-05, + "loss": 1.0106, + "step": 17094 + }, + { + "epoch": 1.15827630598279, + "grad_norm": 7.14901876449585, + "learning_rate": 7.762475186528852e-05, + "loss": 0.665, + "step": 17095 + }, + { + "epoch": 1.1583440612507623, + "grad_norm": 8.458894729614258, + "learning_rate": 7.76233828461907e-05, + "loss": 0.8775, + "step": 17096 + }, + { + "epoch": 1.1584118165187343, + "grad_norm": 5.443024635314941, + "learning_rate": 7.762201382709288e-05, + "loss": 0.6247, + "step": 17097 + }, + { + "epoch": 1.1584795717867065, + "grad_norm": 5.4433817863464355, + "learning_rate": 7.762064480799508e-05, + "loss": 0.8298, + "step": 17098 + }, + { + "epoch": 1.1585473270546784, + "grad_norm": 8.201144218444824, + "learning_rate": 7.761927578889726e-05, + "loss": 0.5813, + "step": 17099 + }, + { + "epoch": 1.1586150823226506, + "grad_norm": 5.2088236808776855, + "learning_rate": 7.761790676979944e-05, + "loss": 0.7396, + "step": 17100 + }, + { + "epoch": 1.1586828375906226, + "grad_norm": 7.154380798339844, + "learning_rate": 7.761653775070162e-05, + "loss": 0.6321, + "step": 17101 + }, + { + "epoch": 1.1587505928585948, + "grad_norm": 7.217487812042236, + "learning_rate": 7.761516873160381e-05, + "loss": 0.4882, + "step": 17102 + }, + { + "epoch": 1.1588183481265668, + "grad_norm": 5.230490684509277, + "learning_rate": 7.761379971250599e-05, + "loss": 0.6101, + "step": 17103 + }, + { + "epoch": 1.158886103394539, + "grad_norm": 8.068559646606445, + "learning_rate": 7.761243069340817e-05, + "loss": 0.594, + "step": 17104 + }, + { + "epoch": 1.158953858662511, + "grad_norm": 4.883063316345215, + "learning_rate": 7.761106167431037e-05, + "loss": 0.6828, + "step": 17105 + }, + { + "epoch": 1.159021613930483, + "grad_norm": 6.669475078582764, + "learning_rate": 7.760969265521255e-05, + "loss": 0.8075, + "step": 17106 + }, + { + "epoch": 1.1590893691984552, + "grad_norm": 7.147501468658447, + "learning_rate": 7.760832363611473e-05, + "loss": 0.6016, + "step": 17107 + }, + { + "epoch": 1.1591571244664274, + "grad_norm": 5.705437660217285, + "learning_rate": 7.760695461701692e-05, + "loss": 0.7725, + "step": 17108 + }, + { + "epoch": 1.1592248797343994, + "grad_norm": 7.769946098327637, + "learning_rate": 7.76055855979191e-05, + "loss": 0.6734, + "step": 17109 + }, + { + "epoch": 1.1592926350023713, + "grad_norm": 5.215763092041016, + "learning_rate": 7.760421657882128e-05, + "loss": 0.659, + "step": 17110 + }, + { + "epoch": 1.1593603902703435, + "grad_norm": 6.284592628479004, + "learning_rate": 7.760284755972346e-05, + "loss": 0.7078, + "step": 17111 + }, + { + "epoch": 1.1594281455383155, + "grad_norm": 6.298258304595947, + "learning_rate": 7.760147854062566e-05, + "loss": 0.4822, + "step": 17112 + }, + { + "epoch": 1.1594959008062877, + "grad_norm": 6.485654830932617, + "learning_rate": 7.760010952152784e-05, + "loss": 0.7425, + "step": 17113 + }, + { + "epoch": 1.1595636560742597, + "grad_norm": 4.581310272216797, + "learning_rate": 7.759874050243002e-05, + "loss": 0.5134, + "step": 17114 + }, + { + "epoch": 1.159631411342232, + "grad_norm": 5.878685474395752, + "learning_rate": 7.75973714833322e-05, + "loss": 0.71, + "step": 17115 + }, + { + "epoch": 1.159699166610204, + "grad_norm": 5.818406581878662, + "learning_rate": 7.759600246423439e-05, + "loss": 0.705, + "step": 17116 + }, + { + "epoch": 1.159766921878176, + "grad_norm": 6.247715950012207, + "learning_rate": 7.759463344513657e-05, + "loss": 0.747, + "step": 17117 + }, + { + "epoch": 1.159834677146148, + "grad_norm": 7.777171611785889, + "learning_rate": 7.759326442603875e-05, + "loss": 0.8842, + "step": 17118 + }, + { + "epoch": 1.1599024324141203, + "grad_norm": 4.369725704193115, + "learning_rate": 7.759189540694093e-05, + "loss": 0.5713, + "step": 17119 + }, + { + "epoch": 1.1599701876820923, + "grad_norm": 5.170098304748535, + "learning_rate": 7.759052638784311e-05, + "loss": 0.7709, + "step": 17120 + }, + { + "epoch": 1.1600379429500645, + "grad_norm": 7.886532306671143, + "learning_rate": 7.75891573687453e-05, + "loss": 0.5762, + "step": 17121 + }, + { + "epoch": 1.1601056982180364, + "grad_norm": 6.549617290496826, + "learning_rate": 7.758778834964749e-05, + "loss": 0.597, + "step": 17122 + }, + { + "epoch": 1.1601734534860086, + "grad_norm": 8.460987091064453, + "learning_rate": 7.758641933054967e-05, + "loss": 0.7194, + "step": 17123 + }, + { + "epoch": 1.1602412087539806, + "grad_norm": 8.659561157226562, + "learning_rate": 7.758505031145185e-05, + "loss": 0.845, + "step": 17124 + }, + { + "epoch": 1.1603089640219526, + "grad_norm": 6.515053749084473, + "learning_rate": 7.758368129235403e-05, + "loss": 0.9518, + "step": 17125 + }, + { + "epoch": 1.1603767192899248, + "grad_norm": 14.59138011932373, + "learning_rate": 7.758231227325622e-05, + "loss": 0.6451, + "step": 17126 + }, + { + "epoch": 1.1604444745578968, + "grad_norm": 5.754178047180176, + "learning_rate": 7.75809432541584e-05, + "loss": 0.6652, + "step": 17127 + }, + { + "epoch": 1.160512229825869, + "grad_norm": 5.252496242523193, + "learning_rate": 7.757957423506058e-05, + "loss": 0.6412, + "step": 17128 + }, + { + "epoch": 1.160579985093841, + "grad_norm": 7.788862228393555, + "learning_rate": 7.757820521596276e-05, + "loss": 0.7068, + "step": 17129 + }, + { + "epoch": 1.1606477403618132, + "grad_norm": 5.3729143142700195, + "learning_rate": 7.757683619686496e-05, + "loss": 0.6523, + "step": 17130 + }, + { + "epoch": 1.1607154956297852, + "grad_norm": 4.968121528625488, + "learning_rate": 7.757546717776714e-05, + "loss": 0.5823, + "step": 17131 + }, + { + "epoch": 1.1607832508977574, + "grad_norm": 6.920864105224609, + "learning_rate": 7.757409815866932e-05, + "loss": 0.8567, + "step": 17132 + }, + { + "epoch": 1.1608510061657293, + "grad_norm": 8.479312896728516, + "learning_rate": 7.75727291395715e-05, + "loss": 0.8317, + "step": 17133 + }, + { + "epoch": 1.1609187614337015, + "grad_norm": 8.27676010131836, + "learning_rate": 7.757136012047368e-05, + "loss": 0.7197, + "step": 17134 + }, + { + "epoch": 1.1609865167016735, + "grad_norm": 4.960324764251709, + "learning_rate": 7.756999110137587e-05, + "loss": 0.6568, + "step": 17135 + }, + { + "epoch": 1.1610542719696457, + "grad_norm": 5.801924705505371, + "learning_rate": 7.756862208227805e-05, + "loss": 0.7134, + "step": 17136 + }, + { + "epoch": 1.1611220272376177, + "grad_norm": 7.101438045501709, + "learning_rate": 7.756725306318023e-05, + "loss": 0.7979, + "step": 17137 + }, + { + "epoch": 1.16118978250559, + "grad_norm": 3.8948190212249756, + "learning_rate": 7.756588404408241e-05, + "loss": 0.6097, + "step": 17138 + }, + { + "epoch": 1.1612575377735619, + "grad_norm": 5.797446250915527, + "learning_rate": 7.75645150249846e-05, + "loss": 0.68, + "step": 17139 + }, + { + "epoch": 1.1613252930415339, + "grad_norm": 7.5896525382995605, + "learning_rate": 7.756314600588679e-05, + "loss": 0.7619, + "step": 17140 + }, + { + "epoch": 1.161393048309506, + "grad_norm": 4.365643501281738, + "learning_rate": 7.756177698678897e-05, + "loss": 0.7374, + "step": 17141 + }, + { + "epoch": 1.1614608035774783, + "grad_norm": 5.378994941711426, + "learning_rate": 7.756040796769115e-05, + "loss": 0.7987, + "step": 17142 + }, + { + "epoch": 1.1615285588454503, + "grad_norm": 5.658888339996338, + "learning_rate": 7.755903894859333e-05, + "loss": 0.6007, + "step": 17143 + }, + { + "epoch": 1.1615963141134222, + "grad_norm": 4.8605170249938965, + "learning_rate": 7.755766992949552e-05, + "loss": 0.5425, + "step": 17144 + }, + { + "epoch": 1.1616640693813944, + "grad_norm": 6.8122100830078125, + "learning_rate": 7.75563009103977e-05, + "loss": 0.5716, + "step": 17145 + }, + { + "epoch": 1.1617318246493664, + "grad_norm": 5.280952453613281, + "learning_rate": 7.755493189129988e-05, + "loss": 0.8245, + "step": 17146 + }, + { + "epoch": 1.1617995799173386, + "grad_norm": 6.752953052520752, + "learning_rate": 7.755356287220206e-05, + "loss": 0.7489, + "step": 17147 + }, + { + "epoch": 1.1618673351853106, + "grad_norm": 5.453287124633789, + "learning_rate": 7.755219385310424e-05, + "loss": 0.6679, + "step": 17148 + }, + { + "epoch": 1.1619350904532828, + "grad_norm": 5.091671466827393, + "learning_rate": 7.755082483400644e-05, + "loss": 0.5512, + "step": 17149 + }, + { + "epoch": 1.1620028457212548, + "grad_norm": 7.830700397491455, + "learning_rate": 7.754945581490862e-05, + "loss": 0.7014, + "step": 17150 + }, + { + "epoch": 1.162070600989227, + "grad_norm": 6.83212947845459, + "learning_rate": 7.75480867958108e-05, + "loss": 0.4873, + "step": 17151 + }, + { + "epoch": 1.162138356257199, + "grad_norm": 6.0037841796875, + "learning_rate": 7.754671777671299e-05, + "loss": 0.6945, + "step": 17152 + }, + { + "epoch": 1.1622061115251712, + "grad_norm": 5.744686603546143, + "learning_rate": 7.754534875761517e-05, + "loss": 0.805, + "step": 17153 + }, + { + "epoch": 1.1622738667931432, + "grad_norm": 8.571599960327148, + "learning_rate": 7.754397973851737e-05, + "loss": 0.8147, + "step": 17154 + }, + { + "epoch": 1.1623416220611151, + "grad_norm": 5.277167320251465, + "learning_rate": 7.754261071941955e-05, + "loss": 0.6513, + "step": 17155 + }, + { + "epoch": 1.1624093773290873, + "grad_norm": 5.50898551940918, + "learning_rate": 7.754124170032173e-05, + "loss": 0.7769, + "step": 17156 + }, + { + "epoch": 1.1624771325970595, + "grad_norm": 6.313873767852783, + "learning_rate": 7.75398726812239e-05, + "loss": 0.7234, + "step": 17157 + }, + { + "epoch": 1.1625448878650315, + "grad_norm": 5.537959575653076, + "learning_rate": 7.75385036621261e-05, + "loss": 0.7568, + "step": 17158 + }, + { + "epoch": 1.1626126431330035, + "grad_norm": 5.766448497772217, + "learning_rate": 7.753713464302828e-05, + "loss": 0.6761, + "step": 17159 + }, + { + "epoch": 1.1626803984009757, + "grad_norm": 6.09381628036499, + "learning_rate": 7.753576562393046e-05, + "loss": 0.8554, + "step": 17160 + }, + { + "epoch": 1.1627481536689477, + "grad_norm": 5.5013251304626465, + "learning_rate": 7.753439660483264e-05, + "loss": 0.6297, + "step": 17161 + }, + { + "epoch": 1.1628159089369199, + "grad_norm": 6.226048946380615, + "learning_rate": 7.753302758573483e-05, + "loss": 0.7102, + "step": 17162 + }, + { + "epoch": 1.1628836642048919, + "grad_norm": 5.695722579956055, + "learning_rate": 7.753165856663702e-05, + "loss": 0.5396, + "step": 17163 + }, + { + "epoch": 1.162951419472864, + "grad_norm": 6.318460941314697, + "learning_rate": 7.75302895475392e-05, + "loss": 0.7032, + "step": 17164 + }, + { + "epoch": 1.163019174740836, + "grad_norm": 6.319543361663818, + "learning_rate": 7.752892052844138e-05, + "loss": 0.5056, + "step": 17165 + }, + { + "epoch": 1.1630869300088083, + "grad_norm": 5.164156913757324, + "learning_rate": 7.752755150934356e-05, + "loss": 0.5995, + "step": 17166 + }, + { + "epoch": 1.1631546852767802, + "grad_norm": 5.994420051574707, + "learning_rate": 7.752618249024575e-05, + "loss": 1.1241, + "step": 17167 + }, + { + "epoch": 1.1632224405447524, + "grad_norm": 5.081625461578369, + "learning_rate": 7.752481347114793e-05, + "loss": 0.6978, + "step": 17168 + }, + { + "epoch": 1.1632901958127244, + "grad_norm": 5.624451637268066, + "learning_rate": 7.752344445205011e-05, + "loss": 0.7173, + "step": 17169 + }, + { + "epoch": 1.1633579510806966, + "grad_norm": 4.424026966094971, + "learning_rate": 7.752207543295229e-05, + "loss": 0.4382, + "step": 17170 + }, + { + "epoch": 1.1634257063486686, + "grad_norm": 6.661045551300049, + "learning_rate": 7.752070641385449e-05, + "loss": 0.6755, + "step": 17171 + }, + { + "epoch": 1.1634934616166408, + "grad_norm": 5.17708158493042, + "learning_rate": 7.751933739475667e-05, + "loss": 0.4775, + "step": 17172 + }, + { + "epoch": 1.1635612168846128, + "grad_norm": 6.10673713684082, + "learning_rate": 7.751796837565885e-05, + "loss": 0.7051, + "step": 17173 + }, + { + "epoch": 1.1636289721525848, + "grad_norm": 5.64744234085083, + "learning_rate": 7.751659935656103e-05, + "loss": 0.7049, + "step": 17174 + }, + { + "epoch": 1.163696727420557, + "grad_norm": 7.636764049530029, + "learning_rate": 7.75152303374632e-05, + "loss": 0.7381, + "step": 17175 + }, + { + "epoch": 1.163764482688529, + "grad_norm": 10.210199356079102, + "learning_rate": 7.75138613183654e-05, + "loss": 0.8437, + "step": 17176 + }, + { + "epoch": 1.1638322379565011, + "grad_norm": 4.762551784515381, + "learning_rate": 7.751249229926758e-05, + "loss": 0.5048, + "step": 17177 + }, + { + "epoch": 1.1638999932244731, + "grad_norm": 5.5700883865356445, + "learning_rate": 7.751112328016976e-05, + "loss": 0.8869, + "step": 17178 + }, + { + "epoch": 1.1639677484924453, + "grad_norm": 5.91666316986084, + "learning_rate": 7.750975426107194e-05, + "loss": 0.6451, + "step": 17179 + }, + { + "epoch": 1.1640355037604173, + "grad_norm": 5.347517013549805, + "learning_rate": 7.750838524197412e-05, + "loss": 0.6594, + "step": 17180 + }, + { + "epoch": 1.1641032590283895, + "grad_norm": 5.739136219024658, + "learning_rate": 7.750701622287632e-05, + "loss": 0.6144, + "step": 17181 + }, + { + "epoch": 1.1641710142963615, + "grad_norm": 5.545490264892578, + "learning_rate": 7.75056472037785e-05, + "loss": 0.5842, + "step": 17182 + }, + { + "epoch": 1.1642387695643337, + "grad_norm": 7.26037073135376, + "learning_rate": 7.750427818468068e-05, + "loss": 0.7446, + "step": 17183 + }, + { + "epoch": 1.1643065248323057, + "grad_norm": 8.13321590423584, + "learning_rate": 7.750290916558286e-05, + "loss": 0.7403, + "step": 17184 + }, + { + "epoch": 1.1643742801002779, + "grad_norm": 5.851143836975098, + "learning_rate": 7.750154014648505e-05, + "loss": 0.6878, + "step": 17185 + }, + { + "epoch": 1.1644420353682499, + "grad_norm": 5.682823181152344, + "learning_rate": 7.750017112738723e-05, + "loss": 0.7803, + "step": 17186 + }, + { + "epoch": 1.164509790636222, + "grad_norm": 7.069244861602783, + "learning_rate": 7.749880210828941e-05, + "loss": 1.0644, + "step": 17187 + }, + { + "epoch": 1.164577545904194, + "grad_norm": 4.923807144165039, + "learning_rate": 7.749743308919159e-05, + "loss": 0.5087, + "step": 17188 + }, + { + "epoch": 1.164645301172166, + "grad_norm": 7.384223937988281, + "learning_rate": 7.749606407009377e-05, + "loss": 0.6694, + "step": 17189 + }, + { + "epoch": 1.1647130564401382, + "grad_norm": 4.950394630432129, + "learning_rate": 7.749469505099597e-05, + "loss": 0.5682, + "step": 17190 + }, + { + "epoch": 1.1647808117081102, + "grad_norm": 3.849876642227173, + "learning_rate": 7.749332603189815e-05, + "loss": 0.4433, + "step": 17191 + }, + { + "epoch": 1.1648485669760824, + "grad_norm": 6.983705520629883, + "learning_rate": 7.749195701280033e-05, + "loss": 0.9134, + "step": 17192 + }, + { + "epoch": 1.1649163222440544, + "grad_norm": 5.343315601348877, + "learning_rate": 7.74905879937025e-05, + "loss": 0.6362, + "step": 17193 + }, + { + "epoch": 1.1649840775120266, + "grad_norm": 8.281697273254395, + "learning_rate": 7.74892189746047e-05, + "loss": 0.8319, + "step": 17194 + }, + { + "epoch": 1.1650518327799986, + "grad_norm": 7.667835712432861, + "learning_rate": 7.748784995550688e-05, + "loss": 0.6045, + "step": 17195 + }, + { + "epoch": 1.1651195880479708, + "grad_norm": 7.925049304962158, + "learning_rate": 7.748648093640906e-05, + "loss": 0.7674, + "step": 17196 + }, + { + "epoch": 1.1651873433159428, + "grad_norm": 6.60289192199707, + "learning_rate": 7.748511191731124e-05, + "loss": 0.7478, + "step": 17197 + }, + { + "epoch": 1.165255098583915, + "grad_norm": 4.811013698577881, + "learning_rate": 7.748374289821344e-05, + "loss": 0.5568, + "step": 17198 + }, + { + "epoch": 1.165322853851887, + "grad_norm": 5.416989803314209, + "learning_rate": 7.748237387911562e-05, + "loss": 0.667, + "step": 17199 + }, + { + "epoch": 1.1653906091198591, + "grad_norm": 5.876220703125, + "learning_rate": 7.74810048600178e-05, + "loss": 0.8068, + "step": 17200 + }, + { + "epoch": 1.1654583643878311, + "grad_norm": 5.452373504638672, + "learning_rate": 7.747963584091999e-05, + "loss": 0.7206, + "step": 17201 + }, + { + "epoch": 1.1655261196558033, + "grad_norm": 5.000153064727783, + "learning_rate": 7.747826682182217e-05, + "loss": 0.8598, + "step": 17202 + }, + { + "epoch": 1.1655938749237753, + "grad_norm": 4.371356010437012, + "learning_rate": 7.747689780272435e-05, + "loss": 0.6802, + "step": 17203 + }, + { + "epoch": 1.1656616301917473, + "grad_norm": 5.595484733581543, + "learning_rate": 7.747552878362654e-05, + "loss": 0.71, + "step": 17204 + }, + { + "epoch": 1.1657293854597195, + "grad_norm": 6.505746364593506, + "learning_rate": 7.747415976452873e-05, + "loss": 0.7463, + "step": 17205 + }, + { + "epoch": 1.1657971407276917, + "grad_norm": 8.62131404876709, + "learning_rate": 7.74727907454309e-05, + "loss": 0.5391, + "step": 17206 + }, + { + "epoch": 1.1658648959956637, + "grad_norm": 4.648898601531982, + "learning_rate": 7.747142172633309e-05, + "loss": 0.5731, + "step": 17207 + }, + { + "epoch": 1.1659326512636357, + "grad_norm": 4.276782989501953, + "learning_rate": 7.747005270723528e-05, + "loss": 0.5733, + "step": 17208 + }, + { + "epoch": 1.1660004065316079, + "grad_norm": 7.70058012008667, + "learning_rate": 7.746868368813746e-05, + "loss": 0.655, + "step": 17209 + }, + { + "epoch": 1.1660681617995798, + "grad_norm": 6.740431785583496, + "learning_rate": 7.746731466903964e-05, + "loss": 0.782, + "step": 17210 + }, + { + "epoch": 1.166135917067552, + "grad_norm": 5.23295783996582, + "learning_rate": 7.746594564994182e-05, + "loss": 0.7198, + "step": 17211 + }, + { + "epoch": 1.166203672335524, + "grad_norm": 4.6036458015441895, + "learning_rate": 7.7464576630844e-05, + "loss": 0.7787, + "step": 17212 + }, + { + "epoch": 1.1662714276034962, + "grad_norm": 4.967904567718506, + "learning_rate": 7.74632076117462e-05, + "loss": 0.6894, + "step": 17213 + }, + { + "epoch": 1.1663391828714682, + "grad_norm": 6.81717586517334, + "learning_rate": 7.746183859264838e-05, + "loss": 0.6447, + "step": 17214 + }, + { + "epoch": 1.1664069381394404, + "grad_norm": 5.5367112159729, + "learning_rate": 7.746046957355056e-05, + "loss": 0.6741, + "step": 17215 + }, + { + "epoch": 1.1664746934074124, + "grad_norm": 6.124912261962891, + "learning_rate": 7.745910055445274e-05, + "loss": 0.8282, + "step": 17216 + }, + { + "epoch": 1.1665424486753846, + "grad_norm": 7.484246253967285, + "learning_rate": 7.745773153535493e-05, + "loss": 0.8897, + "step": 17217 + }, + { + "epoch": 1.1666102039433566, + "grad_norm": 5.709711074829102, + "learning_rate": 7.745636251625711e-05, + "loss": 0.7469, + "step": 17218 + }, + { + "epoch": 1.1666779592113288, + "grad_norm": 5.799917221069336, + "learning_rate": 7.745499349715929e-05, + "loss": 0.6627, + "step": 17219 + }, + { + "epoch": 1.1667457144793008, + "grad_norm": 8.688015937805176, + "learning_rate": 7.745362447806147e-05, + "loss": 0.8772, + "step": 17220 + }, + { + "epoch": 1.166813469747273, + "grad_norm": 5.448634147644043, + "learning_rate": 7.745225545896365e-05, + "loss": 0.5945, + "step": 17221 + }, + { + "epoch": 1.166881225015245, + "grad_norm": 5.151609420776367, + "learning_rate": 7.745088643986585e-05, + "loss": 0.557, + "step": 17222 + }, + { + "epoch": 1.166948980283217, + "grad_norm": 7.523448467254639, + "learning_rate": 7.744951742076803e-05, + "loss": 0.7231, + "step": 17223 + }, + { + "epoch": 1.1670167355511891, + "grad_norm": 6.95708703994751, + "learning_rate": 7.74481484016702e-05, + "loss": 0.9136, + "step": 17224 + }, + { + "epoch": 1.167084490819161, + "grad_norm": 4.518056869506836, + "learning_rate": 7.744677938257239e-05, + "loss": 0.6328, + "step": 17225 + }, + { + "epoch": 1.1671522460871333, + "grad_norm": 5.213376522064209, + "learning_rate": 7.744541036347457e-05, + "loss": 0.5484, + "step": 17226 + }, + { + "epoch": 1.1672200013551053, + "grad_norm": 6.388365268707275, + "learning_rate": 7.744404134437676e-05, + "loss": 0.6401, + "step": 17227 + }, + { + "epoch": 1.1672877566230775, + "grad_norm": 5.914669990539551, + "learning_rate": 7.744267232527894e-05, + "loss": 0.7474, + "step": 17228 + }, + { + "epoch": 1.1673555118910495, + "grad_norm": 7.565332889556885, + "learning_rate": 7.744130330618112e-05, + "loss": 0.637, + "step": 17229 + }, + { + "epoch": 1.1674232671590217, + "grad_norm": 4.865163326263428, + "learning_rate": 7.74399342870833e-05, + "loss": 0.6212, + "step": 17230 + }, + { + "epoch": 1.1674910224269937, + "grad_norm": 4.324069976806641, + "learning_rate": 7.74385652679855e-05, + "loss": 0.533, + "step": 17231 + }, + { + "epoch": 1.1675587776949659, + "grad_norm": 5.414638519287109, + "learning_rate": 7.743719624888768e-05, + "loss": 0.4943, + "step": 17232 + }, + { + "epoch": 1.1676265329629378, + "grad_norm": 6.219473838806152, + "learning_rate": 7.743582722978986e-05, + "loss": 0.5288, + "step": 17233 + }, + { + "epoch": 1.16769428823091, + "grad_norm": 6.105441093444824, + "learning_rate": 7.743445821069204e-05, + "loss": 0.5303, + "step": 17234 + }, + { + "epoch": 1.167762043498882, + "grad_norm": 5.521098613739014, + "learning_rate": 7.743308919159422e-05, + "loss": 0.6609, + "step": 17235 + }, + { + "epoch": 1.1678297987668542, + "grad_norm": 5.3490424156188965, + "learning_rate": 7.743172017249641e-05, + "loss": 0.5023, + "step": 17236 + }, + { + "epoch": 1.1678975540348262, + "grad_norm": 7.1635541915893555, + "learning_rate": 7.743035115339859e-05, + "loss": 0.8012, + "step": 17237 + }, + { + "epoch": 1.1679653093027982, + "grad_norm": 6.178504467010498, + "learning_rate": 7.742898213430077e-05, + "loss": 0.8079, + "step": 17238 + }, + { + "epoch": 1.1680330645707704, + "grad_norm": 4.957045078277588, + "learning_rate": 7.742761311520295e-05, + "loss": 0.6104, + "step": 17239 + }, + { + "epoch": 1.1681008198387424, + "grad_norm": 8.841684341430664, + "learning_rate": 7.742624409610515e-05, + "loss": 0.551, + "step": 17240 + }, + { + "epoch": 1.1681685751067146, + "grad_norm": 5.315309524536133, + "learning_rate": 7.742487507700733e-05, + "loss": 0.6352, + "step": 17241 + }, + { + "epoch": 1.1682363303746865, + "grad_norm": 7.584867477416992, + "learning_rate": 7.74235060579095e-05, + "loss": 0.6937, + "step": 17242 + }, + { + "epoch": 1.1683040856426588, + "grad_norm": 4.14179801940918, + "learning_rate": 7.742213703881169e-05, + "loss": 0.5933, + "step": 17243 + }, + { + "epoch": 1.1683718409106307, + "grad_norm": 6.462168216705322, + "learning_rate": 7.742076801971388e-05, + "loss": 0.6561, + "step": 17244 + }, + { + "epoch": 1.168439596178603, + "grad_norm": 6.578998565673828, + "learning_rate": 7.741939900061606e-05, + "loss": 0.7552, + "step": 17245 + }, + { + "epoch": 1.168507351446575, + "grad_norm": 6.081613540649414, + "learning_rate": 7.741802998151824e-05, + "loss": 0.5051, + "step": 17246 + }, + { + "epoch": 1.1685751067145471, + "grad_norm": 7.021066188812256, + "learning_rate": 7.741666096242043e-05, + "loss": 0.7581, + "step": 17247 + }, + { + "epoch": 1.168642861982519, + "grad_norm": 6.672050476074219, + "learning_rate": 7.741529194332262e-05, + "loss": 0.832, + "step": 17248 + }, + { + "epoch": 1.1687106172504913, + "grad_norm": 6.199688911437988, + "learning_rate": 7.74139229242248e-05, + "loss": 0.8048, + "step": 17249 + }, + { + "epoch": 1.1687783725184633, + "grad_norm": 5.789974212646484, + "learning_rate": 7.741255390512699e-05, + "loss": 0.8967, + "step": 17250 + }, + { + "epoch": 1.1688461277864355, + "grad_norm": 4.966305732727051, + "learning_rate": 7.741118488602917e-05, + "loss": 0.778, + "step": 17251 + }, + { + "epoch": 1.1689138830544075, + "grad_norm": 4.729147434234619, + "learning_rate": 7.740981586693135e-05, + "loss": 0.4176, + "step": 17252 + }, + { + "epoch": 1.1689816383223794, + "grad_norm": 5.638984203338623, + "learning_rate": 7.740844684783353e-05, + "loss": 0.6909, + "step": 17253 + }, + { + "epoch": 1.1690493935903516, + "grad_norm": 5.719832420349121, + "learning_rate": 7.740707782873572e-05, + "loss": 0.5901, + "step": 17254 + }, + { + "epoch": 1.1691171488583239, + "grad_norm": 5.108078956604004, + "learning_rate": 7.74057088096379e-05, + "loss": 0.5979, + "step": 17255 + }, + { + "epoch": 1.1691849041262958, + "grad_norm": 5.3545732498168945, + "learning_rate": 7.740433979054009e-05, + "loss": 0.5158, + "step": 17256 + }, + { + "epoch": 1.1692526593942678, + "grad_norm": 5.225886344909668, + "learning_rate": 7.740297077144227e-05, + "loss": 0.7653, + "step": 17257 + }, + { + "epoch": 1.16932041466224, + "grad_norm": 5.831471920013428, + "learning_rate": 7.740160175234445e-05, + "loss": 0.7989, + "step": 17258 + }, + { + "epoch": 1.169388169930212, + "grad_norm": 8.2451810836792, + "learning_rate": 7.740023273324664e-05, + "loss": 0.6536, + "step": 17259 + }, + { + "epoch": 1.1694559251981842, + "grad_norm": 6.761453151702881, + "learning_rate": 7.739886371414882e-05, + "loss": 0.6763, + "step": 17260 + }, + { + "epoch": 1.1695236804661562, + "grad_norm": 5.2614240646362305, + "learning_rate": 7.7397494695051e-05, + "loss": 0.7062, + "step": 17261 + }, + { + "epoch": 1.1695914357341284, + "grad_norm": 6.049597263336182, + "learning_rate": 7.739612567595318e-05, + "loss": 0.5726, + "step": 17262 + }, + { + "epoch": 1.1696591910021004, + "grad_norm": 6.359036922454834, + "learning_rate": 7.739475665685537e-05, + "loss": 0.8202, + "step": 17263 + }, + { + "epoch": 1.1697269462700726, + "grad_norm": 5.320462703704834, + "learning_rate": 7.739338763775755e-05, + "loss": 0.5932, + "step": 17264 + }, + { + "epoch": 1.1697947015380445, + "grad_norm": 6.084722518920898, + "learning_rate": 7.739201861865974e-05, + "loss": 0.5786, + "step": 17265 + }, + { + "epoch": 1.1698624568060167, + "grad_norm": 7.363154888153076, + "learning_rate": 7.739064959956192e-05, + "loss": 0.8631, + "step": 17266 + }, + { + "epoch": 1.1699302120739887, + "grad_norm": 7.075015544891357, + "learning_rate": 7.73892805804641e-05, + "loss": 0.8241, + "step": 17267 + }, + { + "epoch": 1.169997967341961, + "grad_norm": 5.505892276763916, + "learning_rate": 7.738791156136629e-05, + "loss": 0.6984, + "step": 17268 + }, + { + "epoch": 1.170065722609933, + "grad_norm": 6.062048435211182, + "learning_rate": 7.738654254226847e-05, + "loss": 0.5967, + "step": 17269 + }, + { + "epoch": 1.1701334778779051, + "grad_norm": 5.961355686187744, + "learning_rate": 7.738517352317065e-05, + "loss": 0.8252, + "step": 17270 + }, + { + "epoch": 1.170201233145877, + "grad_norm": 5.551196098327637, + "learning_rate": 7.738380450407283e-05, + "loss": 0.7744, + "step": 17271 + }, + { + "epoch": 1.170268988413849, + "grad_norm": 5.391801357269287, + "learning_rate": 7.738243548497502e-05, + "loss": 0.5997, + "step": 17272 + }, + { + "epoch": 1.1703367436818213, + "grad_norm": 6.983804702758789, + "learning_rate": 7.73810664658772e-05, + "loss": 0.5914, + "step": 17273 + }, + { + "epoch": 1.1704044989497933, + "grad_norm": 5.468769550323486, + "learning_rate": 7.737969744677939e-05, + "loss": 0.7202, + "step": 17274 + }, + { + "epoch": 1.1704722542177655, + "grad_norm": 5.046876907348633, + "learning_rate": 7.737832842768157e-05, + "loss": 0.5294, + "step": 17275 + }, + { + "epoch": 1.1705400094857374, + "grad_norm": 6.133605480194092, + "learning_rate": 7.737695940858375e-05, + "loss": 0.7646, + "step": 17276 + }, + { + "epoch": 1.1706077647537096, + "grad_norm": 8.28398323059082, + "learning_rate": 7.737559038948594e-05, + "loss": 0.7566, + "step": 17277 + }, + { + "epoch": 1.1706755200216816, + "grad_norm": 6.076024532318115, + "learning_rate": 7.737422137038812e-05, + "loss": 0.6782, + "step": 17278 + }, + { + "epoch": 1.1707432752896538, + "grad_norm": 6.060565948486328, + "learning_rate": 7.73728523512903e-05, + "loss": 0.7511, + "step": 17279 + }, + { + "epoch": 1.1708110305576258, + "grad_norm": 7.766849517822266, + "learning_rate": 7.737148333219248e-05, + "loss": 0.6535, + "step": 17280 + }, + { + "epoch": 1.170878785825598, + "grad_norm": 5.996023654937744, + "learning_rate": 7.737011431309466e-05, + "loss": 0.7026, + "step": 17281 + }, + { + "epoch": 1.17094654109357, + "grad_norm": 6.4893646240234375, + "learning_rate": 7.736874529399686e-05, + "loss": 0.8587, + "step": 17282 + }, + { + "epoch": 1.1710142963615422, + "grad_norm": 5.484610557556152, + "learning_rate": 7.736737627489904e-05, + "loss": 0.5746, + "step": 17283 + }, + { + "epoch": 1.1710820516295142, + "grad_norm": 7.2935309410095215, + "learning_rate": 7.736600725580122e-05, + "loss": 0.7526, + "step": 17284 + }, + { + "epoch": 1.1711498068974864, + "grad_norm": 6.469519138336182, + "learning_rate": 7.73646382367034e-05, + "loss": 0.6975, + "step": 17285 + }, + { + "epoch": 1.1712175621654584, + "grad_norm": 6.044117450714111, + "learning_rate": 7.736326921760559e-05, + "loss": 0.694, + "step": 17286 + }, + { + "epoch": 1.1712853174334303, + "grad_norm": 5.34628963470459, + "learning_rate": 7.736190019850777e-05, + "loss": 0.7829, + "step": 17287 + }, + { + "epoch": 1.1713530727014025, + "grad_norm": 6.716300010681152, + "learning_rate": 7.736053117940995e-05, + "loss": 0.6242, + "step": 17288 + }, + { + "epoch": 1.1714208279693745, + "grad_norm": 6.748067378997803, + "learning_rate": 7.735916216031213e-05, + "loss": 0.7363, + "step": 17289 + }, + { + "epoch": 1.1714885832373467, + "grad_norm": 5.220841407775879, + "learning_rate": 7.735779314121433e-05, + "loss": 0.5735, + "step": 17290 + }, + { + "epoch": 1.1715563385053187, + "grad_norm": 5.8814496994018555, + "learning_rate": 7.73564241221165e-05, + "loss": 0.7137, + "step": 17291 + }, + { + "epoch": 1.171624093773291, + "grad_norm": 7.384030342102051, + "learning_rate": 7.735505510301869e-05, + "loss": 0.6728, + "step": 17292 + }, + { + "epoch": 1.171691849041263, + "grad_norm": 4.854050636291504, + "learning_rate": 7.735368608392088e-05, + "loss": 0.8363, + "step": 17293 + }, + { + "epoch": 1.171759604309235, + "grad_norm": 6.271542072296143, + "learning_rate": 7.735231706482306e-05, + "loss": 0.6886, + "step": 17294 + }, + { + "epoch": 1.171827359577207, + "grad_norm": 8.313980102539062, + "learning_rate": 7.735094804572524e-05, + "loss": 0.6262, + "step": 17295 + }, + { + "epoch": 1.1718951148451793, + "grad_norm": 5.353496551513672, + "learning_rate": 7.734957902662743e-05, + "loss": 0.7738, + "step": 17296 + }, + { + "epoch": 1.1719628701131513, + "grad_norm": 5.875715732574463, + "learning_rate": 7.734821000752961e-05, + "loss": 0.6644, + "step": 17297 + }, + { + "epoch": 1.1720306253811235, + "grad_norm": 5.350170135498047, + "learning_rate": 7.73468409884318e-05, + "loss": 0.5832, + "step": 17298 + }, + { + "epoch": 1.1720983806490954, + "grad_norm": 4.67872953414917, + "learning_rate": 7.734547196933398e-05, + "loss": 0.6281, + "step": 17299 + }, + { + "epoch": 1.1721661359170676, + "grad_norm": 6.967146396636963, + "learning_rate": 7.734410295023617e-05, + "loss": 0.5029, + "step": 17300 + }, + { + "epoch": 1.1722338911850396, + "grad_norm": 5.5732221603393555, + "learning_rate": 7.734273393113835e-05, + "loss": 0.7479, + "step": 17301 + }, + { + "epoch": 1.1723016464530116, + "grad_norm": 5.325867176055908, + "learning_rate": 7.734136491204053e-05, + "loss": 0.5204, + "step": 17302 + }, + { + "epoch": 1.1723694017209838, + "grad_norm": 6.015812873840332, + "learning_rate": 7.733999589294271e-05, + "loss": 0.7403, + "step": 17303 + }, + { + "epoch": 1.172437156988956, + "grad_norm": 7.1903581619262695, + "learning_rate": 7.73386268738449e-05, + "loss": 0.6715, + "step": 17304 + }, + { + "epoch": 1.172504912256928, + "grad_norm": 5.407469272613525, + "learning_rate": 7.733725785474708e-05, + "loss": 0.8218, + "step": 17305 + }, + { + "epoch": 1.1725726675249, + "grad_norm": 5.6599249839782715, + "learning_rate": 7.733588883564926e-05, + "loss": 0.7434, + "step": 17306 + }, + { + "epoch": 1.1726404227928722, + "grad_norm": 5.750308036804199, + "learning_rate": 7.733451981655145e-05, + "loss": 0.8658, + "step": 17307 + }, + { + "epoch": 1.1727081780608442, + "grad_norm": 5.779434680938721, + "learning_rate": 7.733315079745363e-05, + "loss": 0.5016, + "step": 17308 + }, + { + "epoch": 1.1727759333288164, + "grad_norm": 4.432728290557861, + "learning_rate": 7.733178177835582e-05, + "loss": 0.5855, + "step": 17309 + }, + { + "epoch": 1.1728436885967883, + "grad_norm": 5.154073715209961, + "learning_rate": 7.7330412759258e-05, + "loss": 0.6117, + "step": 17310 + }, + { + "epoch": 1.1729114438647605, + "grad_norm": 5.967854022979736, + "learning_rate": 7.732904374016018e-05, + "loss": 0.8827, + "step": 17311 + }, + { + "epoch": 1.1729791991327325, + "grad_norm": 7.544610500335693, + "learning_rate": 7.732767472106236e-05, + "loss": 0.8073, + "step": 17312 + }, + { + "epoch": 1.1730469544007047, + "grad_norm": 6.066003322601318, + "learning_rate": 7.732630570196454e-05, + "loss": 0.9099, + "step": 17313 + }, + { + "epoch": 1.1731147096686767, + "grad_norm": 6.47094202041626, + "learning_rate": 7.732493668286673e-05, + "loss": 0.7394, + "step": 17314 + }, + { + "epoch": 1.173182464936649, + "grad_norm": 6.1977925300598145, + "learning_rate": 7.732356766376891e-05, + "loss": 0.5849, + "step": 17315 + }, + { + "epoch": 1.1732502202046209, + "grad_norm": 6.772620677947998, + "learning_rate": 7.73221986446711e-05, + "loss": 0.8209, + "step": 17316 + }, + { + "epoch": 1.173317975472593, + "grad_norm": 5.0960693359375, + "learning_rate": 7.732082962557328e-05, + "loss": 0.6163, + "step": 17317 + }, + { + "epoch": 1.173385730740565, + "grad_norm": 8.258513450622559, + "learning_rate": 7.731946060647547e-05, + "loss": 0.7963, + "step": 17318 + }, + { + "epoch": 1.1734534860085373, + "grad_norm": 5.403371810913086, + "learning_rate": 7.731809158737765e-05, + "loss": 0.5509, + "step": 17319 + }, + { + "epoch": 1.1735212412765093, + "grad_norm": 4.698668956756592, + "learning_rate": 7.731672256827983e-05, + "loss": 0.712, + "step": 17320 + }, + { + "epoch": 1.1735889965444812, + "grad_norm": 5.766746997833252, + "learning_rate": 7.731535354918201e-05, + "loss": 0.7452, + "step": 17321 + }, + { + "epoch": 1.1736567518124534, + "grad_norm": 5.701009750366211, + "learning_rate": 7.731398453008419e-05, + "loss": 1.0187, + "step": 17322 + }, + { + "epoch": 1.1737245070804254, + "grad_norm": 4.641021251678467, + "learning_rate": 7.731261551098638e-05, + "loss": 0.5497, + "step": 17323 + }, + { + "epoch": 1.1737922623483976, + "grad_norm": 6.468865871429443, + "learning_rate": 7.731124649188857e-05, + "loss": 0.9236, + "step": 17324 + }, + { + "epoch": 1.1738600176163696, + "grad_norm": 4.343420505523682, + "learning_rate": 7.730987747279075e-05, + "loss": 0.4876, + "step": 17325 + }, + { + "epoch": 1.1739277728843418, + "grad_norm": 6.445211887359619, + "learning_rate": 7.730850845369293e-05, + "loss": 0.7793, + "step": 17326 + }, + { + "epoch": 1.1739955281523138, + "grad_norm": 5.686356544494629, + "learning_rate": 7.730713943459512e-05, + "loss": 0.6946, + "step": 17327 + }, + { + "epoch": 1.174063283420286, + "grad_norm": 7.529437065124512, + "learning_rate": 7.73057704154973e-05, + "loss": 0.7395, + "step": 17328 + }, + { + "epoch": 1.174131038688258, + "grad_norm": 8.172284126281738, + "learning_rate": 7.730440139639948e-05, + "loss": 0.6705, + "step": 17329 + }, + { + "epoch": 1.1741987939562302, + "grad_norm": 8.046764373779297, + "learning_rate": 7.730303237730166e-05, + "loss": 0.8011, + "step": 17330 + }, + { + "epoch": 1.1742665492242021, + "grad_norm": 4.910728454589844, + "learning_rate": 7.730166335820384e-05, + "loss": 0.7334, + "step": 17331 + }, + { + "epoch": 1.1743343044921744, + "grad_norm": 6.085247993469238, + "learning_rate": 7.730029433910603e-05, + "loss": 0.5327, + "step": 17332 + }, + { + "epoch": 1.1744020597601463, + "grad_norm": 4.2402520179748535, + "learning_rate": 7.729892532000822e-05, + "loss": 0.7252, + "step": 17333 + }, + { + "epoch": 1.1744698150281185, + "grad_norm": 7.234801769256592, + "learning_rate": 7.72975563009104e-05, + "loss": 0.9923, + "step": 17334 + }, + { + "epoch": 1.1745375702960905, + "grad_norm": 7.858911991119385, + "learning_rate": 7.729618728181258e-05, + "loss": 0.6549, + "step": 17335 + }, + { + "epoch": 1.1746053255640625, + "grad_norm": 4.676433563232422, + "learning_rate": 7.729481826271477e-05, + "loss": 0.7768, + "step": 17336 + }, + { + "epoch": 1.1746730808320347, + "grad_norm": 4.907873630523682, + "learning_rate": 7.729344924361695e-05, + "loss": 0.6704, + "step": 17337 + }, + { + "epoch": 1.1747408361000067, + "grad_norm": 5.7177839279174805, + "learning_rate": 7.729208022451913e-05, + "loss": 0.713, + "step": 17338 + }, + { + "epoch": 1.1748085913679789, + "grad_norm": 5.389528751373291, + "learning_rate": 7.729071120542132e-05, + "loss": 0.6746, + "step": 17339 + }, + { + "epoch": 1.1748763466359509, + "grad_norm": 7.09976863861084, + "learning_rate": 7.72893421863235e-05, + "loss": 0.6885, + "step": 17340 + }, + { + "epoch": 1.174944101903923, + "grad_norm": 7.006227493286133, + "learning_rate": 7.728797316722569e-05, + "loss": 0.9149, + "step": 17341 + }, + { + "epoch": 1.175011857171895, + "grad_norm": 6.186046600341797, + "learning_rate": 7.728660414812788e-05, + "loss": 0.7265, + "step": 17342 + }, + { + "epoch": 1.1750796124398672, + "grad_norm": 6.1935834884643555, + "learning_rate": 7.728523512903006e-05, + "loss": 0.8993, + "step": 17343 + }, + { + "epoch": 1.1751473677078392, + "grad_norm": 5.259952545166016, + "learning_rate": 7.728386610993224e-05, + "loss": 0.7487, + "step": 17344 + }, + { + "epoch": 1.1752151229758114, + "grad_norm": 7.327434539794922, + "learning_rate": 7.728249709083442e-05, + "loss": 1.0751, + "step": 17345 + }, + { + "epoch": 1.1752828782437834, + "grad_norm": 5.855953216552734, + "learning_rate": 7.728112807173661e-05, + "loss": 0.6628, + "step": 17346 + }, + { + "epoch": 1.1753506335117556, + "grad_norm": 5.843200206756592, + "learning_rate": 7.72797590526388e-05, + "loss": 0.8061, + "step": 17347 + }, + { + "epoch": 1.1754183887797276, + "grad_norm": 5.277237892150879, + "learning_rate": 7.727839003354097e-05, + "loss": 0.7312, + "step": 17348 + }, + { + "epoch": 1.1754861440476998, + "grad_norm": 5.734367847442627, + "learning_rate": 7.727702101444315e-05, + "loss": 0.6302, + "step": 17349 + }, + { + "epoch": 1.1755538993156718, + "grad_norm": 4.641081809997559, + "learning_rate": 7.727565199534535e-05, + "loss": 0.5845, + "step": 17350 + }, + { + "epoch": 1.1756216545836438, + "grad_norm": 5.636540412902832, + "learning_rate": 7.727428297624753e-05, + "loss": 0.7319, + "step": 17351 + }, + { + "epoch": 1.175689409851616, + "grad_norm": 5.056623458862305, + "learning_rate": 7.727291395714971e-05, + "loss": 0.6003, + "step": 17352 + }, + { + "epoch": 1.1757571651195882, + "grad_norm": 8.262724876403809, + "learning_rate": 7.727154493805189e-05, + "loss": 0.7032, + "step": 17353 + }, + { + "epoch": 1.1758249203875601, + "grad_norm": 5.76828145980835, + "learning_rate": 7.727017591895407e-05, + "loss": 0.6717, + "step": 17354 + }, + { + "epoch": 1.1758926756555321, + "grad_norm": 4.917673587799072, + "learning_rate": 7.726880689985626e-05, + "loss": 0.8381, + "step": 17355 + }, + { + "epoch": 1.1759604309235043, + "grad_norm": 5.0473127365112305, + "learning_rate": 7.726743788075844e-05, + "loss": 0.7817, + "step": 17356 + }, + { + "epoch": 1.1760281861914763, + "grad_norm": 6.269956588745117, + "learning_rate": 7.726606886166062e-05, + "loss": 0.7135, + "step": 17357 + }, + { + "epoch": 1.1760959414594485, + "grad_norm": 5.429281711578369, + "learning_rate": 7.72646998425628e-05, + "loss": 0.5025, + "step": 17358 + }, + { + "epoch": 1.1761636967274205, + "grad_norm": 6.20123815536499, + "learning_rate": 7.726333082346499e-05, + "loss": 0.618, + "step": 17359 + }, + { + "epoch": 1.1762314519953927, + "grad_norm": 7.61653995513916, + "learning_rate": 7.726196180436718e-05, + "loss": 0.6468, + "step": 17360 + }, + { + "epoch": 1.1762992072633647, + "grad_norm": 6.944809913635254, + "learning_rate": 7.726059278526936e-05, + "loss": 0.6998, + "step": 17361 + }, + { + "epoch": 1.1763669625313369, + "grad_norm": 5.237542629241943, + "learning_rate": 7.725922376617154e-05, + "loss": 0.6578, + "step": 17362 + }, + { + "epoch": 1.1764347177993089, + "grad_norm": 4.988475799560547, + "learning_rate": 7.725785474707372e-05, + "loss": 0.5578, + "step": 17363 + }, + { + "epoch": 1.176502473067281, + "grad_norm": 6.360912322998047, + "learning_rate": 7.725648572797591e-05, + "loss": 0.6835, + "step": 17364 + }, + { + "epoch": 1.176570228335253, + "grad_norm": 5.701276779174805, + "learning_rate": 7.72551167088781e-05, + "loss": 0.6863, + "step": 17365 + }, + { + "epoch": 1.1766379836032252, + "grad_norm": 5.497084140777588, + "learning_rate": 7.725374768978027e-05, + "loss": 0.6515, + "step": 17366 + }, + { + "epoch": 1.1767057388711972, + "grad_norm": 5.4620771408081055, + "learning_rate": 7.725237867068246e-05, + "loss": 0.6462, + "step": 17367 + }, + { + "epoch": 1.1767734941391694, + "grad_norm": 5.820620059967041, + "learning_rate": 7.725100965158464e-05, + "loss": 0.5846, + "step": 17368 + }, + { + "epoch": 1.1768412494071414, + "grad_norm": 5.675363540649414, + "learning_rate": 7.724964063248683e-05, + "loss": 0.8976, + "step": 17369 + }, + { + "epoch": 1.1769090046751134, + "grad_norm": 6.178694248199463, + "learning_rate": 7.724827161338901e-05, + "loss": 0.7486, + "step": 17370 + }, + { + "epoch": 1.1769767599430856, + "grad_norm": 7.601099014282227, + "learning_rate": 7.724690259429119e-05, + "loss": 0.6428, + "step": 17371 + }, + { + "epoch": 1.1770445152110576, + "grad_norm": 5.221908092498779, + "learning_rate": 7.724553357519337e-05, + "loss": 0.5738, + "step": 17372 + }, + { + "epoch": 1.1771122704790298, + "grad_norm": 8.009245872497559, + "learning_rate": 7.724416455609556e-05, + "loss": 0.8131, + "step": 17373 + }, + { + "epoch": 1.1771800257470018, + "grad_norm": 4.87775182723999, + "learning_rate": 7.724279553699774e-05, + "loss": 0.8272, + "step": 17374 + }, + { + "epoch": 1.177247781014974, + "grad_norm": 6.45685338973999, + "learning_rate": 7.724142651789993e-05, + "loss": 0.7668, + "step": 17375 + }, + { + "epoch": 1.177315536282946, + "grad_norm": 6.50517463684082, + "learning_rate": 7.72400574988021e-05, + "loss": 1.0197, + "step": 17376 + }, + { + "epoch": 1.1773832915509181, + "grad_norm": 8.832415580749512, + "learning_rate": 7.723868847970429e-05, + "loss": 0.5662, + "step": 17377 + }, + { + "epoch": 1.1774510468188901, + "grad_norm": 4.831343650817871, + "learning_rate": 7.723731946060648e-05, + "loss": 0.5595, + "step": 17378 + }, + { + "epoch": 1.1775188020868623, + "grad_norm": 5.61880350112915, + "learning_rate": 7.723595044150866e-05, + "loss": 0.7912, + "step": 17379 + }, + { + "epoch": 1.1775865573548343, + "grad_norm": 6.002482891082764, + "learning_rate": 7.723458142241084e-05, + "loss": 0.6987, + "step": 17380 + }, + { + "epoch": 1.1776543126228065, + "grad_norm": 8.140533447265625, + "learning_rate": 7.723321240331302e-05, + "loss": 0.863, + "step": 17381 + }, + { + "epoch": 1.1777220678907785, + "grad_norm": 6.138432502746582, + "learning_rate": 7.723184338421521e-05, + "loss": 0.8564, + "step": 17382 + }, + { + "epoch": 1.1777898231587507, + "grad_norm": 5.278548717498779, + "learning_rate": 7.72304743651174e-05, + "loss": 0.6709, + "step": 17383 + }, + { + "epoch": 1.1778575784267227, + "grad_norm": 5.611672401428223, + "learning_rate": 7.722910534601958e-05, + "loss": 0.8298, + "step": 17384 + }, + { + "epoch": 1.1779253336946947, + "grad_norm": 5.170347690582275, + "learning_rate": 7.722773632692177e-05, + "loss": 1.0291, + "step": 17385 + }, + { + "epoch": 1.1779930889626669, + "grad_norm": 6.799687385559082, + "learning_rate": 7.722636730782395e-05, + "loss": 0.6598, + "step": 17386 + }, + { + "epoch": 1.1780608442306388, + "grad_norm": 8.000127792358398, + "learning_rate": 7.722499828872613e-05, + "loss": 0.6322, + "step": 17387 + }, + { + "epoch": 1.178128599498611, + "grad_norm": 6.476943492889404, + "learning_rate": 7.722362926962832e-05, + "loss": 0.6098, + "step": 17388 + }, + { + "epoch": 1.178196354766583, + "grad_norm": 8.889633178710938, + "learning_rate": 7.72222602505305e-05, + "loss": 0.9242, + "step": 17389 + }, + { + "epoch": 1.1782641100345552, + "grad_norm": 5.422774791717529, + "learning_rate": 7.722089123143268e-05, + "loss": 0.6973, + "step": 17390 + }, + { + "epoch": 1.1783318653025272, + "grad_norm": 5.284212112426758, + "learning_rate": 7.721952221233486e-05, + "loss": 0.5893, + "step": 17391 + }, + { + "epoch": 1.1783996205704994, + "grad_norm": 5.717512607574463, + "learning_rate": 7.721815319323706e-05, + "loss": 0.6507, + "step": 17392 + }, + { + "epoch": 1.1784673758384714, + "grad_norm": 5.993809700012207, + "learning_rate": 7.721678417413924e-05, + "loss": 0.7245, + "step": 17393 + }, + { + "epoch": 1.1785351311064436, + "grad_norm": 5.827890872955322, + "learning_rate": 7.721541515504142e-05, + "loss": 0.8094, + "step": 17394 + }, + { + "epoch": 1.1786028863744156, + "grad_norm": 5.711609363555908, + "learning_rate": 7.72140461359436e-05, + "loss": 0.6623, + "step": 17395 + }, + { + "epoch": 1.1786706416423878, + "grad_norm": 4.487983226776123, + "learning_rate": 7.72126771168458e-05, + "loss": 0.6765, + "step": 17396 + }, + { + "epoch": 1.1787383969103598, + "grad_norm": 6.461648464202881, + "learning_rate": 7.721130809774797e-05, + "loss": 0.7344, + "step": 17397 + }, + { + "epoch": 1.178806152178332, + "grad_norm": 5.570317268371582, + "learning_rate": 7.720993907865015e-05, + "loss": 0.674, + "step": 17398 + }, + { + "epoch": 1.178873907446304, + "grad_norm": 6.213165760040283, + "learning_rate": 7.720857005955233e-05, + "loss": 0.6197, + "step": 17399 + }, + { + "epoch": 1.178941662714276, + "grad_norm": 6.4340500831604, + "learning_rate": 7.720720104045451e-05, + "loss": 0.7551, + "step": 17400 + }, + { + "epoch": 1.1790094179822481, + "grad_norm": 9.295565605163574, + "learning_rate": 7.720583202135671e-05, + "loss": 0.8534, + "step": 17401 + }, + { + "epoch": 1.1790771732502203, + "grad_norm": 5.942518711090088, + "learning_rate": 7.720446300225889e-05, + "loss": 0.5783, + "step": 17402 + }, + { + "epoch": 1.1791449285181923, + "grad_norm": 6.883828163146973, + "learning_rate": 7.720309398316107e-05, + "loss": 0.8772, + "step": 17403 + }, + { + "epoch": 1.1792126837861643, + "grad_norm": 5.310081958770752, + "learning_rate": 7.720172496406325e-05, + "loss": 0.685, + "step": 17404 + }, + { + "epoch": 1.1792804390541365, + "grad_norm": 5.378293037414551, + "learning_rate": 7.720035594496544e-05, + "loss": 0.7761, + "step": 17405 + }, + { + "epoch": 1.1793481943221085, + "grad_norm": 5.762744426727295, + "learning_rate": 7.719898692586762e-05, + "loss": 0.7877, + "step": 17406 + }, + { + "epoch": 1.1794159495900807, + "grad_norm": 6.828834533691406, + "learning_rate": 7.71976179067698e-05, + "loss": 0.7059, + "step": 17407 + }, + { + "epoch": 1.1794837048580527, + "grad_norm": 5.380794525146484, + "learning_rate": 7.719624888767198e-05, + "loss": 0.8572, + "step": 17408 + }, + { + "epoch": 1.1795514601260249, + "grad_norm": 5.599371910095215, + "learning_rate": 7.719487986857417e-05, + "loss": 0.6622, + "step": 17409 + }, + { + "epoch": 1.1796192153939968, + "grad_norm": 6.256725311279297, + "learning_rate": 7.719351084947636e-05, + "loss": 0.6448, + "step": 17410 + }, + { + "epoch": 1.179686970661969, + "grad_norm": 5.341912746429443, + "learning_rate": 7.719214183037854e-05, + "loss": 0.5639, + "step": 17411 + }, + { + "epoch": 1.179754725929941, + "grad_norm": 10.262722969055176, + "learning_rate": 7.719077281128072e-05, + "loss": 0.6745, + "step": 17412 + }, + { + "epoch": 1.1798224811979132, + "grad_norm": 6.846330642700195, + "learning_rate": 7.71894037921829e-05, + "loss": 0.6262, + "step": 17413 + }, + { + "epoch": 1.1798902364658852, + "grad_norm": 5.529928684234619, + "learning_rate": 7.718803477308508e-05, + "loss": 0.8384, + "step": 17414 + }, + { + "epoch": 1.1799579917338574, + "grad_norm": 5.655308723449707, + "learning_rate": 7.718666575398727e-05, + "loss": 0.701, + "step": 17415 + }, + { + "epoch": 1.1800257470018294, + "grad_norm": 5.416240692138672, + "learning_rate": 7.718529673488945e-05, + "loss": 0.5655, + "step": 17416 + }, + { + "epoch": 1.1800935022698016, + "grad_norm": 6.7942705154418945, + "learning_rate": 7.718392771579163e-05, + "loss": 0.7546, + "step": 17417 + }, + { + "epoch": 1.1801612575377736, + "grad_norm": 6.258289813995361, + "learning_rate": 7.718255869669382e-05, + "loss": 0.8086, + "step": 17418 + }, + { + "epoch": 1.1802290128057455, + "grad_norm": 7.271621227264404, + "learning_rate": 7.718118967759601e-05, + "loss": 0.7594, + "step": 17419 + }, + { + "epoch": 1.1802967680737178, + "grad_norm": 7.857482433319092, + "learning_rate": 7.717982065849819e-05, + "loss": 0.7947, + "step": 17420 + }, + { + "epoch": 1.1803645233416897, + "grad_norm": 5.672320365905762, + "learning_rate": 7.717845163940037e-05, + "loss": 0.9218, + "step": 17421 + }, + { + "epoch": 1.180432278609662, + "grad_norm": 7.510291576385498, + "learning_rate": 7.717708262030255e-05, + "loss": 0.8078, + "step": 17422 + }, + { + "epoch": 1.180500033877634, + "grad_norm": 7.348386287689209, + "learning_rate": 7.717571360120473e-05, + "loss": 1.0443, + "step": 17423 + }, + { + "epoch": 1.1805677891456061, + "grad_norm": 5.457333087921143, + "learning_rate": 7.717434458210692e-05, + "loss": 0.7723, + "step": 17424 + }, + { + "epoch": 1.180635544413578, + "grad_norm": 6.572354793548584, + "learning_rate": 7.71729755630091e-05, + "loss": 0.5917, + "step": 17425 + }, + { + "epoch": 1.1807032996815503, + "grad_norm": 5.724460601806641, + "learning_rate": 7.717160654391129e-05, + "loss": 0.7562, + "step": 17426 + }, + { + "epoch": 1.1807710549495223, + "grad_norm": 6.148141860961914, + "learning_rate": 7.717023752481347e-05, + "loss": 0.8323, + "step": 17427 + }, + { + "epoch": 1.1808388102174945, + "grad_norm": 5.528962135314941, + "learning_rate": 7.716886850571566e-05, + "loss": 0.6606, + "step": 17428 + }, + { + "epoch": 1.1809065654854665, + "grad_norm": 7.883500099182129, + "learning_rate": 7.716749948661784e-05, + "loss": 0.8106, + "step": 17429 + }, + { + "epoch": 1.1809743207534387, + "grad_norm": 4.686129093170166, + "learning_rate": 7.716613046752002e-05, + "loss": 0.6902, + "step": 17430 + }, + { + "epoch": 1.1810420760214106, + "grad_norm": 6.630269527435303, + "learning_rate": 7.71647614484222e-05, + "loss": 0.6001, + "step": 17431 + }, + { + "epoch": 1.1811098312893828, + "grad_norm": 6.239437580108643, + "learning_rate": 7.71633924293244e-05, + "loss": 0.5679, + "step": 17432 + }, + { + "epoch": 1.1811775865573548, + "grad_norm": 5.275092124938965, + "learning_rate": 7.716202341022657e-05, + "loss": 0.6574, + "step": 17433 + }, + { + "epoch": 1.1812453418253268, + "grad_norm": 4.501307487487793, + "learning_rate": 7.716065439112875e-05, + "loss": 0.6631, + "step": 17434 + }, + { + "epoch": 1.181313097093299, + "grad_norm": 4.495173931121826, + "learning_rate": 7.715928537203095e-05, + "loss": 0.5067, + "step": 17435 + }, + { + "epoch": 1.181380852361271, + "grad_norm": 5.3275275230407715, + "learning_rate": 7.715791635293313e-05, + "loss": 0.5386, + "step": 17436 + }, + { + "epoch": 1.1814486076292432, + "grad_norm": 9.05092716217041, + "learning_rate": 7.715654733383531e-05, + "loss": 0.7015, + "step": 17437 + }, + { + "epoch": 1.1815163628972152, + "grad_norm": 6.003642559051514, + "learning_rate": 7.71551783147375e-05, + "loss": 0.7313, + "step": 17438 + }, + { + "epoch": 1.1815841181651874, + "grad_norm": 4.795654296875, + "learning_rate": 7.715380929563968e-05, + "loss": 0.6542, + "step": 17439 + }, + { + "epoch": 1.1816518734331594, + "grad_norm": 7.203171730041504, + "learning_rate": 7.715244027654186e-05, + "loss": 0.7249, + "step": 17440 + }, + { + "epoch": 1.1817196287011316, + "grad_norm": 6.126667022705078, + "learning_rate": 7.715107125744404e-05, + "loss": 0.7768, + "step": 17441 + }, + { + "epoch": 1.1817873839691035, + "grad_norm": 8.417376518249512, + "learning_rate": 7.714970223834624e-05, + "loss": 0.5883, + "step": 17442 + }, + { + "epoch": 1.1818551392370757, + "grad_norm": 10.3145112991333, + "learning_rate": 7.714833321924842e-05, + "loss": 0.7328, + "step": 17443 + }, + { + "epoch": 1.1819228945050477, + "grad_norm": 6.670629024505615, + "learning_rate": 7.71469642001506e-05, + "loss": 0.8297, + "step": 17444 + }, + { + "epoch": 1.18199064977302, + "grad_norm": 6.059311389923096, + "learning_rate": 7.714559518105278e-05, + "loss": 0.7134, + "step": 17445 + }, + { + "epoch": 1.182058405040992, + "grad_norm": 5.818484306335449, + "learning_rate": 7.714422616195496e-05, + "loss": 0.5291, + "step": 17446 + }, + { + "epoch": 1.1821261603089641, + "grad_norm": 5.98686408996582, + "learning_rate": 7.714285714285715e-05, + "loss": 0.5893, + "step": 17447 + }, + { + "epoch": 1.182193915576936, + "grad_norm": 5.623051643371582, + "learning_rate": 7.714148812375933e-05, + "loss": 0.6766, + "step": 17448 + }, + { + "epoch": 1.182261670844908, + "grad_norm": 6.6503586769104, + "learning_rate": 7.714011910466151e-05, + "loss": 0.551, + "step": 17449 + }, + { + "epoch": 1.1823294261128803, + "grad_norm": 8.11122989654541, + "learning_rate": 7.71387500855637e-05, + "loss": 0.6561, + "step": 17450 + }, + { + "epoch": 1.1823971813808525, + "grad_norm": 6.778097152709961, + "learning_rate": 7.713738106646589e-05, + "loss": 0.8429, + "step": 17451 + }, + { + "epoch": 1.1824649366488245, + "grad_norm": 4.3668107986450195, + "learning_rate": 7.713601204736807e-05, + "loss": 0.6024, + "step": 17452 + }, + { + "epoch": 1.1825326919167964, + "grad_norm": 10.554072380065918, + "learning_rate": 7.713464302827025e-05, + "loss": 0.5214, + "step": 17453 + }, + { + "epoch": 1.1826004471847686, + "grad_norm": 6.878169059753418, + "learning_rate": 7.713327400917243e-05, + "loss": 0.727, + "step": 17454 + }, + { + "epoch": 1.1826682024527406, + "grad_norm": 5.530303001403809, + "learning_rate": 7.713190499007461e-05, + "loss": 0.6007, + "step": 17455 + }, + { + "epoch": 1.1827359577207128, + "grad_norm": 6.1936354637146, + "learning_rate": 7.71305359709768e-05, + "loss": 0.6258, + "step": 17456 + }, + { + "epoch": 1.1828037129886848, + "grad_norm": 8.023333549499512, + "learning_rate": 7.712916695187898e-05, + "loss": 0.6559, + "step": 17457 + }, + { + "epoch": 1.182871468256657, + "grad_norm": 5.9769110679626465, + "learning_rate": 7.712779793278116e-05, + "loss": 0.6817, + "step": 17458 + }, + { + "epoch": 1.182939223524629, + "grad_norm": 6.784766674041748, + "learning_rate": 7.712642891368334e-05, + "loss": 0.6251, + "step": 17459 + }, + { + "epoch": 1.1830069787926012, + "grad_norm": 5.5998382568359375, + "learning_rate": 7.712505989458554e-05, + "loss": 0.7717, + "step": 17460 + }, + { + "epoch": 1.1830747340605732, + "grad_norm": 5.586696147918701, + "learning_rate": 7.712369087548772e-05, + "loss": 0.6819, + "step": 17461 + }, + { + "epoch": 1.1831424893285454, + "grad_norm": 5.863603591918945, + "learning_rate": 7.71223218563899e-05, + "loss": 0.6594, + "step": 17462 + }, + { + "epoch": 1.1832102445965174, + "grad_norm": 9.297795295715332, + "learning_rate": 7.712095283729208e-05, + "loss": 0.7023, + "step": 17463 + }, + { + "epoch": 1.1832779998644896, + "grad_norm": 4.512932300567627, + "learning_rate": 7.711958381819426e-05, + "loss": 0.5351, + "step": 17464 + }, + { + "epoch": 1.1833457551324615, + "grad_norm": 5.763913631439209, + "learning_rate": 7.711821479909645e-05, + "loss": 0.5338, + "step": 17465 + }, + { + "epoch": 1.1834135104004337, + "grad_norm": 6.064705848693848, + "learning_rate": 7.711684577999863e-05, + "loss": 0.6775, + "step": 17466 + }, + { + "epoch": 1.1834812656684057, + "grad_norm": 7.101622581481934, + "learning_rate": 7.711547676090081e-05, + "loss": 0.8101, + "step": 17467 + }, + { + "epoch": 1.1835490209363777, + "grad_norm": 5.599690914154053, + "learning_rate": 7.7114107741803e-05, + "loss": 0.7406, + "step": 17468 + }, + { + "epoch": 1.18361677620435, + "grad_norm": 6.1384124755859375, + "learning_rate": 7.711273872270518e-05, + "loss": 0.8194, + "step": 17469 + }, + { + "epoch": 1.1836845314723219, + "grad_norm": 4.1809611320495605, + "learning_rate": 7.711136970360737e-05, + "loss": 0.672, + "step": 17470 + }, + { + "epoch": 1.183752286740294, + "grad_norm": 6.009151935577393, + "learning_rate": 7.711000068450955e-05, + "loss": 0.9584, + "step": 17471 + }, + { + "epoch": 1.183820042008266, + "grad_norm": 5.323096752166748, + "learning_rate": 7.710863166541173e-05, + "loss": 0.51, + "step": 17472 + }, + { + "epoch": 1.1838877972762383, + "grad_norm": 5.447154998779297, + "learning_rate": 7.710726264631391e-05, + "loss": 0.5468, + "step": 17473 + }, + { + "epoch": 1.1839555525442103, + "grad_norm": 8.218817710876465, + "learning_rate": 7.71058936272161e-05, + "loss": 0.7344, + "step": 17474 + }, + { + "epoch": 1.1840233078121825, + "grad_norm": 5.56463098526001, + "learning_rate": 7.710452460811828e-05, + "loss": 0.5611, + "step": 17475 + }, + { + "epoch": 1.1840910630801544, + "grad_norm": 6.369572639465332, + "learning_rate": 7.710315558902046e-05, + "loss": 0.9165, + "step": 17476 + }, + { + "epoch": 1.1841588183481266, + "grad_norm": 7.159587860107422, + "learning_rate": 7.710178656992265e-05, + "loss": 0.8848, + "step": 17477 + }, + { + "epoch": 1.1842265736160986, + "grad_norm": 5.673058986663818, + "learning_rate": 7.710041755082484e-05, + "loss": 0.7024, + "step": 17478 + }, + { + "epoch": 1.1842943288840708, + "grad_norm": 4.88538122177124, + "learning_rate": 7.709904853172702e-05, + "loss": 0.7115, + "step": 17479 + }, + { + "epoch": 1.1843620841520428, + "grad_norm": 4.682442665100098, + "learning_rate": 7.70976795126292e-05, + "loss": 0.5612, + "step": 17480 + }, + { + "epoch": 1.184429839420015, + "grad_norm": 5.996410369873047, + "learning_rate": 7.70963104935314e-05, + "loss": 0.6594, + "step": 17481 + }, + { + "epoch": 1.184497594687987, + "grad_norm": 7.457950115203857, + "learning_rate": 7.709494147443357e-05, + "loss": 0.7196, + "step": 17482 + }, + { + "epoch": 1.184565349955959, + "grad_norm": 9.7376127243042, + "learning_rate": 7.709357245533575e-05, + "loss": 0.5854, + "step": 17483 + }, + { + "epoch": 1.1846331052239312, + "grad_norm": 7.527082920074463, + "learning_rate": 7.709220343623795e-05, + "loss": 0.6919, + "step": 17484 + }, + { + "epoch": 1.1847008604919032, + "grad_norm": 6.091211795806885, + "learning_rate": 7.709083441714013e-05, + "loss": 1.0379, + "step": 17485 + }, + { + "epoch": 1.1847686157598754, + "grad_norm": 5.3425750732421875, + "learning_rate": 7.708946539804231e-05, + "loss": 0.8544, + "step": 17486 + }, + { + "epoch": 1.1848363710278473, + "grad_norm": 5.463139057159424, + "learning_rate": 7.708809637894449e-05, + "loss": 0.5774, + "step": 17487 + }, + { + "epoch": 1.1849041262958195, + "grad_norm": 4.83765983581543, + "learning_rate": 7.708672735984668e-05, + "loss": 0.6807, + "step": 17488 + }, + { + "epoch": 1.1849718815637915, + "grad_norm": 6.051468372344971, + "learning_rate": 7.708535834074886e-05, + "loss": 0.7163, + "step": 17489 + }, + { + "epoch": 1.1850396368317637, + "grad_norm": 5.282536506652832, + "learning_rate": 7.708398932165104e-05, + "loss": 0.7486, + "step": 17490 + }, + { + "epoch": 1.1851073920997357, + "grad_norm": 6.964774131774902, + "learning_rate": 7.708262030255322e-05, + "loss": 0.8596, + "step": 17491 + }, + { + "epoch": 1.185175147367708, + "grad_norm": 5.607154846191406, + "learning_rate": 7.70812512834554e-05, + "loss": 0.5895, + "step": 17492 + }, + { + "epoch": 1.1852429026356799, + "grad_norm": 6.054999351501465, + "learning_rate": 7.70798822643576e-05, + "loss": 0.7251, + "step": 17493 + }, + { + "epoch": 1.185310657903652, + "grad_norm": 5.619368553161621, + "learning_rate": 7.707851324525978e-05, + "loss": 0.4792, + "step": 17494 + }, + { + "epoch": 1.185378413171624, + "grad_norm": 5.682523250579834, + "learning_rate": 7.707714422616196e-05, + "loss": 0.6317, + "step": 17495 + }, + { + "epoch": 1.1854461684395963, + "grad_norm": 5.336386680603027, + "learning_rate": 7.707577520706414e-05, + "loss": 0.7298, + "step": 17496 + }, + { + "epoch": 1.1855139237075683, + "grad_norm": 4.904666900634766, + "learning_rate": 7.707440618796633e-05, + "loss": 0.5701, + "step": 17497 + }, + { + "epoch": 1.1855816789755402, + "grad_norm": 7.6780571937561035, + "learning_rate": 7.707303716886851e-05, + "loss": 0.8559, + "step": 17498 + }, + { + "epoch": 1.1856494342435124, + "grad_norm": 6.725180149078369, + "learning_rate": 7.70716681497707e-05, + "loss": 0.7298, + "step": 17499 + }, + { + "epoch": 1.1857171895114846, + "grad_norm": 6.419045925140381, + "learning_rate": 7.707029913067287e-05, + "loss": 0.5927, + "step": 17500 + }, + { + "epoch": 1.1857849447794566, + "grad_norm": 5.36964225769043, + "learning_rate": 7.706893011157505e-05, + "loss": 0.5406, + "step": 17501 + }, + { + "epoch": 1.1858527000474286, + "grad_norm": 4.675073623657227, + "learning_rate": 7.706756109247725e-05, + "loss": 0.4895, + "step": 17502 + }, + { + "epoch": 1.1859204553154008, + "grad_norm": 4.49945592880249, + "learning_rate": 7.706619207337943e-05, + "loss": 0.5266, + "step": 17503 + }, + { + "epoch": 1.1859882105833728, + "grad_norm": 5.286568641662598, + "learning_rate": 7.706482305428161e-05, + "loss": 0.8502, + "step": 17504 + }, + { + "epoch": 1.186055965851345, + "grad_norm": 6.318417549133301, + "learning_rate": 7.706345403518379e-05, + "loss": 0.9895, + "step": 17505 + }, + { + "epoch": 1.186123721119317, + "grad_norm": 6.430423259735107, + "learning_rate": 7.706208501608598e-05, + "loss": 0.7683, + "step": 17506 + }, + { + "epoch": 1.1861914763872892, + "grad_norm": 5.740241050720215, + "learning_rate": 7.706071599698816e-05, + "loss": 0.9374, + "step": 17507 + }, + { + "epoch": 1.1862592316552611, + "grad_norm": 12.021029472351074, + "learning_rate": 7.705934697789034e-05, + "loss": 0.5365, + "step": 17508 + }, + { + "epoch": 1.1863269869232334, + "grad_norm": 7.397732257843018, + "learning_rate": 7.705797795879252e-05, + "loss": 0.5, + "step": 17509 + }, + { + "epoch": 1.1863947421912053, + "grad_norm": 6.771727561950684, + "learning_rate": 7.70566089396947e-05, + "loss": 0.545, + "step": 17510 + }, + { + "epoch": 1.1864624974591775, + "grad_norm": 6.742783069610596, + "learning_rate": 7.70552399205969e-05, + "loss": 0.9527, + "step": 17511 + }, + { + "epoch": 1.1865302527271495, + "grad_norm": 5.033138275146484, + "learning_rate": 7.705387090149908e-05, + "loss": 0.5438, + "step": 17512 + }, + { + "epoch": 1.1865980079951217, + "grad_norm": 7.680271148681641, + "learning_rate": 7.705250188240126e-05, + "loss": 0.543, + "step": 17513 + }, + { + "epoch": 1.1866657632630937, + "grad_norm": 6.076967716217041, + "learning_rate": 7.705113286330344e-05, + "loss": 0.8015, + "step": 17514 + }, + { + "epoch": 1.186733518531066, + "grad_norm": 5.922079086303711, + "learning_rate": 7.704976384420563e-05, + "loss": 0.6701, + "step": 17515 + }, + { + "epoch": 1.1868012737990379, + "grad_norm": 6.352739334106445, + "learning_rate": 7.704839482510781e-05, + "loss": 0.6318, + "step": 17516 + }, + { + "epoch": 1.1868690290670099, + "grad_norm": 7.644301414489746, + "learning_rate": 7.704702580601e-05, + "loss": 0.6471, + "step": 17517 + }, + { + "epoch": 1.186936784334982, + "grad_norm": 5.560800552368164, + "learning_rate": 7.704565678691217e-05, + "loss": 0.5093, + "step": 17518 + }, + { + "epoch": 1.187004539602954, + "grad_norm": 8.711373329162598, + "learning_rate": 7.704428776781435e-05, + "loss": 0.6588, + "step": 17519 + }, + { + "epoch": 1.1870722948709262, + "grad_norm": 5.718215465545654, + "learning_rate": 7.704291874871655e-05, + "loss": 0.8604, + "step": 17520 + }, + { + "epoch": 1.1871400501388982, + "grad_norm": 6.022330284118652, + "learning_rate": 7.704154972961873e-05, + "loss": 0.7205, + "step": 17521 + }, + { + "epoch": 1.1872078054068704, + "grad_norm": 7.659083366394043, + "learning_rate": 7.704018071052091e-05, + "loss": 0.4675, + "step": 17522 + }, + { + "epoch": 1.1872755606748424, + "grad_norm": 6.095610618591309, + "learning_rate": 7.703881169142309e-05, + "loss": 0.6772, + "step": 17523 + }, + { + "epoch": 1.1873433159428146, + "grad_norm": 5.625333786010742, + "learning_rate": 7.703744267232528e-05, + "loss": 0.618, + "step": 17524 + }, + { + "epoch": 1.1874110712107866, + "grad_norm": 7.851975440979004, + "learning_rate": 7.703607365322746e-05, + "loss": 0.763, + "step": 17525 + }, + { + "epoch": 1.1874788264787588, + "grad_norm": 9.398256301879883, + "learning_rate": 7.703470463412964e-05, + "loss": 0.6771, + "step": 17526 + }, + { + "epoch": 1.1875465817467308, + "grad_norm": 9.461082458496094, + "learning_rate": 7.703333561503184e-05, + "loss": 0.9216, + "step": 17527 + }, + { + "epoch": 1.187614337014703, + "grad_norm": 7.010152339935303, + "learning_rate": 7.703196659593402e-05, + "loss": 0.8298, + "step": 17528 + }, + { + "epoch": 1.187682092282675, + "grad_norm": 5.272936820983887, + "learning_rate": 7.70305975768362e-05, + "loss": 0.7349, + "step": 17529 + }, + { + "epoch": 1.1877498475506472, + "grad_norm": 6.14058780670166, + "learning_rate": 7.702922855773839e-05, + "loss": 0.6735, + "step": 17530 + }, + { + "epoch": 1.1878176028186191, + "grad_norm": 8.246026039123535, + "learning_rate": 7.702785953864057e-05, + "loss": 0.6286, + "step": 17531 + }, + { + "epoch": 1.1878853580865911, + "grad_norm": 8.610701560974121, + "learning_rate": 7.702649051954275e-05, + "loss": 0.5919, + "step": 17532 + }, + { + "epoch": 1.1879531133545633, + "grad_norm": 8.594643592834473, + "learning_rate": 7.702512150044493e-05, + "loss": 0.4835, + "step": 17533 + }, + { + "epoch": 1.1880208686225353, + "grad_norm": 4.968395709991455, + "learning_rate": 7.702375248134713e-05, + "loss": 0.5888, + "step": 17534 + }, + { + "epoch": 1.1880886238905075, + "grad_norm": 6.422661781311035, + "learning_rate": 7.702238346224931e-05, + "loss": 0.5625, + "step": 17535 + }, + { + "epoch": 1.1881563791584795, + "grad_norm": 9.77650260925293, + "learning_rate": 7.702101444315149e-05, + "loss": 0.7993, + "step": 17536 + }, + { + "epoch": 1.1882241344264517, + "grad_norm": 5.65856409072876, + "learning_rate": 7.701964542405367e-05, + "loss": 0.5667, + "step": 17537 + }, + { + "epoch": 1.1882918896944237, + "grad_norm": 6.888467311859131, + "learning_rate": 7.701827640495586e-05, + "loss": 0.9473, + "step": 17538 + }, + { + "epoch": 1.1883596449623959, + "grad_norm": 5.9407548904418945, + "learning_rate": 7.701690738585804e-05, + "loss": 0.888, + "step": 17539 + }, + { + "epoch": 1.1884274002303679, + "grad_norm": 4.671491622924805, + "learning_rate": 7.701553836676022e-05, + "loss": 0.5841, + "step": 17540 + }, + { + "epoch": 1.18849515549834, + "grad_norm": 6.472855567932129, + "learning_rate": 7.70141693476624e-05, + "loss": 0.5864, + "step": 17541 + }, + { + "epoch": 1.188562910766312, + "grad_norm": 6.266822338104248, + "learning_rate": 7.701280032856458e-05, + "loss": 0.6501, + "step": 17542 + }, + { + "epoch": 1.1886306660342842, + "grad_norm": 5.629112243652344, + "learning_rate": 7.701143130946678e-05, + "loss": 0.5439, + "step": 17543 + }, + { + "epoch": 1.1886984213022562, + "grad_norm": 4.828508377075195, + "learning_rate": 7.701006229036896e-05, + "loss": 0.5442, + "step": 17544 + }, + { + "epoch": 1.1887661765702284, + "grad_norm": 5.5869059562683105, + "learning_rate": 7.700869327127114e-05, + "loss": 0.8221, + "step": 17545 + }, + { + "epoch": 1.1888339318382004, + "grad_norm": 6.538342475891113, + "learning_rate": 7.700732425217332e-05, + "loss": 0.6933, + "step": 17546 + }, + { + "epoch": 1.1889016871061724, + "grad_norm": 6.4926981925964355, + "learning_rate": 7.70059552330755e-05, + "loss": 0.8851, + "step": 17547 + }, + { + "epoch": 1.1889694423741446, + "grad_norm": 6.433711528778076, + "learning_rate": 7.700458621397769e-05, + "loss": 0.7035, + "step": 17548 + }, + { + "epoch": 1.1890371976421168, + "grad_norm": 9.490755081176758, + "learning_rate": 7.700321719487987e-05, + "loss": 0.6435, + "step": 17549 + }, + { + "epoch": 1.1891049529100888, + "grad_norm": 5.095461845397949, + "learning_rate": 7.700184817578205e-05, + "loss": 0.4888, + "step": 17550 + }, + { + "epoch": 1.1891727081780608, + "grad_norm": 6.992951393127441, + "learning_rate": 7.700047915668423e-05, + "loss": 0.6976, + "step": 17551 + }, + { + "epoch": 1.189240463446033, + "grad_norm": 5.5607709884643555, + "learning_rate": 7.699911013758643e-05, + "loss": 0.5983, + "step": 17552 + }, + { + "epoch": 1.189308218714005, + "grad_norm": 5.8281402587890625, + "learning_rate": 7.699774111848861e-05, + "loss": 0.9768, + "step": 17553 + }, + { + "epoch": 1.1893759739819771, + "grad_norm": 6.718166351318359, + "learning_rate": 7.699637209939079e-05, + "loss": 0.7467, + "step": 17554 + }, + { + "epoch": 1.1894437292499491, + "grad_norm": 5.299029350280762, + "learning_rate": 7.699500308029297e-05, + "loss": 0.4338, + "step": 17555 + }, + { + "epoch": 1.1895114845179213, + "grad_norm": 7.574790954589844, + "learning_rate": 7.699363406119515e-05, + "loss": 0.5508, + "step": 17556 + }, + { + "epoch": 1.1895792397858933, + "grad_norm": 5.672619819641113, + "learning_rate": 7.699226504209734e-05, + "loss": 0.8091, + "step": 17557 + }, + { + "epoch": 1.1896469950538655, + "grad_norm": 4.973649978637695, + "learning_rate": 7.699089602299952e-05, + "loss": 0.5796, + "step": 17558 + }, + { + "epoch": 1.1897147503218375, + "grad_norm": 6.112509250640869, + "learning_rate": 7.69895270039017e-05, + "loss": 0.7684, + "step": 17559 + }, + { + "epoch": 1.1897825055898097, + "grad_norm": 5.4346723556518555, + "learning_rate": 7.698815798480388e-05, + "loss": 0.5285, + "step": 17560 + }, + { + "epoch": 1.1898502608577817, + "grad_norm": 5.107007026672363, + "learning_rate": 7.698678896570608e-05, + "loss": 0.6484, + "step": 17561 + }, + { + "epoch": 1.1899180161257537, + "grad_norm": 4.733124732971191, + "learning_rate": 7.698541994660826e-05, + "loss": 0.658, + "step": 17562 + }, + { + "epoch": 1.1899857713937259, + "grad_norm": 3.8034684658050537, + "learning_rate": 7.698405092751044e-05, + "loss": 0.507, + "step": 17563 + }, + { + "epoch": 1.190053526661698, + "grad_norm": 5.694223880767822, + "learning_rate": 7.698268190841262e-05, + "loss": 0.622, + "step": 17564 + }, + { + "epoch": 1.19012128192967, + "grad_norm": 5.963348865509033, + "learning_rate": 7.69813128893148e-05, + "loss": 0.5774, + "step": 17565 + }, + { + "epoch": 1.190189037197642, + "grad_norm": 6.509255886077881, + "learning_rate": 7.6979943870217e-05, + "loss": 0.6859, + "step": 17566 + }, + { + "epoch": 1.1902567924656142, + "grad_norm": 5.054556369781494, + "learning_rate": 7.697857485111917e-05, + "loss": 0.7494, + "step": 17567 + }, + { + "epoch": 1.1903245477335862, + "grad_norm": 6.527555465698242, + "learning_rate": 7.697720583202135e-05, + "loss": 1.0898, + "step": 17568 + }, + { + "epoch": 1.1903923030015584, + "grad_norm": 4.6289896965026855, + "learning_rate": 7.697583681292353e-05, + "loss": 0.5725, + "step": 17569 + }, + { + "epoch": 1.1904600582695304, + "grad_norm": 6.46843147277832, + "learning_rate": 7.697446779382573e-05, + "loss": 0.7417, + "step": 17570 + }, + { + "epoch": 1.1905278135375026, + "grad_norm": 6.553291320800781, + "learning_rate": 7.697309877472791e-05, + "loss": 0.7798, + "step": 17571 + }, + { + "epoch": 1.1905955688054746, + "grad_norm": 4.56556510925293, + "learning_rate": 7.697172975563009e-05, + "loss": 0.5706, + "step": 17572 + }, + { + "epoch": 1.1906633240734468, + "grad_norm": 6.320700645446777, + "learning_rate": 7.697036073653228e-05, + "loss": 0.6367, + "step": 17573 + }, + { + "epoch": 1.1907310793414188, + "grad_norm": 7.7214226722717285, + "learning_rate": 7.696899171743446e-05, + "loss": 0.8209, + "step": 17574 + }, + { + "epoch": 1.190798834609391, + "grad_norm": 5.526475429534912, + "learning_rate": 7.696762269833664e-05, + "loss": 0.7385, + "step": 17575 + }, + { + "epoch": 1.190866589877363, + "grad_norm": 5.44613790512085, + "learning_rate": 7.696625367923884e-05, + "loss": 0.608, + "step": 17576 + }, + { + "epoch": 1.1909343451453351, + "grad_norm": 5.303778648376465, + "learning_rate": 7.696488466014102e-05, + "loss": 0.7954, + "step": 17577 + }, + { + "epoch": 1.1910021004133071, + "grad_norm": 5.654077529907227, + "learning_rate": 7.69635156410432e-05, + "loss": 0.8997, + "step": 17578 + }, + { + "epoch": 1.1910698556812793, + "grad_norm": 5.673882961273193, + "learning_rate": 7.696214662194538e-05, + "loss": 0.6305, + "step": 17579 + }, + { + "epoch": 1.1911376109492513, + "grad_norm": 6.7035675048828125, + "learning_rate": 7.696077760284757e-05, + "loss": 0.6158, + "step": 17580 + }, + { + "epoch": 1.1912053662172233, + "grad_norm": 5.399929046630859, + "learning_rate": 7.695940858374975e-05, + "loss": 0.6918, + "step": 17581 + }, + { + "epoch": 1.1912731214851955, + "grad_norm": 6.211287975311279, + "learning_rate": 7.695803956465193e-05, + "loss": 0.7219, + "step": 17582 + }, + { + "epoch": 1.1913408767531675, + "grad_norm": 6.3117876052856445, + "learning_rate": 7.695667054555411e-05, + "loss": 0.7323, + "step": 17583 + }, + { + "epoch": 1.1914086320211397, + "grad_norm": 5.704460144042969, + "learning_rate": 7.695530152645631e-05, + "loss": 0.7325, + "step": 17584 + }, + { + "epoch": 1.1914763872891116, + "grad_norm": 6.604908466339111, + "learning_rate": 7.695393250735849e-05, + "loss": 0.6055, + "step": 17585 + }, + { + "epoch": 1.1915441425570839, + "grad_norm": 5.135843276977539, + "learning_rate": 7.695256348826067e-05, + "loss": 0.7543, + "step": 17586 + }, + { + "epoch": 1.1916118978250558, + "grad_norm": 6.044817924499512, + "learning_rate": 7.695119446916285e-05, + "loss": 0.6591, + "step": 17587 + }, + { + "epoch": 1.191679653093028, + "grad_norm": 10.055615425109863, + "learning_rate": 7.694982545006503e-05, + "loss": 0.6775, + "step": 17588 + }, + { + "epoch": 1.191747408361, + "grad_norm": 5.508429050445557, + "learning_rate": 7.694845643096722e-05, + "loss": 0.8237, + "step": 17589 + }, + { + "epoch": 1.1918151636289722, + "grad_norm": 5.428280353546143, + "learning_rate": 7.69470874118694e-05, + "loss": 0.5512, + "step": 17590 + }, + { + "epoch": 1.1918829188969442, + "grad_norm": 6.358389377593994, + "learning_rate": 7.694571839277158e-05, + "loss": 0.9419, + "step": 17591 + }, + { + "epoch": 1.1919506741649164, + "grad_norm": 4.868531227111816, + "learning_rate": 7.694434937367376e-05, + "loss": 0.6449, + "step": 17592 + }, + { + "epoch": 1.1920184294328884, + "grad_norm": 4.3719000816345215, + "learning_rate": 7.694298035457596e-05, + "loss": 0.6134, + "step": 17593 + }, + { + "epoch": 1.1920861847008606, + "grad_norm": 12.582817077636719, + "learning_rate": 7.694161133547814e-05, + "loss": 0.5755, + "step": 17594 + }, + { + "epoch": 1.1921539399688326, + "grad_norm": 5.3732781410217285, + "learning_rate": 7.694024231638032e-05, + "loss": 0.732, + "step": 17595 + }, + { + "epoch": 1.1922216952368045, + "grad_norm": 5.49517822265625, + "learning_rate": 7.69388732972825e-05, + "loss": 0.5249, + "step": 17596 + }, + { + "epoch": 1.1922894505047767, + "grad_norm": 4.9107584953308105, + "learning_rate": 7.693750427818468e-05, + "loss": 0.5645, + "step": 17597 + }, + { + "epoch": 1.192357205772749, + "grad_norm": 6.208014965057373, + "learning_rate": 7.693613525908687e-05, + "loss": 0.7483, + "step": 17598 + }, + { + "epoch": 1.192424961040721, + "grad_norm": 5.240930080413818, + "learning_rate": 7.693476623998905e-05, + "loss": 0.5941, + "step": 17599 + }, + { + "epoch": 1.192492716308693, + "grad_norm": 5.509008884429932, + "learning_rate": 7.693339722089123e-05, + "loss": 0.6262, + "step": 17600 + }, + { + "epoch": 1.1925604715766651, + "grad_norm": 4.510783672332764, + "learning_rate": 7.693202820179341e-05, + "loss": 0.6087, + "step": 17601 + }, + { + "epoch": 1.192628226844637, + "grad_norm": 7.165205478668213, + "learning_rate": 7.69306591826956e-05, + "loss": 0.7364, + "step": 17602 + }, + { + "epoch": 1.1926959821126093, + "grad_norm": 5.687561511993408, + "learning_rate": 7.692929016359779e-05, + "loss": 0.8353, + "step": 17603 + }, + { + "epoch": 1.1927637373805813, + "grad_norm": 7.553353786468506, + "learning_rate": 7.692792114449997e-05, + "loss": 1.0092, + "step": 17604 + }, + { + "epoch": 1.1928314926485535, + "grad_norm": 5.063547611236572, + "learning_rate": 7.692655212540215e-05, + "loss": 0.7275, + "step": 17605 + }, + { + "epoch": 1.1928992479165255, + "grad_norm": 5.614800930023193, + "learning_rate": 7.692518310630433e-05, + "loss": 0.628, + "step": 17606 + }, + { + "epoch": 1.1929670031844977, + "grad_norm": 6.790213584899902, + "learning_rate": 7.692381408720652e-05, + "loss": 0.7567, + "step": 17607 + }, + { + "epoch": 1.1930347584524696, + "grad_norm": 6.226934432983398, + "learning_rate": 7.69224450681087e-05, + "loss": 0.7992, + "step": 17608 + }, + { + "epoch": 1.1931025137204418, + "grad_norm": 5.4407525062561035, + "learning_rate": 7.692107604901088e-05, + "loss": 0.59, + "step": 17609 + }, + { + "epoch": 1.1931702689884138, + "grad_norm": 4.665156841278076, + "learning_rate": 7.691970702991306e-05, + "loss": 0.5459, + "step": 17610 + }, + { + "epoch": 1.1932380242563858, + "grad_norm": 6.73590612411499, + "learning_rate": 7.691833801081524e-05, + "loss": 0.6124, + "step": 17611 + }, + { + "epoch": 1.193305779524358, + "grad_norm": 5.0771074295043945, + "learning_rate": 7.691696899171744e-05, + "loss": 0.579, + "step": 17612 + }, + { + "epoch": 1.1933735347923302, + "grad_norm": 9.059715270996094, + "learning_rate": 7.691559997261962e-05, + "loss": 0.8435, + "step": 17613 + }, + { + "epoch": 1.1934412900603022, + "grad_norm": 6.360510349273682, + "learning_rate": 7.69142309535218e-05, + "loss": 0.7151, + "step": 17614 + }, + { + "epoch": 1.1935090453282742, + "grad_norm": 4.936126232147217, + "learning_rate": 7.691286193442398e-05, + "loss": 0.6126, + "step": 17615 + }, + { + "epoch": 1.1935768005962464, + "grad_norm": 5.501605987548828, + "learning_rate": 7.691149291532617e-05, + "loss": 0.6608, + "step": 17616 + }, + { + "epoch": 1.1936445558642184, + "grad_norm": 5.651332855224609, + "learning_rate": 7.691012389622835e-05, + "loss": 0.5896, + "step": 17617 + }, + { + "epoch": 1.1937123111321906, + "grad_norm": 5.945789813995361, + "learning_rate": 7.690875487713053e-05, + "loss": 0.9682, + "step": 17618 + }, + { + "epoch": 1.1937800664001625, + "grad_norm": 6.9971604347229, + "learning_rate": 7.690738585803273e-05, + "loss": 0.6775, + "step": 17619 + }, + { + "epoch": 1.1938478216681347, + "grad_norm": 5.1300506591796875, + "learning_rate": 7.690601683893491e-05, + "loss": 0.6251, + "step": 17620 + }, + { + "epoch": 1.1939155769361067, + "grad_norm": 6.18295431137085, + "learning_rate": 7.690464781983709e-05, + "loss": 0.7284, + "step": 17621 + }, + { + "epoch": 1.193983332204079, + "grad_norm": 5.232485294342041, + "learning_rate": 7.690327880073928e-05, + "loss": 0.6627, + "step": 17622 + }, + { + "epoch": 1.194051087472051, + "grad_norm": 4.987381458282471, + "learning_rate": 7.690190978164146e-05, + "loss": 0.5891, + "step": 17623 + }, + { + "epoch": 1.1941188427400231, + "grad_norm": 5.091625213623047, + "learning_rate": 7.690054076254364e-05, + "loss": 0.6001, + "step": 17624 + }, + { + "epoch": 1.194186598007995, + "grad_norm": 5.142812252044678, + "learning_rate": 7.689917174344582e-05, + "loss": 0.4461, + "step": 17625 + }, + { + "epoch": 1.1942543532759673, + "grad_norm": 5.461329460144043, + "learning_rate": 7.689780272434802e-05, + "loss": 0.7192, + "step": 17626 + }, + { + "epoch": 1.1943221085439393, + "grad_norm": 5.987907886505127, + "learning_rate": 7.68964337052502e-05, + "loss": 0.7833, + "step": 17627 + }, + { + "epoch": 1.1943898638119115, + "grad_norm": 6.716150760650635, + "learning_rate": 7.689506468615238e-05, + "loss": 0.6963, + "step": 17628 + }, + { + "epoch": 1.1944576190798835, + "grad_norm": 7.355491638183594, + "learning_rate": 7.689369566705456e-05, + "loss": 0.483, + "step": 17629 + }, + { + "epoch": 1.1945253743478554, + "grad_norm": 6.228071689605713, + "learning_rate": 7.689232664795675e-05, + "loss": 0.677, + "step": 17630 + }, + { + "epoch": 1.1945931296158276, + "grad_norm": 6.996883869171143, + "learning_rate": 7.689095762885893e-05, + "loss": 0.8552, + "step": 17631 + }, + { + "epoch": 1.1946608848837996, + "grad_norm": 4.602889537811279, + "learning_rate": 7.688958860976111e-05, + "loss": 0.5121, + "step": 17632 + }, + { + "epoch": 1.1947286401517718, + "grad_norm": 5.40228271484375, + "learning_rate": 7.688821959066329e-05, + "loss": 0.5775, + "step": 17633 + }, + { + "epoch": 1.1947963954197438, + "grad_norm": 5.13663387298584, + "learning_rate": 7.688685057156547e-05, + "loss": 0.7632, + "step": 17634 + }, + { + "epoch": 1.194864150687716, + "grad_norm": 9.4126558303833, + "learning_rate": 7.688548155246767e-05, + "loss": 0.8483, + "step": 17635 + }, + { + "epoch": 1.194931905955688, + "grad_norm": 5.480384349822998, + "learning_rate": 7.688411253336985e-05, + "loss": 0.6682, + "step": 17636 + }, + { + "epoch": 1.1949996612236602, + "grad_norm": 5.476341247558594, + "learning_rate": 7.688274351427203e-05, + "loss": 0.6641, + "step": 17637 + }, + { + "epoch": 1.1950674164916322, + "grad_norm": 4.595249652862549, + "learning_rate": 7.688137449517421e-05, + "loss": 0.5387, + "step": 17638 + }, + { + "epoch": 1.1951351717596044, + "grad_norm": 5.596693515777588, + "learning_rate": 7.68800054760764e-05, + "loss": 0.7083, + "step": 17639 + }, + { + "epoch": 1.1952029270275764, + "grad_norm": 6.919038772583008, + "learning_rate": 7.687863645697858e-05, + "loss": 0.6535, + "step": 17640 + }, + { + "epoch": 1.1952706822955486, + "grad_norm": 5.938333988189697, + "learning_rate": 7.687726743788076e-05, + "loss": 0.8527, + "step": 17641 + }, + { + "epoch": 1.1953384375635205, + "grad_norm": 5.0389299392700195, + "learning_rate": 7.687589841878294e-05, + "loss": 0.7157, + "step": 17642 + }, + { + "epoch": 1.1954061928314927, + "grad_norm": 8.325102806091309, + "learning_rate": 7.687452939968512e-05, + "loss": 0.7156, + "step": 17643 + }, + { + "epoch": 1.1954739480994647, + "grad_norm": 6.028964519500732, + "learning_rate": 7.687316038058732e-05, + "loss": 0.7134, + "step": 17644 + }, + { + "epoch": 1.1955417033674367, + "grad_norm": 7.535099983215332, + "learning_rate": 7.68717913614895e-05, + "loss": 0.7695, + "step": 17645 + }, + { + "epoch": 1.195609458635409, + "grad_norm": 5.430510520935059, + "learning_rate": 7.687042234239168e-05, + "loss": 0.8219, + "step": 17646 + }, + { + "epoch": 1.195677213903381, + "grad_norm": 5.3170013427734375, + "learning_rate": 7.686905332329386e-05, + "loss": 0.9118, + "step": 17647 + }, + { + "epoch": 1.195744969171353, + "grad_norm": 8.100552558898926, + "learning_rate": 7.686768430419605e-05, + "loss": 0.8288, + "step": 17648 + }, + { + "epoch": 1.195812724439325, + "grad_norm": 6.240118980407715, + "learning_rate": 7.686631528509823e-05, + "loss": 0.8284, + "step": 17649 + }, + { + "epoch": 1.1958804797072973, + "grad_norm": 6.110140323638916, + "learning_rate": 7.686494626600041e-05, + "loss": 0.7722, + "step": 17650 + }, + { + "epoch": 1.1959482349752693, + "grad_norm": 6.435049057006836, + "learning_rate": 7.68635772469026e-05, + "loss": 0.6591, + "step": 17651 + }, + { + "epoch": 1.1960159902432415, + "grad_norm": 5.954352855682373, + "learning_rate": 7.686220822780477e-05, + "loss": 0.6074, + "step": 17652 + }, + { + "epoch": 1.1960837455112134, + "grad_norm": 5.165590763092041, + "learning_rate": 7.686083920870697e-05, + "loss": 0.7499, + "step": 17653 + }, + { + "epoch": 1.1961515007791856, + "grad_norm": 7.155406475067139, + "learning_rate": 7.685947018960915e-05, + "loss": 0.8336, + "step": 17654 + }, + { + "epoch": 1.1962192560471576, + "grad_norm": 5.120672702789307, + "learning_rate": 7.685810117051133e-05, + "loss": 0.6934, + "step": 17655 + }, + { + "epoch": 1.1962870113151298, + "grad_norm": 6.173689365386963, + "learning_rate": 7.685673215141351e-05, + "loss": 0.4323, + "step": 17656 + }, + { + "epoch": 1.1963547665831018, + "grad_norm": 4.619163513183594, + "learning_rate": 7.685536313231569e-05, + "loss": 0.4938, + "step": 17657 + }, + { + "epoch": 1.196422521851074, + "grad_norm": 8.75632381439209, + "learning_rate": 7.685399411321788e-05, + "loss": 1.0098, + "step": 17658 + }, + { + "epoch": 1.196490277119046, + "grad_norm": 6.584256172180176, + "learning_rate": 7.685262509412006e-05, + "loss": 0.5902, + "step": 17659 + }, + { + "epoch": 1.196558032387018, + "grad_norm": 6.670119762420654, + "learning_rate": 7.685125607502224e-05, + "loss": 0.8715, + "step": 17660 + }, + { + "epoch": 1.1966257876549902, + "grad_norm": 8.594388008117676, + "learning_rate": 7.684988705592442e-05, + "loss": 0.7048, + "step": 17661 + }, + { + "epoch": 1.1966935429229624, + "grad_norm": 7.816542148590088, + "learning_rate": 7.684851803682662e-05, + "loss": 0.9348, + "step": 17662 + }, + { + "epoch": 1.1967612981909344, + "grad_norm": 4.398709774017334, + "learning_rate": 7.68471490177288e-05, + "loss": 0.5985, + "step": 17663 + }, + { + "epoch": 1.1968290534589063, + "grad_norm": 7.980318069458008, + "learning_rate": 7.684577999863098e-05, + "loss": 0.8392, + "step": 17664 + }, + { + "epoch": 1.1968968087268785, + "grad_norm": 7.779125213623047, + "learning_rate": 7.684441097953316e-05, + "loss": 0.6489, + "step": 17665 + }, + { + "epoch": 1.1969645639948505, + "grad_norm": 6.15852165222168, + "learning_rate": 7.684304196043535e-05, + "loss": 0.8325, + "step": 17666 + }, + { + "epoch": 1.1970323192628227, + "grad_norm": 5.582149982452393, + "learning_rate": 7.684167294133753e-05, + "loss": 0.6112, + "step": 17667 + }, + { + "epoch": 1.1971000745307947, + "grad_norm": 6.231848239898682, + "learning_rate": 7.684030392223973e-05, + "loss": 0.704, + "step": 17668 + }, + { + "epoch": 1.197167829798767, + "grad_norm": 8.48556900024414, + "learning_rate": 7.683893490314191e-05, + "loss": 0.8127, + "step": 17669 + }, + { + "epoch": 1.1972355850667389, + "grad_norm": 5.312507152557373, + "learning_rate": 7.683756588404409e-05, + "loss": 0.5856, + "step": 17670 + }, + { + "epoch": 1.197303340334711, + "grad_norm": 4.0072407722473145, + "learning_rate": 7.683619686494628e-05, + "loss": 0.6048, + "step": 17671 + }, + { + "epoch": 1.197371095602683, + "grad_norm": 4.753046989440918, + "learning_rate": 7.683482784584846e-05, + "loss": 0.7208, + "step": 17672 + }, + { + "epoch": 1.1974388508706553, + "grad_norm": 4.263588905334473, + "learning_rate": 7.683345882675064e-05, + "loss": 0.6492, + "step": 17673 + }, + { + "epoch": 1.1975066061386272, + "grad_norm": 5.775005340576172, + "learning_rate": 7.683208980765282e-05, + "loss": 0.8061, + "step": 17674 + }, + { + "epoch": 1.1975743614065995, + "grad_norm": 6.587451457977295, + "learning_rate": 7.6830720788555e-05, + "loss": 0.7187, + "step": 17675 + }, + { + "epoch": 1.1976421166745714, + "grad_norm": 7.55980110168457, + "learning_rate": 7.68293517694572e-05, + "loss": 0.7459, + "step": 17676 + }, + { + "epoch": 1.1977098719425436, + "grad_norm": 5.851051330566406, + "learning_rate": 7.682798275035938e-05, + "loss": 0.7314, + "step": 17677 + }, + { + "epoch": 1.1977776272105156, + "grad_norm": 6.21124792098999, + "learning_rate": 7.682661373126156e-05, + "loss": 0.6496, + "step": 17678 + }, + { + "epoch": 1.1978453824784876, + "grad_norm": 6.034067630767822, + "learning_rate": 7.682524471216374e-05, + "loss": 0.7823, + "step": 17679 + }, + { + "epoch": 1.1979131377464598, + "grad_norm": 5.624752521514893, + "learning_rate": 7.682387569306592e-05, + "loss": 0.9108, + "step": 17680 + }, + { + "epoch": 1.1979808930144318, + "grad_norm": 7.310678005218506, + "learning_rate": 7.682250667396811e-05, + "loss": 0.7976, + "step": 17681 + }, + { + "epoch": 1.198048648282404, + "grad_norm": 5.958901405334473, + "learning_rate": 7.682113765487029e-05, + "loss": 0.6814, + "step": 17682 + }, + { + "epoch": 1.198116403550376, + "grad_norm": 5.8540120124816895, + "learning_rate": 7.681976863577247e-05, + "loss": 0.9074, + "step": 17683 + }, + { + "epoch": 1.1981841588183482, + "grad_norm": 11.885223388671875, + "learning_rate": 7.681839961667465e-05, + "loss": 0.6314, + "step": 17684 + }, + { + "epoch": 1.1982519140863201, + "grad_norm": 5.285735607147217, + "learning_rate": 7.681703059757685e-05, + "loss": 0.7848, + "step": 17685 + }, + { + "epoch": 1.1983196693542923, + "grad_norm": 6.814820289611816, + "learning_rate": 7.681566157847903e-05, + "loss": 0.7201, + "step": 17686 + }, + { + "epoch": 1.1983874246222643, + "grad_norm": 7.1838788986206055, + "learning_rate": 7.681429255938121e-05, + "loss": 0.8404, + "step": 17687 + }, + { + "epoch": 1.1984551798902365, + "grad_norm": 5.949917793273926, + "learning_rate": 7.681292354028339e-05, + "loss": 0.7767, + "step": 17688 + }, + { + "epoch": 1.1985229351582085, + "grad_norm": 6.235685348510742, + "learning_rate": 7.681155452118557e-05, + "loss": 0.717, + "step": 17689 + }, + { + "epoch": 1.1985906904261807, + "grad_norm": 5.224851131439209, + "learning_rate": 7.681018550208776e-05, + "loss": 0.7371, + "step": 17690 + }, + { + "epoch": 1.1986584456941527, + "grad_norm": 5.613080978393555, + "learning_rate": 7.680881648298994e-05, + "loss": 0.5199, + "step": 17691 + }, + { + "epoch": 1.198726200962125, + "grad_norm": 8.915862083435059, + "learning_rate": 7.680744746389212e-05, + "loss": 0.717, + "step": 17692 + }, + { + "epoch": 1.1987939562300969, + "grad_norm": 5.137214660644531, + "learning_rate": 7.68060784447943e-05, + "loss": 0.6165, + "step": 17693 + }, + { + "epoch": 1.1988617114980689, + "grad_norm": 4.481411933898926, + "learning_rate": 7.68047094256965e-05, + "loss": 0.7008, + "step": 17694 + }, + { + "epoch": 1.198929466766041, + "grad_norm": 4.9299798011779785, + "learning_rate": 7.680334040659868e-05, + "loss": 0.6167, + "step": 17695 + }, + { + "epoch": 1.1989972220340133, + "grad_norm": 6.480248928070068, + "learning_rate": 7.680197138750086e-05, + "loss": 0.6755, + "step": 17696 + }, + { + "epoch": 1.1990649773019852, + "grad_norm": 7.132851600646973, + "learning_rate": 7.680060236840304e-05, + "loss": 0.8065, + "step": 17697 + }, + { + "epoch": 1.1991327325699572, + "grad_norm": 6.641706943511963, + "learning_rate": 7.679923334930522e-05, + "loss": 0.7251, + "step": 17698 + }, + { + "epoch": 1.1992004878379294, + "grad_norm": 10.569890022277832, + "learning_rate": 7.679786433020741e-05, + "loss": 0.6988, + "step": 17699 + }, + { + "epoch": 1.1992682431059014, + "grad_norm": 5.934963226318359, + "learning_rate": 7.679649531110959e-05, + "loss": 0.8192, + "step": 17700 + }, + { + "epoch": 1.1993359983738736, + "grad_norm": 7.19936466217041, + "learning_rate": 7.679512629201177e-05, + "loss": 0.6698, + "step": 17701 + }, + { + "epoch": 1.1994037536418456, + "grad_norm": 4.598423957824707, + "learning_rate": 7.679375727291395e-05, + "loss": 0.6782, + "step": 17702 + }, + { + "epoch": 1.1994715089098178, + "grad_norm": 6.056293487548828, + "learning_rate": 7.679238825381615e-05, + "loss": 1.0353, + "step": 17703 + }, + { + "epoch": 1.1995392641777898, + "grad_norm": 4.992430210113525, + "learning_rate": 7.679101923471833e-05, + "loss": 0.51, + "step": 17704 + }, + { + "epoch": 1.199607019445762, + "grad_norm": 6.1313886642456055, + "learning_rate": 7.678965021562051e-05, + "loss": 0.6802, + "step": 17705 + }, + { + "epoch": 1.199674774713734, + "grad_norm": 8.208282470703125, + "learning_rate": 7.678828119652269e-05, + "loss": 0.7752, + "step": 17706 + }, + { + "epoch": 1.1997425299817062, + "grad_norm": 8.525566101074219, + "learning_rate": 7.678691217742487e-05, + "loss": 0.7824, + "step": 17707 + }, + { + "epoch": 1.1998102852496781, + "grad_norm": 9.573042869567871, + "learning_rate": 7.678554315832706e-05, + "loss": 0.8599, + "step": 17708 + }, + { + "epoch": 1.1998780405176501, + "grad_norm": 6.6548871994018555, + "learning_rate": 7.678417413922924e-05, + "loss": 0.5811, + "step": 17709 + }, + { + "epoch": 1.1999457957856223, + "grad_norm": 7.299272537231445, + "learning_rate": 7.678280512013142e-05, + "loss": 0.7162, + "step": 17710 + }, + { + "epoch": 1.2000135510535945, + "grad_norm": 6.244133949279785, + "learning_rate": 7.67814361010336e-05, + "loss": 0.8619, + "step": 17711 + }, + { + "epoch": 1.2000813063215665, + "grad_norm": 5.655728816986084, + "learning_rate": 7.67800670819358e-05, + "loss": 0.5859, + "step": 17712 + }, + { + "epoch": 1.2001490615895385, + "grad_norm": 6.566688060760498, + "learning_rate": 7.677869806283798e-05, + "loss": 0.7493, + "step": 17713 + }, + { + "epoch": 1.2002168168575107, + "grad_norm": 7.644930839538574, + "learning_rate": 7.677732904374016e-05, + "loss": 0.7606, + "step": 17714 + }, + { + "epoch": 1.2002845721254827, + "grad_norm": 6.584826946258545, + "learning_rate": 7.677596002464235e-05, + "loss": 0.816, + "step": 17715 + }, + { + "epoch": 1.2003523273934549, + "grad_norm": 10.395920753479004, + "learning_rate": 7.677459100554453e-05, + "loss": 0.9352, + "step": 17716 + }, + { + "epoch": 1.2004200826614269, + "grad_norm": 6.787402629852295, + "learning_rate": 7.677322198644671e-05, + "loss": 0.7819, + "step": 17717 + }, + { + "epoch": 1.200487837929399, + "grad_norm": 5.836369037628174, + "learning_rate": 7.67718529673489e-05, + "loss": 0.7411, + "step": 17718 + }, + { + "epoch": 1.200555593197371, + "grad_norm": 5.070427417755127, + "learning_rate": 7.677048394825109e-05, + "loss": 0.6364, + "step": 17719 + }, + { + "epoch": 1.2006233484653432, + "grad_norm": 5.135974407196045, + "learning_rate": 7.676911492915327e-05, + "loss": 0.6565, + "step": 17720 + }, + { + "epoch": 1.2006911037333152, + "grad_norm": 3.663120746612549, + "learning_rate": 7.676774591005545e-05, + "loss": 0.4482, + "step": 17721 + }, + { + "epoch": 1.2007588590012874, + "grad_norm": 4.547859191894531, + "learning_rate": 7.676637689095764e-05, + "loss": 0.5517, + "step": 17722 + }, + { + "epoch": 1.2008266142692594, + "grad_norm": 5.410218238830566, + "learning_rate": 7.676500787185982e-05, + "loss": 0.6991, + "step": 17723 + }, + { + "epoch": 1.2008943695372316, + "grad_norm": 5.378064155578613, + "learning_rate": 7.6763638852762e-05, + "loss": 0.6361, + "step": 17724 + }, + { + "epoch": 1.2009621248052036, + "grad_norm": 4.933994770050049, + "learning_rate": 7.676226983366418e-05, + "loss": 0.587, + "step": 17725 + }, + { + "epoch": 1.2010298800731758, + "grad_norm": 5.668514728546143, + "learning_rate": 7.676090081456638e-05, + "loss": 0.5187, + "step": 17726 + }, + { + "epoch": 1.2010976353411478, + "grad_norm": 6.826091289520264, + "learning_rate": 7.675953179546856e-05, + "loss": 0.656, + "step": 17727 + }, + { + "epoch": 1.2011653906091198, + "grad_norm": 7.071187496185303, + "learning_rate": 7.675816277637074e-05, + "loss": 0.5693, + "step": 17728 + }, + { + "epoch": 1.201233145877092, + "grad_norm": 7.474967002868652, + "learning_rate": 7.675679375727292e-05, + "loss": 0.4735, + "step": 17729 + }, + { + "epoch": 1.201300901145064, + "grad_norm": 6.620955467224121, + "learning_rate": 7.67554247381751e-05, + "loss": 0.8567, + "step": 17730 + }, + { + "epoch": 1.2013686564130361, + "grad_norm": 5.957265853881836, + "learning_rate": 7.675405571907729e-05, + "loss": 0.5876, + "step": 17731 + }, + { + "epoch": 1.2014364116810081, + "grad_norm": 6.6812028884887695, + "learning_rate": 7.675268669997947e-05, + "loss": 0.7244, + "step": 17732 + }, + { + "epoch": 1.2015041669489803, + "grad_norm": 6.950610160827637, + "learning_rate": 7.675131768088165e-05, + "loss": 0.7549, + "step": 17733 + }, + { + "epoch": 1.2015719222169523, + "grad_norm": 6.661945343017578, + "learning_rate": 7.674994866178383e-05, + "loss": 0.8251, + "step": 17734 + }, + { + "epoch": 1.2016396774849245, + "grad_norm": 6.50548791885376, + "learning_rate": 7.674857964268601e-05, + "loss": 0.7072, + "step": 17735 + }, + { + "epoch": 1.2017074327528965, + "grad_norm": 10.191153526306152, + "learning_rate": 7.674721062358821e-05, + "loss": 0.5644, + "step": 17736 + }, + { + "epoch": 1.2017751880208687, + "grad_norm": 5.519306659698486, + "learning_rate": 7.674584160449039e-05, + "loss": 0.649, + "step": 17737 + }, + { + "epoch": 1.2018429432888407, + "grad_norm": 8.362431526184082, + "learning_rate": 7.674447258539257e-05, + "loss": 0.8843, + "step": 17738 + }, + { + "epoch": 1.2019106985568129, + "grad_norm": 6.758439540863037, + "learning_rate": 7.674310356629475e-05, + "loss": 0.6952, + "step": 17739 + }, + { + "epoch": 1.2019784538247849, + "grad_norm": 5.742153167724609, + "learning_rate": 7.674173454719694e-05, + "loss": 0.7082, + "step": 17740 + }, + { + "epoch": 1.202046209092757, + "grad_norm": 6.582305431365967, + "learning_rate": 7.674036552809912e-05, + "loss": 0.734, + "step": 17741 + }, + { + "epoch": 1.202113964360729, + "grad_norm": 5.384820938110352, + "learning_rate": 7.67389965090013e-05, + "loss": 0.7698, + "step": 17742 + }, + { + "epoch": 1.202181719628701, + "grad_norm": 7.0777082443237305, + "learning_rate": 7.673762748990348e-05, + "loss": 0.5331, + "step": 17743 + }, + { + "epoch": 1.2022494748966732, + "grad_norm": 6.164592742919922, + "learning_rate": 7.673625847080566e-05, + "loss": 0.6242, + "step": 17744 + }, + { + "epoch": 1.2023172301646454, + "grad_norm": 6.713252544403076, + "learning_rate": 7.673488945170786e-05, + "loss": 0.5903, + "step": 17745 + }, + { + "epoch": 1.2023849854326174, + "grad_norm": 5.783125877380371, + "learning_rate": 7.673352043261004e-05, + "loss": 0.7877, + "step": 17746 + }, + { + "epoch": 1.2024527407005894, + "grad_norm": 8.683629989624023, + "learning_rate": 7.673215141351222e-05, + "loss": 0.6339, + "step": 17747 + }, + { + "epoch": 1.2025204959685616, + "grad_norm": 8.471028327941895, + "learning_rate": 7.67307823944144e-05, + "loss": 0.7041, + "step": 17748 + }, + { + "epoch": 1.2025882512365336, + "grad_norm": 5.869851112365723, + "learning_rate": 7.672941337531659e-05, + "loss": 0.7802, + "step": 17749 + }, + { + "epoch": 1.2026560065045058, + "grad_norm": 5.1376776695251465, + "learning_rate": 7.672804435621877e-05, + "loss": 0.637, + "step": 17750 + }, + { + "epoch": 1.2027237617724778, + "grad_norm": 4.885056972503662, + "learning_rate": 7.672667533712095e-05, + "loss": 0.6424, + "step": 17751 + }, + { + "epoch": 1.20279151704045, + "grad_norm": 5.53432035446167, + "learning_rate": 7.672530631802313e-05, + "loss": 0.7047, + "step": 17752 + }, + { + "epoch": 1.202859272308422, + "grad_norm": 8.100170135498047, + "learning_rate": 7.672393729892531e-05, + "loss": 0.6902, + "step": 17753 + }, + { + "epoch": 1.2029270275763941, + "grad_norm": 6.803896903991699, + "learning_rate": 7.672256827982751e-05, + "loss": 0.6291, + "step": 17754 + }, + { + "epoch": 1.2029947828443661, + "grad_norm": 4.285371780395508, + "learning_rate": 7.672119926072969e-05, + "loss": 0.6842, + "step": 17755 + }, + { + "epoch": 1.2030625381123383, + "grad_norm": 5.140682697296143, + "learning_rate": 7.671983024163187e-05, + "loss": 0.7823, + "step": 17756 + }, + { + "epoch": 1.2031302933803103, + "grad_norm": 6.522716045379639, + "learning_rate": 7.671846122253405e-05, + "loss": 0.6746, + "step": 17757 + }, + { + "epoch": 1.2031980486482823, + "grad_norm": 7.171764850616455, + "learning_rate": 7.671709220343624e-05, + "loss": 0.7369, + "step": 17758 + }, + { + "epoch": 1.2032658039162545, + "grad_norm": 8.315983772277832, + "learning_rate": 7.671572318433842e-05, + "loss": 0.9706, + "step": 17759 + }, + { + "epoch": 1.2033335591842267, + "grad_norm": 5.393362998962402, + "learning_rate": 7.67143541652406e-05, + "loss": 0.5944, + "step": 17760 + }, + { + "epoch": 1.2034013144521987, + "grad_norm": 7.974836826324463, + "learning_rate": 7.67129851461428e-05, + "loss": 0.7715, + "step": 17761 + }, + { + "epoch": 1.2034690697201706, + "grad_norm": 8.586780548095703, + "learning_rate": 7.671161612704498e-05, + "loss": 0.7409, + "step": 17762 + }, + { + "epoch": 1.2035368249881429, + "grad_norm": 5.896920204162598, + "learning_rate": 7.671024710794716e-05, + "loss": 0.9213, + "step": 17763 + }, + { + "epoch": 1.2036045802561148, + "grad_norm": 6.252879619598389, + "learning_rate": 7.670887808884935e-05, + "loss": 0.6313, + "step": 17764 + }, + { + "epoch": 1.203672335524087, + "grad_norm": 5.003981113433838, + "learning_rate": 7.670750906975153e-05, + "loss": 0.8074, + "step": 17765 + }, + { + "epoch": 1.203740090792059, + "grad_norm": 6.52988862991333, + "learning_rate": 7.670614005065371e-05, + "loss": 0.6897, + "step": 17766 + }, + { + "epoch": 1.2038078460600312, + "grad_norm": 5.87505578994751, + "learning_rate": 7.670477103155589e-05, + "loss": 0.6301, + "step": 17767 + }, + { + "epoch": 1.2038756013280032, + "grad_norm": 5.153764724731445, + "learning_rate": 7.670340201245809e-05, + "loss": 0.8281, + "step": 17768 + }, + { + "epoch": 1.2039433565959754, + "grad_norm": 5.710607051849365, + "learning_rate": 7.670203299336027e-05, + "loss": 0.6222, + "step": 17769 + }, + { + "epoch": 1.2040111118639474, + "grad_norm": 6.170762538909912, + "learning_rate": 7.670066397426245e-05, + "loss": 0.7103, + "step": 17770 + }, + { + "epoch": 1.2040788671319196, + "grad_norm": 6.375565052032471, + "learning_rate": 7.669929495516463e-05, + "loss": 0.6191, + "step": 17771 + }, + { + "epoch": 1.2041466223998916, + "grad_norm": 4.824178218841553, + "learning_rate": 7.669792593606682e-05, + "loss": 0.7482, + "step": 17772 + }, + { + "epoch": 1.2042143776678638, + "grad_norm": 5.683229446411133, + "learning_rate": 7.6696556916969e-05, + "loss": 0.6528, + "step": 17773 + }, + { + "epoch": 1.2042821329358357, + "grad_norm": 7.511515140533447, + "learning_rate": 7.669518789787118e-05, + "loss": 0.7132, + "step": 17774 + }, + { + "epoch": 1.204349888203808, + "grad_norm": 4.747464179992676, + "learning_rate": 7.669381887877336e-05, + "loss": 0.6468, + "step": 17775 + }, + { + "epoch": 1.20441764347178, + "grad_norm": 6.176259517669678, + "learning_rate": 7.669244985967554e-05, + "loss": 0.8254, + "step": 17776 + }, + { + "epoch": 1.204485398739752, + "grad_norm": 6.217640399932861, + "learning_rate": 7.669108084057774e-05, + "loss": 0.7821, + "step": 17777 + }, + { + "epoch": 1.2045531540077241, + "grad_norm": 7.800964832305908, + "learning_rate": 7.668971182147992e-05, + "loss": 0.8139, + "step": 17778 + }, + { + "epoch": 1.204620909275696, + "grad_norm": 6.084750175476074, + "learning_rate": 7.66883428023821e-05, + "loss": 0.5719, + "step": 17779 + }, + { + "epoch": 1.2046886645436683, + "grad_norm": 6.607977390289307, + "learning_rate": 7.668697378328428e-05, + "loss": 0.7541, + "step": 17780 + }, + { + "epoch": 1.2047564198116403, + "grad_norm": 5.644943714141846, + "learning_rate": 7.668560476418647e-05, + "loss": 0.7759, + "step": 17781 + }, + { + "epoch": 1.2048241750796125, + "grad_norm": 10.316941261291504, + "learning_rate": 7.668423574508865e-05, + "loss": 0.6069, + "step": 17782 + }, + { + "epoch": 1.2048919303475845, + "grad_norm": 7.116324424743652, + "learning_rate": 7.668286672599083e-05, + "loss": 0.7976, + "step": 17783 + }, + { + "epoch": 1.2049596856155567, + "grad_norm": 4.7204365730285645, + "learning_rate": 7.668149770689301e-05, + "loss": 0.6528, + "step": 17784 + }, + { + "epoch": 1.2050274408835286, + "grad_norm": 4.601452350616455, + "learning_rate": 7.668012868779519e-05, + "loss": 0.7411, + "step": 17785 + }, + { + "epoch": 1.2050951961515008, + "grad_norm": 5.289638519287109, + "learning_rate": 7.667875966869739e-05, + "loss": 0.7751, + "step": 17786 + }, + { + "epoch": 1.2051629514194728, + "grad_norm": 4.839632987976074, + "learning_rate": 7.667739064959957e-05, + "loss": 0.5549, + "step": 17787 + }, + { + "epoch": 1.205230706687445, + "grad_norm": 5.583590030670166, + "learning_rate": 7.667602163050175e-05, + "loss": 0.6464, + "step": 17788 + }, + { + "epoch": 1.205298461955417, + "grad_norm": 7.026573181152344, + "learning_rate": 7.667465261140393e-05, + "loss": 0.764, + "step": 17789 + }, + { + "epoch": 1.2053662172233892, + "grad_norm": 6.217905044555664, + "learning_rate": 7.667328359230611e-05, + "loss": 0.6443, + "step": 17790 + }, + { + "epoch": 1.2054339724913612, + "grad_norm": 6.040329933166504, + "learning_rate": 7.66719145732083e-05, + "loss": 0.6784, + "step": 17791 + }, + { + "epoch": 1.2055017277593332, + "grad_norm": 9.230742454528809, + "learning_rate": 7.667054555411048e-05, + "loss": 0.6793, + "step": 17792 + }, + { + "epoch": 1.2055694830273054, + "grad_norm": 8.235578536987305, + "learning_rate": 7.666917653501266e-05, + "loss": 0.8269, + "step": 17793 + }, + { + "epoch": 1.2056372382952776, + "grad_norm": 6.351599216461182, + "learning_rate": 7.666780751591484e-05, + "loss": 0.7285, + "step": 17794 + }, + { + "epoch": 1.2057049935632496, + "grad_norm": 7.1128973960876465, + "learning_rate": 7.666643849681704e-05, + "loss": 0.8007, + "step": 17795 + }, + { + "epoch": 1.2057727488312215, + "grad_norm": 6.171035289764404, + "learning_rate": 7.666506947771922e-05, + "loss": 0.7373, + "step": 17796 + }, + { + "epoch": 1.2058405040991937, + "grad_norm": 5.534034729003906, + "learning_rate": 7.66637004586214e-05, + "loss": 0.6877, + "step": 17797 + }, + { + "epoch": 1.2059082593671657, + "grad_norm": 9.175405502319336, + "learning_rate": 7.666233143952358e-05, + "loss": 0.6967, + "step": 17798 + }, + { + "epoch": 1.205976014635138, + "grad_norm": 5.125663757324219, + "learning_rate": 7.666096242042576e-05, + "loss": 0.5005, + "step": 17799 + }, + { + "epoch": 1.20604376990311, + "grad_norm": 6.7034687995910645, + "learning_rate": 7.665959340132795e-05, + "loss": 0.8659, + "step": 17800 + }, + { + "epoch": 1.206111525171082, + "grad_norm": 5.710980415344238, + "learning_rate": 7.665822438223013e-05, + "loss": 0.6413, + "step": 17801 + }, + { + "epoch": 1.206179280439054, + "grad_norm": 4.896183967590332, + "learning_rate": 7.665685536313231e-05, + "loss": 0.5167, + "step": 17802 + }, + { + "epoch": 1.2062470357070263, + "grad_norm": 6.997365474700928, + "learning_rate": 7.665548634403449e-05, + "loss": 0.743, + "step": 17803 + }, + { + "epoch": 1.2063147909749983, + "grad_norm": 5.428713798522949, + "learning_rate": 7.665411732493669e-05, + "loss": 0.7474, + "step": 17804 + }, + { + "epoch": 1.2063825462429705, + "grad_norm": 4.988559722900391, + "learning_rate": 7.665274830583887e-05, + "loss": 0.5755, + "step": 17805 + }, + { + "epoch": 1.2064503015109425, + "grad_norm": 5.562001705169678, + "learning_rate": 7.665137928674105e-05, + "loss": 0.64, + "step": 17806 + }, + { + "epoch": 1.2065180567789144, + "grad_norm": 6.794801235198975, + "learning_rate": 7.665001026764324e-05, + "loss": 0.6635, + "step": 17807 + }, + { + "epoch": 1.2065858120468866, + "grad_norm": 7.743435382843018, + "learning_rate": 7.664864124854542e-05, + "loss": 0.6183, + "step": 17808 + }, + { + "epoch": 1.2066535673148588, + "grad_norm": 5.286825180053711, + "learning_rate": 7.66472722294476e-05, + "loss": 0.7029, + "step": 17809 + }, + { + "epoch": 1.2067213225828308, + "grad_norm": 3.8852932453155518, + "learning_rate": 7.66459032103498e-05, + "loss": 0.5791, + "step": 17810 + }, + { + "epoch": 1.2067890778508028, + "grad_norm": 10.570182800292969, + "learning_rate": 7.664453419125198e-05, + "loss": 0.7256, + "step": 17811 + }, + { + "epoch": 1.206856833118775, + "grad_norm": 6.794580936431885, + "learning_rate": 7.664316517215416e-05, + "loss": 0.6337, + "step": 17812 + }, + { + "epoch": 1.206924588386747, + "grad_norm": 6.657121658325195, + "learning_rate": 7.664179615305634e-05, + "loss": 0.8513, + "step": 17813 + }, + { + "epoch": 1.2069923436547192, + "grad_norm": 5.735461235046387, + "learning_rate": 7.664042713395853e-05, + "loss": 0.7064, + "step": 17814 + }, + { + "epoch": 1.2070600989226912, + "grad_norm": 5.639031887054443, + "learning_rate": 7.663905811486071e-05, + "loss": 0.7369, + "step": 17815 + }, + { + "epoch": 1.2071278541906634, + "grad_norm": 7.826430797576904, + "learning_rate": 7.663768909576289e-05, + "loss": 0.6114, + "step": 17816 + }, + { + "epoch": 1.2071956094586354, + "grad_norm": 5.544642448425293, + "learning_rate": 7.663632007666507e-05, + "loss": 0.543, + "step": 17817 + }, + { + "epoch": 1.2072633647266076, + "grad_norm": 6.61638069152832, + "learning_rate": 7.663495105756727e-05, + "loss": 0.6926, + "step": 17818 + }, + { + "epoch": 1.2073311199945795, + "grad_norm": 4.663098335266113, + "learning_rate": 7.663358203846945e-05, + "loss": 0.7931, + "step": 17819 + }, + { + "epoch": 1.2073988752625517, + "grad_norm": 6.7196478843688965, + "learning_rate": 7.663221301937163e-05, + "loss": 0.8295, + "step": 17820 + }, + { + "epoch": 1.2074666305305237, + "grad_norm": 5.3182196617126465, + "learning_rate": 7.663084400027381e-05, + "loss": 0.624, + "step": 17821 + }, + { + "epoch": 1.207534385798496, + "grad_norm": 5.531083106994629, + "learning_rate": 7.662947498117599e-05, + "loss": 0.4976, + "step": 17822 + }, + { + "epoch": 1.207602141066468, + "grad_norm": 5.298411846160889, + "learning_rate": 7.662810596207818e-05, + "loss": 0.6012, + "step": 17823 + }, + { + "epoch": 1.20766989633444, + "grad_norm": 4.497094631195068, + "learning_rate": 7.662673694298036e-05, + "loss": 0.6826, + "step": 17824 + }, + { + "epoch": 1.207737651602412, + "grad_norm": 5.269436836242676, + "learning_rate": 7.662536792388254e-05, + "loss": 0.6327, + "step": 17825 + }, + { + "epoch": 1.207805406870384, + "grad_norm": 5.4644646644592285, + "learning_rate": 7.662399890478472e-05, + "loss": 0.71, + "step": 17826 + }, + { + "epoch": 1.2078731621383563, + "grad_norm": 5.628913879394531, + "learning_rate": 7.662262988568692e-05, + "loss": 0.7919, + "step": 17827 + }, + { + "epoch": 1.2079409174063283, + "grad_norm": 5.46560525894165, + "learning_rate": 7.66212608665891e-05, + "loss": 0.8439, + "step": 17828 + }, + { + "epoch": 1.2080086726743005, + "grad_norm": 5.631601810455322, + "learning_rate": 7.661989184749128e-05, + "loss": 0.6336, + "step": 17829 + }, + { + "epoch": 1.2080764279422724, + "grad_norm": 5.358332633972168, + "learning_rate": 7.661852282839346e-05, + "loss": 0.9302, + "step": 17830 + }, + { + "epoch": 1.2081441832102446, + "grad_norm": 6.63771915435791, + "learning_rate": 7.661715380929564e-05, + "loss": 0.7401, + "step": 17831 + }, + { + "epoch": 1.2082119384782166, + "grad_norm": 6.354868412017822, + "learning_rate": 7.661578479019783e-05, + "loss": 0.7275, + "step": 17832 + }, + { + "epoch": 1.2082796937461888, + "grad_norm": 7.358850479125977, + "learning_rate": 7.661441577110001e-05, + "loss": 0.7485, + "step": 17833 + }, + { + "epoch": 1.2083474490141608, + "grad_norm": 5.550168991088867, + "learning_rate": 7.661304675200219e-05, + "loss": 0.7116, + "step": 17834 + }, + { + "epoch": 1.208415204282133, + "grad_norm": 6.618070125579834, + "learning_rate": 7.661167773290437e-05, + "loss": 0.6558, + "step": 17835 + }, + { + "epoch": 1.208482959550105, + "grad_norm": 7.177542209625244, + "learning_rate": 7.661030871380657e-05, + "loss": 0.6931, + "step": 17836 + }, + { + "epoch": 1.2085507148180772, + "grad_norm": 5.309508800506592, + "learning_rate": 7.660893969470875e-05, + "loss": 0.6781, + "step": 17837 + }, + { + "epoch": 1.2086184700860492, + "grad_norm": 5.462770462036133, + "learning_rate": 7.660757067561093e-05, + "loss": 0.6715, + "step": 17838 + }, + { + "epoch": 1.2086862253540214, + "grad_norm": 5.621263027191162, + "learning_rate": 7.660620165651311e-05, + "loss": 0.6802, + "step": 17839 + }, + { + "epoch": 1.2087539806219934, + "grad_norm": 6.031620979309082, + "learning_rate": 7.660483263741529e-05, + "loss": 0.708, + "step": 17840 + }, + { + "epoch": 1.2088217358899653, + "grad_norm": 5.407814979553223, + "learning_rate": 7.660346361831748e-05, + "loss": 0.6426, + "step": 17841 + }, + { + "epoch": 1.2088894911579375, + "grad_norm": 5.029693603515625, + "learning_rate": 7.660209459921966e-05, + "loss": 0.7588, + "step": 17842 + }, + { + "epoch": 1.2089572464259097, + "grad_norm": 5.002652645111084, + "learning_rate": 7.660072558012184e-05, + "loss": 0.6632, + "step": 17843 + }, + { + "epoch": 1.2090250016938817, + "grad_norm": 5.994513988494873, + "learning_rate": 7.659935656102402e-05, + "loss": 0.8487, + "step": 17844 + }, + { + "epoch": 1.2090927569618537, + "grad_norm": 6.450139045715332, + "learning_rate": 7.65979875419262e-05, + "loss": 0.5194, + "step": 17845 + }, + { + "epoch": 1.209160512229826, + "grad_norm": 6.206781387329102, + "learning_rate": 7.65966185228284e-05, + "loss": 0.6804, + "step": 17846 + }, + { + "epoch": 1.2092282674977979, + "grad_norm": 6.784191608428955, + "learning_rate": 7.659524950373058e-05, + "loss": 0.8282, + "step": 17847 + }, + { + "epoch": 1.20929602276577, + "grad_norm": 7.1671671867370605, + "learning_rate": 7.659388048463276e-05, + "loss": 0.8137, + "step": 17848 + }, + { + "epoch": 1.209363778033742, + "grad_norm": 5.292774200439453, + "learning_rate": 7.659251146553494e-05, + "loss": 0.5384, + "step": 17849 + }, + { + "epoch": 1.2094315333017143, + "grad_norm": 6.33740758895874, + "learning_rate": 7.659114244643713e-05, + "loss": 0.6687, + "step": 17850 + }, + { + "epoch": 1.2094992885696862, + "grad_norm": 8.038208961486816, + "learning_rate": 7.658977342733931e-05, + "loss": 0.8412, + "step": 17851 + }, + { + "epoch": 1.2095670438376585, + "grad_norm": 5.751485347747803, + "learning_rate": 7.658840440824149e-05, + "loss": 0.845, + "step": 17852 + }, + { + "epoch": 1.2096347991056304, + "grad_norm": 5.004260540008545, + "learning_rate": 7.658703538914369e-05, + "loss": 0.594, + "step": 17853 + }, + { + "epoch": 1.2097025543736026, + "grad_norm": 6.479980945587158, + "learning_rate": 7.658566637004587e-05, + "loss": 0.7178, + "step": 17854 + }, + { + "epoch": 1.2097703096415746, + "grad_norm": 5.524226665496826, + "learning_rate": 7.658429735094805e-05, + "loss": 0.5855, + "step": 17855 + }, + { + "epoch": 1.2098380649095466, + "grad_norm": 6.297536849975586, + "learning_rate": 7.658292833185024e-05, + "loss": 0.5762, + "step": 17856 + }, + { + "epoch": 1.2099058201775188, + "grad_norm": 5.452970504760742, + "learning_rate": 7.658155931275242e-05, + "loss": 0.6021, + "step": 17857 + }, + { + "epoch": 1.209973575445491, + "grad_norm": 6.164029121398926, + "learning_rate": 7.65801902936546e-05, + "loss": 0.834, + "step": 17858 + }, + { + "epoch": 1.210041330713463, + "grad_norm": 6.116921901702881, + "learning_rate": 7.65788212745568e-05, + "loss": 0.6033, + "step": 17859 + }, + { + "epoch": 1.210109085981435, + "grad_norm": 4.469160556793213, + "learning_rate": 7.657745225545898e-05, + "loss": 0.8679, + "step": 17860 + }, + { + "epoch": 1.2101768412494072, + "grad_norm": 6.041546821594238, + "learning_rate": 7.657608323636116e-05, + "loss": 0.5729, + "step": 17861 + }, + { + "epoch": 1.2102445965173791, + "grad_norm": 5.222907066345215, + "learning_rate": 7.657471421726334e-05, + "loss": 0.596, + "step": 17862 + }, + { + "epoch": 1.2103123517853513, + "grad_norm": 6.178247451782227, + "learning_rate": 7.657334519816552e-05, + "loss": 0.571, + "step": 17863 + }, + { + "epoch": 1.2103801070533233, + "grad_norm": 5.390204429626465, + "learning_rate": 7.657197617906771e-05, + "loss": 0.8544, + "step": 17864 + }, + { + "epoch": 1.2104478623212955, + "grad_norm": 5.556325435638428, + "learning_rate": 7.657060715996989e-05, + "loss": 0.5408, + "step": 17865 + }, + { + "epoch": 1.2105156175892675, + "grad_norm": 6.003518581390381, + "learning_rate": 7.656923814087207e-05, + "loss": 0.9889, + "step": 17866 + }, + { + "epoch": 1.2105833728572397, + "grad_norm": 8.722765922546387, + "learning_rate": 7.656786912177425e-05, + "loss": 0.5056, + "step": 17867 + }, + { + "epoch": 1.2106511281252117, + "grad_norm": 5.819286823272705, + "learning_rate": 7.656650010267643e-05, + "loss": 0.6311, + "step": 17868 + }, + { + "epoch": 1.210718883393184, + "grad_norm": 5.321593761444092, + "learning_rate": 7.656513108357863e-05, + "loss": 0.5492, + "step": 17869 + }, + { + "epoch": 1.2107866386611559, + "grad_norm": 5.966612339019775, + "learning_rate": 7.65637620644808e-05, + "loss": 0.801, + "step": 17870 + }, + { + "epoch": 1.210854393929128, + "grad_norm": 5.289181232452393, + "learning_rate": 7.656239304538299e-05, + "loss": 0.6989, + "step": 17871 + }, + { + "epoch": 1.2109221491971, + "grad_norm": 5.957951545715332, + "learning_rate": 7.656102402628517e-05, + "loss": 0.7726, + "step": 17872 + }, + { + "epoch": 1.2109899044650723, + "grad_norm": 6.526004314422607, + "learning_rate": 7.655965500718736e-05, + "loss": 0.8481, + "step": 17873 + }, + { + "epoch": 1.2110576597330442, + "grad_norm": 8.721756935119629, + "learning_rate": 7.655828598808954e-05, + "loss": 0.681, + "step": 17874 + }, + { + "epoch": 1.2111254150010162, + "grad_norm": 5.119085311889648, + "learning_rate": 7.655691696899172e-05, + "loss": 0.3988, + "step": 17875 + }, + { + "epoch": 1.2111931702689884, + "grad_norm": 4.622032165527344, + "learning_rate": 7.65555479498939e-05, + "loss": 0.7325, + "step": 17876 + }, + { + "epoch": 1.2112609255369604, + "grad_norm": 5.131158828735352, + "learning_rate": 7.655417893079608e-05, + "loss": 0.58, + "step": 17877 + }, + { + "epoch": 1.2113286808049326, + "grad_norm": 6.189968585968018, + "learning_rate": 7.655280991169828e-05, + "loss": 0.9961, + "step": 17878 + }, + { + "epoch": 1.2113964360729046, + "grad_norm": 5.269293785095215, + "learning_rate": 7.655144089260046e-05, + "loss": 0.705, + "step": 17879 + }, + { + "epoch": 1.2114641913408768, + "grad_norm": 4.969742774963379, + "learning_rate": 7.655007187350264e-05, + "loss": 0.8844, + "step": 17880 + }, + { + "epoch": 1.2115319466088488, + "grad_norm": 6.767873287200928, + "learning_rate": 7.654870285440482e-05, + "loss": 1.0077, + "step": 17881 + }, + { + "epoch": 1.211599701876821, + "grad_norm": 8.03074836730957, + "learning_rate": 7.654733383530701e-05, + "loss": 0.9055, + "step": 17882 + }, + { + "epoch": 1.211667457144793, + "grad_norm": 7.549478054046631, + "learning_rate": 7.654596481620919e-05, + "loss": 0.8107, + "step": 17883 + }, + { + "epoch": 1.2117352124127652, + "grad_norm": 7.665398597717285, + "learning_rate": 7.654459579711137e-05, + "loss": 0.5934, + "step": 17884 + }, + { + "epoch": 1.2118029676807371, + "grad_norm": 7.51274299621582, + "learning_rate": 7.654322677801355e-05, + "loss": 0.6872, + "step": 17885 + }, + { + "epoch": 1.2118707229487093, + "grad_norm": 6.046478748321533, + "learning_rate": 7.654185775891573e-05, + "loss": 0.6458, + "step": 17886 + }, + { + "epoch": 1.2119384782166813, + "grad_norm": 5.589632511138916, + "learning_rate": 7.654048873981793e-05, + "loss": 0.7045, + "step": 17887 + }, + { + "epoch": 1.2120062334846535, + "grad_norm": 7.263500690460205, + "learning_rate": 7.65391197207201e-05, + "loss": 0.7436, + "step": 17888 + }, + { + "epoch": 1.2120739887526255, + "grad_norm": 6.018463611602783, + "learning_rate": 7.653775070162229e-05, + "loss": 0.7441, + "step": 17889 + }, + { + "epoch": 1.2121417440205975, + "grad_norm": 11.365656852722168, + "learning_rate": 7.653638168252447e-05, + "loss": 0.7303, + "step": 17890 + }, + { + "epoch": 1.2122094992885697, + "grad_norm": 6.8662109375, + "learning_rate": 7.653501266342666e-05, + "loss": 0.9822, + "step": 17891 + }, + { + "epoch": 1.212277254556542, + "grad_norm": 5.432251930236816, + "learning_rate": 7.653364364432884e-05, + "loss": 0.718, + "step": 17892 + }, + { + "epoch": 1.2123450098245139, + "grad_norm": 5.230556488037109, + "learning_rate": 7.653227462523102e-05, + "loss": 0.5133, + "step": 17893 + }, + { + "epoch": 1.2124127650924859, + "grad_norm": 5.820934295654297, + "learning_rate": 7.65309056061332e-05, + "loss": 0.7827, + "step": 17894 + }, + { + "epoch": 1.212480520360458, + "grad_norm": 7.665424823760986, + "learning_rate": 7.652953658703538e-05, + "loss": 0.871, + "step": 17895 + }, + { + "epoch": 1.21254827562843, + "grad_norm": 6.874349594116211, + "learning_rate": 7.652816756793758e-05, + "loss": 0.7607, + "step": 17896 + }, + { + "epoch": 1.2126160308964022, + "grad_norm": 4.565303802490234, + "learning_rate": 7.652679854883976e-05, + "loss": 0.4878, + "step": 17897 + }, + { + "epoch": 1.2126837861643742, + "grad_norm": 8.851542472839355, + "learning_rate": 7.652542952974194e-05, + "loss": 0.7029, + "step": 17898 + }, + { + "epoch": 1.2127515414323464, + "grad_norm": 5.475307464599609, + "learning_rate": 7.652406051064413e-05, + "loss": 0.7158, + "step": 17899 + }, + { + "epoch": 1.2128192967003184, + "grad_norm": 5.020805835723877, + "learning_rate": 7.652269149154631e-05, + "loss": 0.607, + "step": 17900 + }, + { + "epoch": 1.2128870519682906, + "grad_norm": 4.76190185546875, + "learning_rate": 7.652132247244849e-05, + "loss": 0.682, + "step": 17901 + }, + { + "epoch": 1.2129548072362626, + "grad_norm": 5.650835037231445, + "learning_rate": 7.651995345335069e-05, + "loss": 0.6631, + "step": 17902 + }, + { + "epoch": 1.2130225625042348, + "grad_norm": 5.15869140625, + "learning_rate": 7.651858443425287e-05, + "loss": 0.7104, + "step": 17903 + }, + { + "epoch": 1.2130903177722068, + "grad_norm": 6.4718804359436035, + "learning_rate": 7.651721541515505e-05, + "loss": 0.828, + "step": 17904 + }, + { + "epoch": 1.2131580730401788, + "grad_norm": 6.307829856872559, + "learning_rate": 7.651584639605724e-05, + "loss": 0.9308, + "step": 17905 + }, + { + "epoch": 1.213225828308151, + "grad_norm": 4.885651111602783, + "learning_rate": 7.651447737695942e-05, + "loss": 0.8558, + "step": 17906 + }, + { + "epoch": 1.2132935835761232, + "grad_norm": 7.297476768493652, + "learning_rate": 7.65131083578616e-05, + "loss": 0.5909, + "step": 17907 + }, + { + "epoch": 1.2133613388440951, + "grad_norm": 6.080501079559326, + "learning_rate": 7.651173933876378e-05, + "loss": 0.7439, + "step": 17908 + }, + { + "epoch": 1.2134290941120671, + "grad_norm": 6.017581939697266, + "learning_rate": 7.651037031966596e-05, + "loss": 0.6583, + "step": 17909 + }, + { + "epoch": 1.2134968493800393, + "grad_norm": 7.0170464515686035, + "learning_rate": 7.650900130056816e-05, + "loss": 0.7311, + "step": 17910 + }, + { + "epoch": 1.2135646046480113, + "grad_norm": 5.122740268707275, + "learning_rate": 7.650763228147034e-05, + "loss": 0.7785, + "step": 17911 + }, + { + "epoch": 1.2136323599159835, + "grad_norm": 4.886419773101807, + "learning_rate": 7.650626326237252e-05, + "loss": 0.6322, + "step": 17912 + }, + { + "epoch": 1.2137001151839555, + "grad_norm": 5.377499103546143, + "learning_rate": 7.65048942432747e-05, + "loss": 0.8079, + "step": 17913 + }, + { + "epoch": 1.2137678704519277, + "grad_norm": 5.473203182220459, + "learning_rate": 7.650352522417689e-05, + "loss": 0.6221, + "step": 17914 + }, + { + "epoch": 1.2138356257198997, + "grad_norm": 6.620959758758545, + "learning_rate": 7.650215620507907e-05, + "loss": 0.6327, + "step": 17915 + }, + { + "epoch": 1.2139033809878719, + "grad_norm": 5.501951694488525, + "learning_rate": 7.650078718598125e-05, + "loss": 0.6191, + "step": 17916 + }, + { + "epoch": 1.2139711362558439, + "grad_norm": 5.863165378570557, + "learning_rate": 7.649941816688343e-05, + "loss": 0.4766, + "step": 17917 + }, + { + "epoch": 1.214038891523816, + "grad_norm": 4.651849269866943, + "learning_rate": 7.649804914778561e-05, + "loss": 0.8014, + "step": 17918 + }, + { + "epoch": 1.214106646791788, + "grad_norm": 5.458044528961182, + "learning_rate": 7.64966801286878e-05, + "loss": 0.6545, + "step": 17919 + }, + { + "epoch": 1.2141744020597602, + "grad_norm": 6.30871057510376, + "learning_rate": 7.649531110958999e-05, + "loss": 0.8052, + "step": 17920 + }, + { + "epoch": 1.2142421573277322, + "grad_norm": 6.08293342590332, + "learning_rate": 7.649394209049217e-05, + "loss": 0.863, + "step": 17921 + }, + { + "epoch": 1.2143099125957044, + "grad_norm": 4.689571857452393, + "learning_rate": 7.649257307139435e-05, + "loss": 0.7528, + "step": 17922 + }, + { + "epoch": 1.2143776678636764, + "grad_norm": 5.698482513427734, + "learning_rate": 7.649120405229653e-05, + "loss": 0.7365, + "step": 17923 + }, + { + "epoch": 1.2144454231316484, + "grad_norm": 6.945766925811768, + "learning_rate": 7.648983503319872e-05, + "loss": 0.7323, + "step": 17924 + }, + { + "epoch": 1.2145131783996206, + "grad_norm": 4.9405670166015625, + "learning_rate": 7.64884660141009e-05, + "loss": 0.5537, + "step": 17925 + }, + { + "epoch": 1.2145809336675926, + "grad_norm": 6.1244001388549805, + "learning_rate": 7.648709699500308e-05, + "loss": 0.9281, + "step": 17926 + }, + { + "epoch": 1.2146486889355648, + "grad_norm": 5.485476970672607, + "learning_rate": 7.648572797590526e-05, + "loss": 0.7444, + "step": 17927 + }, + { + "epoch": 1.2147164442035367, + "grad_norm": 6.439229965209961, + "learning_rate": 7.648435895680746e-05, + "loss": 0.6663, + "step": 17928 + }, + { + "epoch": 1.214784199471509, + "grad_norm": 4.058414936065674, + "learning_rate": 7.648298993770964e-05, + "loss": 0.6314, + "step": 17929 + }, + { + "epoch": 1.214851954739481, + "grad_norm": 4.499625205993652, + "learning_rate": 7.648162091861182e-05, + "loss": 0.6212, + "step": 17930 + }, + { + "epoch": 1.2149197100074531, + "grad_norm": 6.286862850189209, + "learning_rate": 7.6480251899514e-05, + "loss": 0.795, + "step": 17931 + }, + { + "epoch": 1.2149874652754251, + "grad_norm": 5.967475414276123, + "learning_rate": 7.647888288041618e-05, + "loss": 0.6406, + "step": 17932 + }, + { + "epoch": 1.2150552205433973, + "grad_norm": 4.926750659942627, + "learning_rate": 7.647751386131837e-05, + "loss": 0.8561, + "step": 17933 + }, + { + "epoch": 1.2151229758113693, + "grad_norm": 4.823459625244141, + "learning_rate": 7.647614484222055e-05, + "loss": 0.6308, + "step": 17934 + }, + { + "epoch": 1.2151907310793415, + "grad_norm": 6.4650797843933105, + "learning_rate": 7.647477582312273e-05, + "loss": 0.9417, + "step": 17935 + }, + { + "epoch": 1.2152584863473135, + "grad_norm": 6.535785675048828, + "learning_rate": 7.647340680402491e-05, + "loss": 0.6846, + "step": 17936 + }, + { + "epoch": 1.2153262416152857, + "grad_norm": 5.6134819984436035, + "learning_rate": 7.64720377849271e-05, + "loss": 0.8102, + "step": 17937 + }, + { + "epoch": 1.2153939968832577, + "grad_norm": 8.289133071899414, + "learning_rate": 7.647066876582929e-05, + "loss": 0.8256, + "step": 17938 + }, + { + "epoch": 1.2154617521512296, + "grad_norm": 5.382526397705078, + "learning_rate": 7.646929974673147e-05, + "loss": 0.5605, + "step": 17939 + }, + { + "epoch": 1.2155295074192018, + "grad_norm": 4.763079643249512, + "learning_rate": 7.646793072763365e-05, + "loss": 0.4803, + "step": 17940 + }, + { + "epoch": 1.215597262687174, + "grad_norm": 4.224546432495117, + "learning_rate": 7.646656170853583e-05, + "loss": 0.4886, + "step": 17941 + }, + { + "epoch": 1.215665017955146, + "grad_norm": 7.985034465789795, + "learning_rate": 7.646519268943802e-05, + "loss": 0.6896, + "step": 17942 + }, + { + "epoch": 1.215732773223118, + "grad_norm": 7.068222522735596, + "learning_rate": 7.64638236703402e-05, + "loss": 0.7415, + "step": 17943 + }, + { + "epoch": 1.2158005284910902, + "grad_norm": 5.369699001312256, + "learning_rate": 7.646245465124238e-05, + "loss": 0.7562, + "step": 17944 + }, + { + "epoch": 1.2158682837590622, + "grad_norm": 5.549074172973633, + "learning_rate": 7.646108563214456e-05, + "loss": 0.6803, + "step": 17945 + }, + { + "epoch": 1.2159360390270344, + "grad_norm": 5.730227947235107, + "learning_rate": 7.645971661304676e-05, + "loss": 0.5862, + "step": 17946 + }, + { + "epoch": 1.2160037942950064, + "grad_norm": 5.7196478843688965, + "learning_rate": 7.645834759394894e-05, + "loss": 0.4924, + "step": 17947 + }, + { + "epoch": 1.2160715495629786, + "grad_norm": 6.75718355178833, + "learning_rate": 7.645697857485112e-05, + "loss": 0.7233, + "step": 17948 + }, + { + "epoch": 1.2161393048309506, + "grad_norm": 8.140740394592285, + "learning_rate": 7.645560955575331e-05, + "loss": 0.706, + "step": 17949 + }, + { + "epoch": 1.2162070600989228, + "grad_norm": 9.06883716583252, + "learning_rate": 7.645424053665549e-05, + "loss": 0.5888, + "step": 17950 + }, + { + "epoch": 1.2162748153668947, + "grad_norm": 6.231507301330566, + "learning_rate": 7.645287151755767e-05, + "loss": 0.6705, + "step": 17951 + }, + { + "epoch": 1.216342570634867, + "grad_norm": 6.362677097320557, + "learning_rate": 7.645150249845987e-05, + "loss": 0.696, + "step": 17952 + }, + { + "epoch": 1.216410325902839, + "grad_norm": 5.278102397918701, + "learning_rate": 7.645013347936205e-05, + "loss": 0.612, + "step": 17953 + }, + { + "epoch": 1.216478081170811, + "grad_norm": 5.913932800292969, + "learning_rate": 7.644876446026423e-05, + "loss": 0.8063, + "step": 17954 + }, + { + "epoch": 1.2165458364387831, + "grad_norm": 6.743002891540527, + "learning_rate": 7.64473954411664e-05, + "loss": 0.6755, + "step": 17955 + }, + { + "epoch": 1.2166135917067553, + "grad_norm": 6.559525012969971, + "learning_rate": 7.64460264220686e-05, + "loss": 0.7889, + "step": 17956 + }, + { + "epoch": 1.2166813469747273, + "grad_norm": 7.0902228355407715, + "learning_rate": 7.644465740297078e-05, + "loss": 0.5968, + "step": 17957 + }, + { + "epoch": 1.2167491022426993, + "grad_norm": 5.9385576248168945, + "learning_rate": 7.644328838387296e-05, + "loss": 0.7367, + "step": 17958 + }, + { + "epoch": 1.2168168575106715, + "grad_norm": 5.945615768432617, + "learning_rate": 7.644191936477514e-05, + "loss": 0.6871, + "step": 17959 + }, + { + "epoch": 1.2168846127786435, + "grad_norm": 5.105138778686523, + "learning_rate": 7.644055034567734e-05, + "loss": 0.6814, + "step": 17960 + }, + { + "epoch": 1.2169523680466157, + "grad_norm": 7.892396926879883, + "learning_rate": 7.643918132657952e-05, + "loss": 0.7212, + "step": 17961 + }, + { + "epoch": 1.2170201233145876, + "grad_norm": 7.983614444732666, + "learning_rate": 7.64378123074817e-05, + "loss": 0.6391, + "step": 17962 + }, + { + "epoch": 1.2170878785825598, + "grad_norm": 5.618093013763428, + "learning_rate": 7.643644328838388e-05, + "loss": 0.7687, + "step": 17963 + }, + { + "epoch": 1.2171556338505318, + "grad_norm": 7.337813854217529, + "learning_rate": 7.643507426928606e-05, + "loss": 0.6703, + "step": 17964 + }, + { + "epoch": 1.217223389118504, + "grad_norm": 4.311225891113281, + "learning_rate": 7.643370525018825e-05, + "loss": 0.6361, + "step": 17965 + }, + { + "epoch": 1.217291144386476, + "grad_norm": 6.524477958679199, + "learning_rate": 7.643233623109043e-05, + "loss": 0.7418, + "step": 17966 + }, + { + "epoch": 1.2173588996544482, + "grad_norm": 6.941540241241455, + "learning_rate": 7.643096721199261e-05, + "loss": 0.5442, + "step": 17967 + }, + { + "epoch": 1.2174266549224202, + "grad_norm": 5.137349605560303, + "learning_rate": 7.642959819289479e-05, + "loss": 0.7523, + "step": 17968 + }, + { + "epoch": 1.2174944101903924, + "grad_norm": 5.285187721252441, + "learning_rate": 7.642822917379699e-05, + "loss": 0.5724, + "step": 17969 + }, + { + "epoch": 1.2175621654583644, + "grad_norm": 6.266340255737305, + "learning_rate": 7.642686015469917e-05, + "loss": 0.6546, + "step": 17970 + }, + { + "epoch": 1.2176299207263366, + "grad_norm": 5.239572525024414, + "learning_rate": 7.642549113560135e-05, + "loss": 0.6406, + "step": 17971 + }, + { + "epoch": 1.2176976759943086, + "grad_norm": 6.736250400543213, + "learning_rate": 7.642412211650353e-05, + "loss": 1.0865, + "step": 17972 + }, + { + "epoch": 1.2177654312622805, + "grad_norm": 8.947715759277344, + "learning_rate": 7.64227530974057e-05, + "loss": 0.5183, + "step": 17973 + }, + { + "epoch": 1.2178331865302527, + "grad_norm": 8.303337097167969, + "learning_rate": 7.64213840783079e-05, + "loss": 1.1634, + "step": 17974 + }, + { + "epoch": 1.2179009417982247, + "grad_norm": 6.613880634307861, + "learning_rate": 7.642001505921008e-05, + "loss": 0.7042, + "step": 17975 + }, + { + "epoch": 1.217968697066197, + "grad_norm": 6.535386085510254, + "learning_rate": 7.641864604011226e-05, + "loss": 0.7257, + "step": 17976 + }, + { + "epoch": 1.218036452334169, + "grad_norm": 6.180886745452881, + "learning_rate": 7.641727702101444e-05, + "loss": 0.6752, + "step": 17977 + }, + { + "epoch": 1.218104207602141, + "grad_norm": 5.949535369873047, + "learning_rate": 7.641590800191662e-05, + "loss": 0.7168, + "step": 17978 + }, + { + "epoch": 1.218171962870113, + "grad_norm": 7.5198283195495605, + "learning_rate": 7.641453898281882e-05, + "loss": 0.8332, + "step": 17979 + }, + { + "epoch": 1.2182397181380853, + "grad_norm": 6.259788990020752, + "learning_rate": 7.6413169963721e-05, + "loss": 0.7097, + "step": 17980 + }, + { + "epoch": 1.2183074734060573, + "grad_norm": 5.875858306884766, + "learning_rate": 7.641180094462318e-05, + "loss": 0.6978, + "step": 17981 + }, + { + "epoch": 1.2183752286740295, + "grad_norm": 7.679528713226318, + "learning_rate": 7.641043192552536e-05, + "loss": 0.6878, + "step": 17982 + }, + { + "epoch": 1.2184429839420015, + "grad_norm": 5.411177635192871, + "learning_rate": 7.640906290642755e-05, + "loss": 0.6506, + "step": 17983 + }, + { + "epoch": 1.2185107392099737, + "grad_norm": 5.739083766937256, + "learning_rate": 7.640769388732973e-05, + "loss": 0.689, + "step": 17984 + }, + { + "epoch": 1.2185784944779456, + "grad_norm": 6.422370910644531, + "learning_rate": 7.640632486823191e-05, + "loss": 0.7599, + "step": 17985 + }, + { + "epoch": 1.2186462497459178, + "grad_norm": 5.531982421875, + "learning_rate": 7.640495584913409e-05, + "loss": 0.7083, + "step": 17986 + }, + { + "epoch": 1.2187140050138898, + "grad_norm": 9.015816688537598, + "learning_rate": 7.640358683003627e-05, + "loss": 0.8653, + "step": 17987 + }, + { + "epoch": 1.2187817602818618, + "grad_norm": 7.057861804962158, + "learning_rate": 7.640221781093847e-05, + "loss": 0.5141, + "step": 17988 + }, + { + "epoch": 1.218849515549834, + "grad_norm": 7.158889293670654, + "learning_rate": 7.640084879184065e-05, + "loss": 0.6141, + "step": 17989 + }, + { + "epoch": 1.2189172708178062, + "grad_norm": 7.1165771484375, + "learning_rate": 7.639947977274283e-05, + "loss": 0.7902, + "step": 17990 + }, + { + "epoch": 1.2189850260857782, + "grad_norm": 6.267188549041748, + "learning_rate": 7.639811075364501e-05, + "loss": 0.5597, + "step": 17991 + }, + { + "epoch": 1.2190527813537502, + "grad_norm": 5.786649227142334, + "learning_rate": 7.63967417345472e-05, + "loss": 0.6786, + "step": 17992 + }, + { + "epoch": 1.2191205366217224, + "grad_norm": 4.65104341506958, + "learning_rate": 7.639537271544938e-05, + "loss": 0.5413, + "step": 17993 + }, + { + "epoch": 1.2191882918896944, + "grad_norm": 6.078935623168945, + "learning_rate": 7.639400369635156e-05, + "loss": 0.7329, + "step": 17994 + }, + { + "epoch": 1.2192560471576666, + "grad_norm": 5.8307342529296875, + "learning_rate": 7.639263467725376e-05, + "loss": 0.7237, + "step": 17995 + }, + { + "epoch": 1.2193238024256385, + "grad_norm": 5.22831392288208, + "learning_rate": 7.639126565815594e-05, + "loss": 0.6605, + "step": 17996 + }, + { + "epoch": 1.2193915576936107, + "grad_norm": 6.974266529083252, + "learning_rate": 7.638989663905812e-05, + "loss": 0.7335, + "step": 17997 + }, + { + "epoch": 1.2194593129615827, + "grad_norm": 5.890809059143066, + "learning_rate": 7.638852761996031e-05, + "loss": 0.8327, + "step": 17998 + }, + { + "epoch": 1.219527068229555, + "grad_norm": 7.678559303283691, + "learning_rate": 7.638715860086249e-05, + "loss": 0.7564, + "step": 17999 + }, + { + "epoch": 1.219594823497527, + "grad_norm": 4.4165191650390625, + "learning_rate": 7.638578958176467e-05, + "loss": 0.6001, + "step": 18000 + }, + { + "epoch": 1.219662578765499, + "grad_norm": 7.2417731285095215, + "learning_rate": 7.638442056266685e-05, + "loss": 0.8368, + "step": 18001 + }, + { + "epoch": 1.219730334033471, + "grad_norm": 5.695639133453369, + "learning_rate": 7.638305154356905e-05, + "loss": 0.7352, + "step": 18002 + }, + { + "epoch": 1.219798089301443, + "grad_norm": 9.774930953979492, + "learning_rate": 7.638168252447123e-05, + "loss": 0.6938, + "step": 18003 + }, + { + "epoch": 1.2198658445694153, + "grad_norm": 7.178581714630127, + "learning_rate": 7.63803135053734e-05, + "loss": 0.5617, + "step": 18004 + }, + { + "epoch": 1.2199335998373875, + "grad_norm": 4.547571182250977, + "learning_rate": 7.637894448627559e-05, + "loss": 0.6759, + "step": 18005 + }, + { + "epoch": 1.2200013551053595, + "grad_norm": 6.263281345367432, + "learning_rate": 7.637757546717778e-05, + "loss": 0.7855, + "step": 18006 + }, + { + "epoch": 1.2200691103733314, + "grad_norm": 6.402713298797607, + "learning_rate": 7.637620644807996e-05, + "loss": 0.6332, + "step": 18007 + }, + { + "epoch": 1.2201368656413036, + "grad_norm": 7.917328834533691, + "learning_rate": 7.637483742898214e-05, + "loss": 0.8464, + "step": 18008 + }, + { + "epoch": 1.2202046209092756, + "grad_norm": 6.9080891609191895, + "learning_rate": 7.637346840988432e-05, + "loss": 0.6566, + "step": 18009 + }, + { + "epoch": 1.2202723761772478, + "grad_norm": 5.2827467918396, + "learning_rate": 7.63720993907865e-05, + "loss": 0.8047, + "step": 18010 + }, + { + "epoch": 1.2203401314452198, + "grad_norm": 8.067967414855957, + "learning_rate": 7.63707303716887e-05, + "loss": 0.6424, + "step": 18011 + }, + { + "epoch": 1.220407886713192, + "grad_norm": 5.451672554016113, + "learning_rate": 7.636936135259088e-05, + "loss": 0.671, + "step": 18012 + }, + { + "epoch": 1.220475641981164, + "grad_norm": 8.954378128051758, + "learning_rate": 7.636799233349306e-05, + "loss": 0.8216, + "step": 18013 + }, + { + "epoch": 1.2205433972491362, + "grad_norm": 4.8422651290893555, + "learning_rate": 7.636662331439524e-05, + "loss": 0.4488, + "step": 18014 + }, + { + "epoch": 1.2206111525171082, + "grad_norm": 7.705546855926514, + "learning_rate": 7.636525429529743e-05, + "loss": 0.7685, + "step": 18015 + }, + { + "epoch": 1.2206789077850804, + "grad_norm": 7.396956920623779, + "learning_rate": 7.636388527619961e-05, + "loss": 0.6365, + "step": 18016 + }, + { + "epoch": 1.2207466630530523, + "grad_norm": 5.974605083465576, + "learning_rate": 7.636251625710179e-05, + "loss": 0.5927, + "step": 18017 + }, + { + "epoch": 1.2208144183210246, + "grad_norm": 5.919744968414307, + "learning_rate": 7.636114723800397e-05, + "loss": 0.5308, + "step": 18018 + }, + { + "epoch": 1.2208821735889965, + "grad_norm": 5.136290550231934, + "learning_rate": 7.635977821890615e-05, + "loss": 0.5941, + "step": 18019 + }, + { + "epoch": 1.2209499288569687, + "grad_norm": 4.4463210105896, + "learning_rate": 7.635840919980835e-05, + "loss": 0.4817, + "step": 18020 + }, + { + "epoch": 1.2210176841249407, + "grad_norm": 6.176875114440918, + "learning_rate": 7.635704018071053e-05, + "loss": 0.6398, + "step": 18021 + }, + { + "epoch": 1.2210854393929127, + "grad_norm": 5.770840167999268, + "learning_rate": 7.63556711616127e-05, + "loss": 0.6786, + "step": 18022 + }, + { + "epoch": 1.221153194660885, + "grad_norm": 6.276705741882324, + "learning_rate": 7.635430214251489e-05, + "loss": 0.8613, + "step": 18023 + }, + { + "epoch": 1.2212209499288569, + "grad_norm": 4.476863384246826, + "learning_rate": 7.635293312341708e-05, + "loss": 0.6629, + "step": 18024 + }, + { + "epoch": 1.221288705196829, + "grad_norm": 6.741992950439453, + "learning_rate": 7.635156410431926e-05, + "loss": 0.8168, + "step": 18025 + }, + { + "epoch": 1.221356460464801, + "grad_norm": 6.626770973205566, + "learning_rate": 7.635019508522144e-05, + "loss": 0.6608, + "step": 18026 + }, + { + "epoch": 1.2214242157327733, + "grad_norm": 6.388763427734375, + "learning_rate": 7.634882606612362e-05, + "loss": 0.7622, + "step": 18027 + }, + { + "epoch": 1.2214919710007452, + "grad_norm": 7.007261276245117, + "learning_rate": 7.63474570470258e-05, + "loss": 0.8817, + "step": 18028 + }, + { + "epoch": 1.2215597262687174, + "grad_norm": 4.994833469390869, + "learning_rate": 7.6346088027928e-05, + "loss": 0.525, + "step": 18029 + }, + { + "epoch": 1.2216274815366894, + "grad_norm": 5.362818717956543, + "learning_rate": 7.634471900883018e-05, + "loss": 0.6027, + "step": 18030 + }, + { + "epoch": 1.2216952368046616, + "grad_norm": 6.882465362548828, + "learning_rate": 7.634334998973236e-05, + "loss": 0.476, + "step": 18031 + }, + { + "epoch": 1.2217629920726336, + "grad_norm": 8.69393253326416, + "learning_rate": 7.634198097063454e-05, + "loss": 0.6321, + "step": 18032 + }, + { + "epoch": 1.2218307473406058, + "grad_norm": 5.9786529541015625, + "learning_rate": 7.634061195153672e-05, + "loss": 0.7933, + "step": 18033 + }, + { + "epoch": 1.2218985026085778, + "grad_norm": 5.898993968963623, + "learning_rate": 7.633924293243891e-05, + "loss": 0.6235, + "step": 18034 + }, + { + "epoch": 1.22196625787655, + "grad_norm": 4.783423900604248, + "learning_rate": 7.633787391334109e-05, + "loss": 0.5224, + "step": 18035 + }, + { + "epoch": 1.222034013144522, + "grad_norm": 7.560459613800049, + "learning_rate": 7.633650489424327e-05, + "loss": 0.8104, + "step": 18036 + }, + { + "epoch": 1.222101768412494, + "grad_norm": 4.811002731323242, + "learning_rate": 7.633513587514545e-05, + "loss": 0.7236, + "step": 18037 + }, + { + "epoch": 1.2221695236804662, + "grad_norm": 6.19532585144043, + "learning_rate": 7.633376685604765e-05, + "loss": 0.8387, + "step": 18038 + }, + { + "epoch": 1.2222372789484384, + "grad_norm": 5.872559547424316, + "learning_rate": 7.633239783694983e-05, + "loss": 0.7077, + "step": 18039 + }, + { + "epoch": 1.2223050342164103, + "grad_norm": 4.4188232421875, + "learning_rate": 7.6331028817852e-05, + "loss": 0.5792, + "step": 18040 + }, + { + "epoch": 1.2223727894843823, + "grad_norm": 5.051517009735107, + "learning_rate": 7.63296597987542e-05, + "loss": 0.6725, + "step": 18041 + }, + { + "epoch": 1.2224405447523545, + "grad_norm": 5.528559684753418, + "learning_rate": 7.632829077965638e-05, + "loss": 0.8271, + "step": 18042 + }, + { + "epoch": 1.2225083000203265, + "grad_norm": 6.110957145690918, + "learning_rate": 7.632692176055856e-05, + "loss": 0.7396, + "step": 18043 + }, + { + "epoch": 1.2225760552882987, + "grad_norm": 9.123190879821777, + "learning_rate": 7.632555274146075e-05, + "loss": 0.636, + "step": 18044 + }, + { + "epoch": 1.2226438105562707, + "grad_norm": 6.206221103668213, + "learning_rate": 7.632418372236294e-05, + "loss": 0.645, + "step": 18045 + }, + { + "epoch": 1.222711565824243, + "grad_norm": 8.248867988586426, + "learning_rate": 7.632281470326512e-05, + "loss": 0.7355, + "step": 18046 + }, + { + "epoch": 1.2227793210922149, + "grad_norm": 5.231109619140625, + "learning_rate": 7.632144568416731e-05, + "loss": 0.5745, + "step": 18047 + }, + { + "epoch": 1.222847076360187, + "grad_norm": 5.008121967315674, + "learning_rate": 7.632007666506949e-05, + "loss": 0.6544, + "step": 18048 + }, + { + "epoch": 1.222914831628159, + "grad_norm": 5.843269348144531, + "learning_rate": 7.631870764597167e-05, + "loss": 0.6308, + "step": 18049 + }, + { + "epoch": 1.2229825868961313, + "grad_norm": 4.995326042175293, + "learning_rate": 7.631733862687385e-05, + "loss": 0.7161, + "step": 18050 + }, + { + "epoch": 1.2230503421641032, + "grad_norm": 5.701000213623047, + "learning_rate": 7.631596960777603e-05, + "loss": 0.6209, + "step": 18051 + }, + { + "epoch": 1.2231180974320752, + "grad_norm": 5.274313926696777, + "learning_rate": 7.631460058867822e-05, + "loss": 0.6137, + "step": 18052 + }, + { + "epoch": 1.2231858527000474, + "grad_norm": 4.914982318878174, + "learning_rate": 7.63132315695804e-05, + "loss": 0.7078, + "step": 18053 + }, + { + "epoch": 1.2232536079680196, + "grad_norm": 4.857698917388916, + "learning_rate": 7.631186255048259e-05, + "loss": 0.717, + "step": 18054 + }, + { + "epoch": 1.2233213632359916, + "grad_norm": 7.02639627456665, + "learning_rate": 7.631049353138477e-05, + "loss": 0.8204, + "step": 18055 + }, + { + "epoch": 1.2233891185039636, + "grad_norm": 7.193076133728027, + "learning_rate": 7.630912451228695e-05, + "loss": 0.6439, + "step": 18056 + }, + { + "epoch": 1.2234568737719358, + "grad_norm": 5.445677757263184, + "learning_rate": 7.630775549318914e-05, + "loss": 0.656, + "step": 18057 + }, + { + "epoch": 1.2235246290399078, + "grad_norm": 9.708181381225586, + "learning_rate": 7.630638647409132e-05, + "loss": 0.5955, + "step": 18058 + }, + { + "epoch": 1.22359238430788, + "grad_norm": 6.708062171936035, + "learning_rate": 7.63050174549935e-05, + "loss": 0.7779, + "step": 18059 + }, + { + "epoch": 1.223660139575852, + "grad_norm": 4.14343786239624, + "learning_rate": 7.630364843589568e-05, + "loss": 0.6535, + "step": 18060 + }, + { + "epoch": 1.2237278948438242, + "grad_norm": 5.566597938537598, + "learning_rate": 7.630227941679787e-05, + "loss": 0.6028, + "step": 18061 + }, + { + "epoch": 1.2237956501117961, + "grad_norm": 4.511200904846191, + "learning_rate": 7.630091039770006e-05, + "loss": 0.6257, + "step": 18062 + }, + { + "epoch": 1.2238634053797683, + "grad_norm": 5.272220611572266, + "learning_rate": 7.629954137860224e-05, + "loss": 0.7351, + "step": 18063 + }, + { + "epoch": 1.2239311606477403, + "grad_norm": 5.659876823425293, + "learning_rate": 7.629817235950442e-05, + "loss": 0.7477, + "step": 18064 + }, + { + "epoch": 1.2239989159157125, + "grad_norm": 7.069100379943848, + "learning_rate": 7.62968033404066e-05, + "loss": 0.73, + "step": 18065 + }, + { + "epoch": 1.2240666711836845, + "grad_norm": 5.5793304443359375, + "learning_rate": 7.629543432130879e-05, + "loss": 0.8045, + "step": 18066 + }, + { + "epoch": 1.2241344264516567, + "grad_norm": 5.751017093658447, + "learning_rate": 7.629406530221097e-05, + "loss": 0.6574, + "step": 18067 + }, + { + "epoch": 1.2242021817196287, + "grad_norm": 4.8076701164245605, + "learning_rate": 7.629269628311315e-05, + "loss": 0.6907, + "step": 18068 + }, + { + "epoch": 1.224269936987601, + "grad_norm": 7.757662773132324, + "learning_rate": 7.629132726401533e-05, + "loss": 0.7141, + "step": 18069 + }, + { + "epoch": 1.2243376922555729, + "grad_norm": 5.999941349029541, + "learning_rate": 7.628995824491753e-05, + "loss": 0.808, + "step": 18070 + }, + { + "epoch": 1.2244054475235449, + "grad_norm": 5.947088718414307, + "learning_rate": 7.62885892258197e-05, + "loss": 0.6211, + "step": 18071 + }, + { + "epoch": 1.224473202791517, + "grad_norm": 4.921745300292969, + "learning_rate": 7.628722020672189e-05, + "loss": 0.7553, + "step": 18072 + }, + { + "epoch": 1.224540958059489, + "grad_norm": 4.209071636199951, + "learning_rate": 7.628585118762407e-05, + "loss": 0.4931, + "step": 18073 + }, + { + "epoch": 1.2246087133274612, + "grad_norm": 4.7868170738220215, + "learning_rate": 7.628448216852625e-05, + "loss": 0.5719, + "step": 18074 + }, + { + "epoch": 1.2246764685954332, + "grad_norm": 5.3280158042907715, + "learning_rate": 7.628311314942844e-05, + "loss": 0.573, + "step": 18075 + }, + { + "epoch": 1.2247442238634054, + "grad_norm": 5.4629716873168945, + "learning_rate": 7.628174413033062e-05, + "loss": 0.5231, + "step": 18076 + }, + { + "epoch": 1.2248119791313774, + "grad_norm": 4.5787787437438965, + "learning_rate": 7.62803751112328e-05, + "loss": 0.5587, + "step": 18077 + }, + { + "epoch": 1.2248797343993496, + "grad_norm": 4.548202037811279, + "learning_rate": 7.627900609213498e-05, + "loss": 0.5094, + "step": 18078 + }, + { + "epoch": 1.2249474896673216, + "grad_norm": 6.067834377288818, + "learning_rate": 7.627763707303716e-05, + "loss": 0.6347, + "step": 18079 + }, + { + "epoch": 1.2250152449352938, + "grad_norm": 6.163285255432129, + "learning_rate": 7.627626805393936e-05, + "loss": 0.8042, + "step": 18080 + }, + { + "epoch": 1.2250830002032658, + "grad_norm": 6.52103853225708, + "learning_rate": 7.627489903484154e-05, + "loss": 0.7237, + "step": 18081 + }, + { + "epoch": 1.225150755471238, + "grad_norm": 5.5198893547058105, + "learning_rate": 7.627353001574372e-05, + "loss": 0.7413, + "step": 18082 + }, + { + "epoch": 1.22521851073921, + "grad_norm": 5.332125186920166, + "learning_rate": 7.62721609966459e-05, + "loss": 0.7945, + "step": 18083 + }, + { + "epoch": 1.2252862660071822, + "grad_norm": 6.892205238342285, + "learning_rate": 7.627079197754809e-05, + "loss": 0.7906, + "step": 18084 + }, + { + "epoch": 1.2253540212751541, + "grad_norm": 4.1680908203125, + "learning_rate": 7.626942295845027e-05, + "loss": 0.6408, + "step": 18085 + }, + { + "epoch": 1.2254217765431261, + "grad_norm": 8.176078796386719, + "learning_rate": 7.626805393935245e-05, + "loss": 0.6947, + "step": 18086 + }, + { + "epoch": 1.2254895318110983, + "grad_norm": 6.414756774902344, + "learning_rate": 7.626668492025465e-05, + "loss": 0.7691, + "step": 18087 + }, + { + "epoch": 1.2255572870790703, + "grad_norm": 5.7930474281311035, + "learning_rate": 7.626531590115683e-05, + "loss": 0.7177, + "step": 18088 + }, + { + "epoch": 1.2256250423470425, + "grad_norm": 5.2728271484375, + "learning_rate": 7.6263946882059e-05, + "loss": 0.6034, + "step": 18089 + }, + { + "epoch": 1.2256927976150145, + "grad_norm": 5.549907684326172, + "learning_rate": 7.62625778629612e-05, + "loss": 0.6401, + "step": 18090 + }, + { + "epoch": 1.2257605528829867, + "grad_norm": 6.938556671142578, + "learning_rate": 7.626120884386338e-05, + "loss": 0.5395, + "step": 18091 + }, + { + "epoch": 1.2258283081509587, + "grad_norm": 5.350551128387451, + "learning_rate": 7.625983982476556e-05, + "loss": 0.716, + "step": 18092 + }, + { + "epoch": 1.2258960634189309, + "grad_norm": 4.515890598297119, + "learning_rate": 7.625847080566775e-05, + "loss": 0.5628, + "step": 18093 + }, + { + "epoch": 1.2259638186869029, + "grad_norm": 5.260360240936279, + "learning_rate": 7.625710178656993e-05, + "loss": 0.7814, + "step": 18094 + }, + { + "epoch": 1.226031573954875, + "grad_norm": 4.987314224243164, + "learning_rate": 7.625573276747211e-05, + "loss": 0.9556, + "step": 18095 + }, + { + "epoch": 1.226099329222847, + "grad_norm": 6.4795241355896, + "learning_rate": 7.62543637483743e-05, + "loss": 0.7433, + "step": 18096 + }, + { + "epoch": 1.2261670844908192, + "grad_norm": 4.585717678070068, + "learning_rate": 7.625299472927648e-05, + "loss": 0.5516, + "step": 18097 + }, + { + "epoch": 1.2262348397587912, + "grad_norm": 5.807083606719971, + "learning_rate": 7.625162571017867e-05, + "loss": 1.0551, + "step": 18098 + }, + { + "epoch": 1.2263025950267634, + "grad_norm": 7.647436141967773, + "learning_rate": 7.625025669108085e-05, + "loss": 0.5785, + "step": 18099 + }, + { + "epoch": 1.2263703502947354, + "grad_norm": 5.377509117126465, + "learning_rate": 7.624888767198303e-05, + "loss": 0.5938, + "step": 18100 + }, + { + "epoch": 1.2264381055627074, + "grad_norm": 7.228111743927002, + "learning_rate": 7.624751865288521e-05, + "loss": 0.6299, + "step": 18101 + }, + { + "epoch": 1.2265058608306796, + "grad_norm": 6.495123386383057, + "learning_rate": 7.62461496337874e-05, + "loss": 0.7236, + "step": 18102 + }, + { + "epoch": 1.2265736160986518, + "grad_norm": 5.612201690673828, + "learning_rate": 7.624478061468958e-05, + "loss": 0.8444, + "step": 18103 + }, + { + "epoch": 1.2266413713666238, + "grad_norm": 6.513780117034912, + "learning_rate": 7.624341159559177e-05, + "loss": 0.8117, + "step": 18104 + }, + { + "epoch": 1.2267091266345957, + "grad_norm": 6.759029388427734, + "learning_rate": 7.624204257649395e-05, + "loss": 0.9207, + "step": 18105 + }, + { + "epoch": 1.226776881902568, + "grad_norm": 5.229630470275879, + "learning_rate": 7.624067355739613e-05, + "loss": 0.718, + "step": 18106 + }, + { + "epoch": 1.22684463717054, + "grad_norm": 5.184760570526123, + "learning_rate": 7.623930453829832e-05, + "loss": 0.5565, + "step": 18107 + }, + { + "epoch": 1.2269123924385121, + "grad_norm": 5.323483943939209, + "learning_rate": 7.62379355192005e-05, + "loss": 0.5649, + "step": 18108 + }, + { + "epoch": 1.2269801477064841, + "grad_norm": 5.713914394378662, + "learning_rate": 7.623656650010268e-05, + "loss": 1.0169, + "step": 18109 + }, + { + "epoch": 1.2270479029744563, + "grad_norm": 6.060333728790283, + "learning_rate": 7.623519748100486e-05, + "loss": 0.5474, + "step": 18110 + }, + { + "epoch": 1.2271156582424283, + "grad_norm": 7.35471248626709, + "learning_rate": 7.623382846190704e-05, + "loss": 0.9568, + "step": 18111 + }, + { + "epoch": 1.2271834135104005, + "grad_norm": 5.508981227874756, + "learning_rate": 7.623245944280923e-05, + "loss": 0.8809, + "step": 18112 + }, + { + "epoch": 1.2272511687783725, + "grad_norm": 6.507081031799316, + "learning_rate": 7.623109042371142e-05, + "loss": 0.6141, + "step": 18113 + }, + { + "epoch": 1.2273189240463447, + "grad_norm": 4.551576614379883, + "learning_rate": 7.62297214046136e-05, + "loss": 0.5581, + "step": 18114 + }, + { + "epoch": 1.2273866793143167, + "grad_norm": 5.739194393157959, + "learning_rate": 7.622835238551578e-05, + "loss": 0.563, + "step": 18115 + }, + { + "epoch": 1.2274544345822889, + "grad_norm": 7.448077201843262, + "learning_rate": 7.622698336641797e-05, + "loss": 0.8644, + "step": 18116 + }, + { + "epoch": 1.2275221898502608, + "grad_norm": 4.7695770263671875, + "learning_rate": 7.622561434732015e-05, + "loss": 0.6372, + "step": 18117 + }, + { + "epoch": 1.227589945118233, + "grad_norm": 5.259091854095459, + "learning_rate": 7.622424532822233e-05, + "loss": 0.7068, + "step": 18118 + }, + { + "epoch": 1.227657700386205, + "grad_norm": 4.840086936950684, + "learning_rate": 7.622287630912451e-05, + "loss": 0.668, + "step": 18119 + }, + { + "epoch": 1.227725455654177, + "grad_norm": 7.2845282554626465, + "learning_rate": 7.622150729002669e-05, + "loss": 0.5841, + "step": 18120 + }, + { + "epoch": 1.2277932109221492, + "grad_norm": 5.2438249588012695, + "learning_rate": 7.622013827092889e-05, + "loss": 0.5784, + "step": 18121 + }, + { + "epoch": 1.2278609661901212, + "grad_norm": 5.425882816314697, + "learning_rate": 7.621876925183107e-05, + "loss": 0.4798, + "step": 18122 + }, + { + "epoch": 1.2279287214580934, + "grad_norm": 4.423764705657959, + "learning_rate": 7.621740023273325e-05, + "loss": 0.5684, + "step": 18123 + }, + { + "epoch": 1.2279964767260654, + "grad_norm": 6.794591903686523, + "learning_rate": 7.621603121363543e-05, + "loss": 0.3746, + "step": 18124 + }, + { + "epoch": 1.2280642319940376, + "grad_norm": 4.884415626525879, + "learning_rate": 7.621466219453762e-05, + "loss": 0.6326, + "step": 18125 + }, + { + "epoch": 1.2281319872620096, + "grad_norm": 4.998366832733154, + "learning_rate": 7.62132931754398e-05, + "loss": 0.6967, + "step": 18126 + }, + { + "epoch": 1.2281997425299818, + "grad_norm": 5.243693828582764, + "learning_rate": 7.621192415634198e-05, + "loss": 0.8389, + "step": 18127 + }, + { + "epoch": 1.2282674977979537, + "grad_norm": 5.258869171142578, + "learning_rate": 7.621055513724416e-05, + "loss": 0.6824, + "step": 18128 + }, + { + "epoch": 1.228335253065926, + "grad_norm": 6.289614677429199, + "learning_rate": 7.620918611814634e-05, + "loss": 0.9113, + "step": 18129 + }, + { + "epoch": 1.228403008333898, + "grad_norm": 4.787515163421631, + "learning_rate": 7.620781709904854e-05, + "loss": 0.6163, + "step": 18130 + }, + { + "epoch": 1.2284707636018701, + "grad_norm": 6.306053161621094, + "learning_rate": 7.620644807995072e-05, + "loss": 0.6851, + "step": 18131 + }, + { + "epoch": 1.2285385188698421, + "grad_norm": 5.287718772888184, + "learning_rate": 7.62050790608529e-05, + "loss": 0.4938, + "step": 18132 + }, + { + "epoch": 1.2286062741378143, + "grad_norm": 7.48237943649292, + "learning_rate": 7.620371004175509e-05, + "loss": 0.7323, + "step": 18133 + }, + { + "epoch": 1.2286740294057863, + "grad_norm": 5.759404182434082, + "learning_rate": 7.620234102265727e-05, + "loss": 0.6837, + "step": 18134 + }, + { + "epoch": 1.2287417846737583, + "grad_norm": 6.255051136016846, + "learning_rate": 7.620097200355945e-05, + "loss": 0.7534, + "step": 18135 + }, + { + "epoch": 1.2288095399417305, + "grad_norm": 6.460224628448486, + "learning_rate": 7.619960298446164e-05, + "loss": 0.813, + "step": 18136 + }, + { + "epoch": 1.2288772952097025, + "grad_norm": 6.245341777801514, + "learning_rate": 7.619823396536382e-05, + "loss": 0.8548, + "step": 18137 + }, + { + "epoch": 1.2289450504776747, + "grad_norm": 7.165752410888672, + "learning_rate": 7.6196864946266e-05, + "loss": 0.7247, + "step": 18138 + }, + { + "epoch": 1.2290128057456466, + "grad_norm": 5.944732666015625, + "learning_rate": 7.61954959271682e-05, + "loss": 0.7436, + "step": 18139 + }, + { + "epoch": 1.2290805610136188, + "grad_norm": 5.928808212280273, + "learning_rate": 7.619412690807038e-05, + "loss": 0.6332, + "step": 18140 + }, + { + "epoch": 1.2291483162815908, + "grad_norm": 7.637009143829346, + "learning_rate": 7.619275788897256e-05, + "loss": 0.6698, + "step": 18141 + }, + { + "epoch": 1.229216071549563, + "grad_norm": 6.397332668304443, + "learning_rate": 7.619138886987474e-05, + "loss": 0.5942, + "step": 18142 + }, + { + "epoch": 1.229283826817535, + "grad_norm": 7.642723560333252, + "learning_rate": 7.619001985077692e-05, + "loss": 0.6061, + "step": 18143 + }, + { + "epoch": 1.2293515820855072, + "grad_norm": 4.819983959197998, + "learning_rate": 7.618865083167911e-05, + "loss": 0.6466, + "step": 18144 + }, + { + "epoch": 1.2294193373534792, + "grad_norm": 4.973183631896973, + "learning_rate": 7.61872818125813e-05, + "loss": 0.5931, + "step": 18145 + }, + { + "epoch": 1.2294870926214514, + "grad_norm": 5.909261703491211, + "learning_rate": 7.618591279348347e-05, + "loss": 0.5146, + "step": 18146 + }, + { + "epoch": 1.2295548478894234, + "grad_norm": 4.776844501495361, + "learning_rate": 7.618454377438566e-05, + "loss": 0.5519, + "step": 18147 + }, + { + "epoch": 1.2296226031573956, + "grad_norm": 4.408102512359619, + "learning_rate": 7.618317475528785e-05, + "loss": 0.488, + "step": 18148 + }, + { + "epoch": 1.2296903584253676, + "grad_norm": 4.527730464935303, + "learning_rate": 7.618180573619003e-05, + "loss": 0.6212, + "step": 18149 + }, + { + "epoch": 1.2297581136933395, + "grad_norm": 5.151172161102295, + "learning_rate": 7.618043671709221e-05, + "loss": 0.751, + "step": 18150 + }, + { + "epoch": 1.2298258689613117, + "grad_norm": 4.503907680511475, + "learning_rate": 7.617906769799439e-05, + "loss": 0.675, + "step": 18151 + }, + { + "epoch": 1.229893624229284, + "grad_norm": 6.000171661376953, + "learning_rate": 7.617769867889657e-05, + "loss": 0.5313, + "step": 18152 + }, + { + "epoch": 1.229961379497256, + "grad_norm": 5.874682903289795, + "learning_rate": 7.617632965979876e-05, + "loss": 0.8739, + "step": 18153 + }, + { + "epoch": 1.230029134765228, + "grad_norm": 5.422181129455566, + "learning_rate": 7.617496064070094e-05, + "loss": 0.7197, + "step": 18154 + }, + { + "epoch": 1.2300968900332, + "grad_norm": 5.414138317108154, + "learning_rate": 7.617359162160313e-05, + "loss": 0.6733, + "step": 18155 + }, + { + "epoch": 1.230164645301172, + "grad_norm": 6.205505847930908, + "learning_rate": 7.61722226025053e-05, + "loss": 0.8054, + "step": 18156 + }, + { + "epoch": 1.2302324005691443, + "grad_norm": 7.225019454956055, + "learning_rate": 7.61708535834075e-05, + "loss": 0.6045, + "step": 18157 + }, + { + "epoch": 1.2303001558371163, + "grad_norm": 5.018001079559326, + "learning_rate": 7.616948456430968e-05, + "loss": 0.573, + "step": 18158 + }, + { + "epoch": 1.2303679111050885, + "grad_norm": 5.88112735748291, + "learning_rate": 7.616811554521186e-05, + "loss": 0.7554, + "step": 18159 + }, + { + "epoch": 1.2304356663730605, + "grad_norm": 7.625991344451904, + "learning_rate": 7.616674652611404e-05, + "loss": 0.9717, + "step": 18160 + }, + { + "epoch": 1.2305034216410327, + "grad_norm": 5.763574123382568, + "learning_rate": 7.616537750701622e-05, + "loss": 0.9212, + "step": 18161 + }, + { + "epoch": 1.2305711769090046, + "grad_norm": 4.682827472686768, + "learning_rate": 7.616400848791841e-05, + "loss": 0.4873, + "step": 18162 + }, + { + "epoch": 1.2306389321769768, + "grad_norm": 5.930944442749023, + "learning_rate": 7.61626394688206e-05, + "loss": 0.6912, + "step": 18163 + }, + { + "epoch": 1.2307066874449488, + "grad_norm": 6.0003814697265625, + "learning_rate": 7.616127044972278e-05, + "loss": 0.6373, + "step": 18164 + }, + { + "epoch": 1.230774442712921, + "grad_norm": 6.296213626861572, + "learning_rate": 7.615990143062496e-05, + "loss": 0.7201, + "step": 18165 + }, + { + "epoch": 1.230842197980893, + "grad_norm": 5.7189249992370605, + "learning_rate": 7.615853241152714e-05, + "loss": 0.7472, + "step": 18166 + }, + { + "epoch": 1.2309099532488652, + "grad_norm": 6.441144943237305, + "learning_rate": 7.615716339242933e-05, + "loss": 0.9438, + "step": 18167 + }, + { + "epoch": 1.2309777085168372, + "grad_norm": 6.301687717437744, + "learning_rate": 7.615579437333151e-05, + "loss": 0.7934, + "step": 18168 + }, + { + "epoch": 1.2310454637848092, + "grad_norm": 5.537693023681641, + "learning_rate": 7.615442535423369e-05, + "loss": 0.7587, + "step": 18169 + }, + { + "epoch": 1.2311132190527814, + "grad_norm": 9.502969741821289, + "learning_rate": 7.615305633513587e-05, + "loss": 0.6527, + "step": 18170 + }, + { + "epoch": 1.2311809743207534, + "grad_norm": 5.102190971374512, + "learning_rate": 7.615168731603806e-05, + "loss": 0.6343, + "step": 18171 + }, + { + "epoch": 1.2312487295887256, + "grad_norm": 4.21937370300293, + "learning_rate": 7.615031829694025e-05, + "loss": 0.4586, + "step": 18172 + }, + { + "epoch": 1.2313164848566975, + "grad_norm": 6.393093585968018, + "learning_rate": 7.614894927784243e-05, + "loss": 0.7663, + "step": 18173 + }, + { + "epoch": 1.2313842401246697, + "grad_norm": 4.278507232666016, + "learning_rate": 7.61475802587446e-05, + "loss": 0.5946, + "step": 18174 + }, + { + "epoch": 1.2314519953926417, + "grad_norm": 7.228772163391113, + "learning_rate": 7.614621123964679e-05, + "loss": 0.8254, + "step": 18175 + }, + { + "epoch": 1.231519750660614, + "grad_norm": 7.127139091491699, + "learning_rate": 7.614484222054898e-05, + "loss": 0.9439, + "step": 18176 + }, + { + "epoch": 1.231587505928586, + "grad_norm": 5.133269786834717, + "learning_rate": 7.614347320145116e-05, + "loss": 0.5107, + "step": 18177 + }, + { + "epoch": 1.231655261196558, + "grad_norm": 7.366678714752197, + "learning_rate": 7.614210418235334e-05, + "loss": 0.7128, + "step": 18178 + }, + { + "epoch": 1.23172301646453, + "grad_norm": 5.290058135986328, + "learning_rate": 7.614073516325552e-05, + "loss": 0.7522, + "step": 18179 + }, + { + "epoch": 1.2317907717325023, + "grad_norm": 8.452763557434082, + "learning_rate": 7.613936614415771e-05, + "loss": 0.6214, + "step": 18180 + }, + { + "epoch": 1.2318585270004743, + "grad_norm": 5.031332492828369, + "learning_rate": 7.61379971250599e-05, + "loss": 0.6009, + "step": 18181 + }, + { + "epoch": 1.2319262822684465, + "grad_norm": 4.337656497955322, + "learning_rate": 7.613662810596208e-05, + "loss": 0.4222, + "step": 18182 + }, + { + "epoch": 1.2319940375364185, + "grad_norm": 7.403090000152588, + "learning_rate": 7.613525908686427e-05, + "loss": 0.6843, + "step": 18183 + }, + { + "epoch": 1.2320617928043904, + "grad_norm": 5.537214279174805, + "learning_rate": 7.613389006776645e-05, + "loss": 0.6674, + "step": 18184 + }, + { + "epoch": 1.2321295480723626, + "grad_norm": 5.237438201904297, + "learning_rate": 7.613252104866864e-05, + "loss": 0.7471, + "step": 18185 + }, + { + "epoch": 1.2321973033403346, + "grad_norm": 4.745239734649658, + "learning_rate": 7.613115202957082e-05, + "loss": 0.5399, + "step": 18186 + }, + { + "epoch": 1.2322650586083068, + "grad_norm": 5.67287015914917, + "learning_rate": 7.6129783010473e-05, + "loss": 0.6627, + "step": 18187 + }, + { + "epoch": 1.2323328138762788, + "grad_norm": 6.220743179321289, + "learning_rate": 7.612841399137518e-05, + "loss": 0.6627, + "step": 18188 + }, + { + "epoch": 1.232400569144251, + "grad_norm": 7.604142189025879, + "learning_rate": 7.612704497227737e-05, + "loss": 0.6789, + "step": 18189 + }, + { + "epoch": 1.232468324412223, + "grad_norm": 5.372503280639648, + "learning_rate": 7.612567595317956e-05, + "loss": 0.7404, + "step": 18190 + }, + { + "epoch": 1.2325360796801952, + "grad_norm": 5.212665557861328, + "learning_rate": 7.612430693408174e-05, + "loss": 0.6296, + "step": 18191 + }, + { + "epoch": 1.2326038349481672, + "grad_norm": 7.466536521911621, + "learning_rate": 7.612293791498392e-05, + "loss": 0.7127, + "step": 18192 + }, + { + "epoch": 1.2326715902161394, + "grad_norm": 6.254476070404053, + "learning_rate": 7.61215688958861e-05, + "loss": 0.849, + "step": 18193 + }, + { + "epoch": 1.2327393454841113, + "grad_norm": 7.161530494689941, + "learning_rate": 7.61201998767883e-05, + "loss": 0.4574, + "step": 18194 + }, + { + "epoch": 1.2328071007520836, + "grad_norm": 7.048676013946533, + "learning_rate": 7.611883085769047e-05, + "loss": 0.537, + "step": 18195 + }, + { + "epoch": 1.2328748560200555, + "grad_norm": 4.868967533111572, + "learning_rate": 7.611746183859265e-05, + "loss": 0.6313, + "step": 18196 + }, + { + "epoch": 1.2329426112880277, + "grad_norm": 4.794018745422363, + "learning_rate": 7.611609281949483e-05, + "loss": 0.6326, + "step": 18197 + }, + { + "epoch": 1.2330103665559997, + "grad_norm": 7.3240556716918945, + "learning_rate": 7.611472380039702e-05, + "loss": 0.6995, + "step": 18198 + }, + { + "epoch": 1.2330781218239717, + "grad_norm": 6.228128910064697, + "learning_rate": 7.611335478129921e-05, + "loss": 0.5818, + "step": 18199 + }, + { + "epoch": 1.233145877091944, + "grad_norm": 9.069717407226562, + "learning_rate": 7.611198576220139e-05, + "loss": 0.718, + "step": 18200 + }, + { + "epoch": 1.233213632359916, + "grad_norm": 5.857358932495117, + "learning_rate": 7.611061674310357e-05, + "loss": 0.7181, + "step": 18201 + }, + { + "epoch": 1.233281387627888, + "grad_norm": 9.045475959777832, + "learning_rate": 7.610924772400575e-05, + "loss": 0.7415, + "step": 18202 + }, + { + "epoch": 1.23334914289586, + "grad_norm": 6.362603664398193, + "learning_rate": 7.610787870490794e-05, + "loss": 0.6241, + "step": 18203 + }, + { + "epoch": 1.2334168981638323, + "grad_norm": 7.243282318115234, + "learning_rate": 7.610650968581012e-05, + "loss": 0.7114, + "step": 18204 + }, + { + "epoch": 1.2334846534318042, + "grad_norm": 5.471545219421387, + "learning_rate": 7.61051406667123e-05, + "loss": 0.8035, + "step": 18205 + }, + { + "epoch": 1.2335524086997764, + "grad_norm": 6.288739204406738, + "learning_rate": 7.610377164761449e-05, + "loss": 0.7507, + "step": 18206 + }, + { + "epoch": 1.2336201639677484, + "grad_norm": 6.829304218292236, + "learning_rate": 7.610240262851667e-05, + "loss": 0.7863, + "step": 18207 + }, + { + "epoch": 1.2336879192357206, + "grad_norm": 5.105554580688477, + "learning_rate": 7.610103360941886e-05, + "loss": 0.7707, + "step": 18208 + }, + { + "epoch": 1.2337556745036926, + "grad_norm": 7.09926176071167, + "learning_rate": 7.609966459032104e-05, + "loss": 0.8251, + "step": 18209 + }, + { + "epoch": 1.2338234297716648, + "grad_norm": 4.862083911895752, + "learning_rate": 7.609829557122322e-05, + "loss": 0.5155, + "step": 18210 + }, + { + "epoch": 1.2338911850396368, + "grad_norm": 5.866125583648682, + "learning_rate": 7.60969265521254e-05, + "loss": 0.6442, + "step": 18211 + }, + { + "epoch": 1.233958940307609, + "grad_norm": 6.490697860717773, + "learning_rate": 7.609555753302758e-05, + "loss": 0.7055, + "step": 18212 + }, + { + "epoch": 1.234026695575581, + "grad_norm": 6.739508152008057, + "learning_rate": 7.609418851392977e-05, + "loss": 0.8067, + "step": 18213 + }, + { + "epoch": 1.2340944508435532, + "grad_norm": 7.74035120010376, + "learning_rate": 7.609281949483195e-05, + "loss": 0.8372, + "step": 18214 + }, + { + "epoch": 1.2341622061115252, + "grad_norm": 6.445488929748535, + "learning_rate": 7.609145047573414e-05, + "loss": 0.6183, + "step": 18215 + }, + { + "epoch": 1.2342299613794974, + "grad_norm": 6.993857383728027, + "learning_rate": 7.609008145663632e-05, + "loss": 0.6401, + "step": 18216 + }, + { + "epoch": 1.2342977166474693, + "grad_norm": 5.35559606552124, + "learning_rate": 7.608871243753851e-05, + "loss": 0.5708, + "step": 18217 + }, + { + "epoch": 1.2343654719154413, + "grad_norm": 8.084784507751465, + "learning_rate": 7.608734341844069e-05, + "loss": 0.5623, + "step": 18218 + }, + { + "epoch": 1.2344332271834135, + "grad_norm": 7.187109470367432, + "learning_rate": 7.608597439934287e-05, + "loss": 0.7409, + "step": 18219 + }, + { + "epoch": 1.2345009824513855, + "grad_norm": 6.2048563957214355, + "learning_rate": 7.608460538024505e-05, + "loss": 0.7278, + "step": 18220 + }, + { + "epoch": 1.2345687377193577, + "grad_norm": 8.183573722839355, + "learning_rate": 7.608323636114723e-05, + "loss": 0.8511, + "step": 18221 + }, + { + "epoch": 1.2346364929873297, + "grad_norm": 8.200891494750977, + "learning_rate": 7.608186734204942e-05, + "loss": 1.0603, + "step": 18222 + }, + { + "epoch": 1.234704248255302, + "grad_norm": 4.761917591094971, + "learning_rate": 7.60804983229516e-05, + "loss": 0.7157, + "step": 18223 + }, + { + "epoch": 1.2347720035232739, + "grad_norm": 8.363795280456543, + "learning_rate": 7.607912930385379e-05, + "loss": 0.6852, + "step": 18224 + }, + { + "epoch": 1.234839758791246, + "grad_norm": 5.754380702972412, + "learning_rate": 7.607776028475597e-05, + "loss": 0.6346, + "step": 18225 + }, + { + "epoch": 1.234907514059218, + "grad_norm": 5.477109909057617, + "learning_rate": 7.607639126565816e-05, + "loss": 0.7659, + "step": 18226 + }, + { + "epoch": 1.2349752693271903, + "grad_norm": 7.943727493286133, + "learning_rate": 7.607502224656034e-05, + "loss": 0.63, + "step": 18227 + }, + { + "epoch": 1.2350430245951622, + "grad_norm": 4.878358364105225, + "learning_rate": 7.607365322746252e-05, + "loss": 0.6735, + "step": 18228 + }, + { + "epoch": 1.2351107798631344, + "grad_norm": 5.543867111206055, + "learning_rate": 7.607228420836471e-05, + "loss": 0.689, + "step": 18229 + }, + { + "epoch": 1.2351785351311064, + "grad_norm": 5.050788402557373, + "learning_rate": 7.60709151892669e-05, + "loss": 0.7141, + "step": 18230 + }, + { + "epoch": 1.2352462903990786, + "grad_norm": 11.664437294006348, + "learning_rate": 7.606954617016907e-05, + "loss": 0.5473, + "step": 18231 + }, + { + "epoch": 1.2353140456670506, + "grad_norm": 4.634623050689697, + "learning_rate": 7.606817715107127e-05, + "loss": 0.7636, + "step": 18232 + }, + { + "epoch": 1.2353818009350226, + "grad_norm": 7.5991339683532715, + "learning_rate": 7.606680813197345e-05, + "loss": 0.6636, + "step": 18233 + }, + { + "epoch": 1.2354495562029948, + "grad_norm": 5.919665336608887, + "learning_rate": 7.606543911287563e-05, + "loss": 0.6424, + "step": 18234 + }, + { + "epoch": 1.2355173114709668, + "grad_norm": 6.050735950469971, + "learning_rate": 7.606407009377782e-05, + "loss": 0.7623, + "step": 18235 + }, + { + "epoch": 1.235585066738939, + "grad_norm": 5.501585483551025, + "learning_rate": 7.606270107468e-05, + "loss": 0.5839, + "step": 18236 + }, + { + "epoch": 1.235652822006911, + "grad_norm": 5.354194164276123, + "learning_rate": 7.606133205558218e-05, + "loss": 0.8305, + "step": 18237 + }, + { + "epoch": 1.2357205772748832, + "grad_norm": 4.940706729888916, + "learning_rate": 7.605996303648436e-05, + "loss": 0.6149, + "step": 18238 + }, + { + "epoch": 1.2357883325428551, + "grad_norm": 7.159313678741455, + "learning_rate": 7.605859401738654e-05, + "loss": 0.7691, + "step": 18239 + }, + { + "epoch": 1.2358560878108273, + "grad_norm": 6.895895481109619, + "learning_rate": 7.605722499828874e-05, + "loss": 0.6597, + "step": 18240 + }, + { + "epoch": 1.2359238430787993, + "grad_norm": 6.15907621383667, + "learning_rate": 7.605585597919092e-05, + "loss": 0.7909, + "step": 18241 + }, + { + "epoch": 1.2359915983467715, + "grad_norm": 7.679660797119141, + "learning_rate": 7.60544869600931e-05, + "loss": 0.8247, + "step": 18242 + }, + { + "epoch": 1.2360593536147435, + "grad_norm": 4.731861114501953, + "learning_rate": 7.605311794099528e-05, + "loss": 0.7492, + "step": 18243 + }, + { + "epoch": 1.2361271088827157, + "grad_norm": 5.705376625061035, + "learning_rate": 7.605174892189746e-05, + "loss": 0.7343, + "step": 18244 + }, + { + "epoch": 1.2361948641506877, + "grad_norm": 7.357006072998047, + "learning_rate": 7.605037990279965e-05, + "loss": 0.6074, + "step": 18245 + }, + { + "epoch": 1.23626261941866, + "grad_norm": 7.494833469390869, + "learning_rate": 7.604901088370183e-05, + "loss": 0.7074, + "step": 18246 + }, + { + "epoch": 1.2363303746866319, + "grad_norm": 5.229856967926025, + "learning_rate": 7.604764186460401e-05, + "loss": 0.7422, + "step": 18247 + }, + { + "epoch": 1.2363981299546039, + "grad_norm": 4.943398475646973, + "learning_rate": 7.60462728455062e-05, + "loss": 0.5172, + "step": 18248 + }, + { + "epoch": 1.236465885222576, + "grad_norm": 6.097332954406738, + "learning_rate": 7.604490382640839e-05, + "loss": 0.7787, + "step": 18249 + }, + { + "epoch": 1.2365336404905483, + "grad_norm": 5.074896812438965, + "learning_rate": 7.604353480731057e-05, + "loss": 0.631, + "step": 18250 + }, + { + "epoch": 1.2366013957585202, + "grad_norm": 6.022866725921631, + "learning_rate": 7.604216578821275e-05, + "loss": 0.9236, + "step": 18251 + }, + { + "epoch": 1.2366691510264922, + "grad_norm": 5.703517436981201, + "learning_rate": 7.604079676911493e-05, + "loss": 0.66, + "step": 18252 + }, + { + "epoch": 1.2367369062944644, + "grad_norm": 4.512523651123047, + "learning_rate": 7.603942775001711e-05, + "loss": 0.7138, + "step": 18253 + }, + { + "epoch": 1.2368046615624364, + "grad_norm": 4.882974624633789, + "learning_rate": 7.60380587309193e-05, + "loss": 0.7239, + "step": 18254 + }, + { + "epoch": 1.2368724168304086, + "grad_norm": 5.847234725952148, + "learning_rate": 7.603668971182148e-05, + "loss": 0.699, + "step": 18255 + }, + { + "epoch": 1.2369401720983806, + "grad_norm": 7.679762840270996, + "learning_rate": 7.603532069272366e-05, + "loss": 0.7491, + "step": 18256 + }, + { + "epoch": 1.2370079273663528, + "grad_norm": 5.321573257446289, + "learning_rate": 7.603395167362585e-05, + "loss": 0.4653, + "step": 18257 + }, + { + "epoch": 1.2370756826343248, + "grad_norm": 5.447080612182617, + "learning_rate": 7.603258265452804e-05, + "loss": 0.6317, + "step": 18258 + }, + { + "epoch": 1.237143437902297, + "grad_norm": 7.135063648223877, + "learning_rate": 7.603121363543022e-05, + "loss": 0.7021, + "step": 18259 + }, + { + "epoch": 1.237211193170269, + "grad_norm": 4.839768886566162, + "learning_rate": 7.60298446163324e-05, + "loss": 0.5687, + "step": 18260 + }, + { + "epoch": 1.2372789484382412, + "grad_norm": 7.4310078620910645, + "learning_rate": 7.602847559723458e-05, + "loss": 0.811, + "step": 18261 + }, + { + "epoch": 1.2373467037062131, + "grad_norm": 8.49348258972168, + "learning_rate": 7.602710657813676e-05, + "loss": 0.8388, + "step": 18262 + }, + { + "epoch": 1.2374144589741853, + "grad_norm": 7.248007297515869, + "learning_rate": 7.602573755903895e-05, + "loss": 0.6113, + "step": 18263 + }, + { + "epoch": 1.2374822142421573, + "grad_norm": 4.8018598556518555, + "learning_rate": 7.602436853994113e-05, + "loss": 0.5807, + "step": 18264 + }, + { + "epoch": 1.2375499695101295, + "grad_norm": 6.000351428985596, + "learning_rate": 7.602299952084331e-05, + "loss": 0.7009, + "step": 18265 + }, + { + "epoch": 1.2376177247781015, + "grad_norm": 6.559912204742432, + "learning_rate": 7.60216305017455e-05, + "loss": 0.6852, + "step": 18266 + }, + { + "epoch": 1.2376854800460735, + "grad_norm": 6.837792873382568, + "learning_rate": 7.602026148264768e-05, + "loss": 1.0492, + "step": 18267 + }, + { + "epoch": 1.2377532353140457, + "grad_norm": 4.829981803894043, + "learning_rate": 7.601889246354987e-05, + "loss": 0.566, + "step": 18268 + }, + { + "epoch": 1.2378209905820177, + "grad_norm": 7.107032299041748, + "learning_rate": 7.601752344445205e-05, + "loss": 0.6397, + "step": 18269 + }, + { + "epoch": 1.2378887458499899, + "grad_norm": 6.705806255340576, + "learning_rate": 7.601615442535423e-05, + "loss": 0.7072, + "step": 18270 + }, + { + "epoch": 1.2379565011179618, + "grad_norm": 7.603937149047852, + "learning_rate": 7.601478540625641e-05, + "loss": 0.7151, + "step": 18271 + }, + { + "epoch": 1.238024256385934, + "grad_norm": 8.943765640258789, + "learning_rate": 7.60134163871586e-05, + "loss": 0.5127, + "step": 18272 + }, + { + "epoch": 1.238092011653906, + "grad_norm": 5.192441463470459, + "learning_rate": 7.601204736806078e-05, + "loss": 0.5709, + "step": 18273 + }, + { + "epoch": 1.2381597669218782, + "grad_norm": 7.020391941070557, + "learning_rate": 7.601067834896297e-05, + "loss": 0.7772, + "step": 18274 + }, + { + "epoch": 1.2382275221898502, + "grad_norm": 7.921938419342041, + "learning_rate": 7.600930932986516e-05, + "loss": 0.5601, + "step": 18275 + }, + { + "epoch": 1.2382952774578224, + "grad_norm": 5.29582405090332, + "learning_rate": 7.600794031076734e-05, + "loss": 0.7777, + "step": 18276 + }, + { + "epoch": 1.2383630327257944, + "grad_norm": 6.641912937164307, + "learning_rate": 7.600657129166952e-05, + "loss": 0.9943, + "step": 18277 + }, + { + "epoch": 1.2384307879937666, + "grad_norm": 5.117624282836914, + "learning_rate": 7.600520227257171e-05, + "loss": 0.6456, + "step": 18278 + }, + { + "epoch": 1.2384985432617386, + "grad_norm": 7.091884136199951, + "learning_rate": 7.60038332534739e-05, + "loss": 0.7467, + "step": 18279 + }, + { + "epoch": 1.2385662985297108, + "grad_norm": 7.4439005851745605, + "learning_rate": 7.600246423437607e-05, + "loss": 0.6892, + "step": 18280 + }, + { + "epoch": 1.2386340537976828, + "grad_norm": 5.143298149108887, + "learning_rate": 7.600109521527827e-05, + "loss": 0.6211, + "step": 18281 + }, + { + "epoch": 1.2387018090656547, + "grad_norm": 4.91877555847168, + "learning_rate": 7.599972619618045e-05, + "loss": 0.635, + "step": 18282 + }, + { + "epoch": 1.238769564333627, + "grad_norm": 6.771152496337891, + "learning_rate": 7.599835717708263e-05, + "loss": 0.8141, + "step": 18283 + }, + { + "epoch": 1.238837319601599, + "grad_norm": 6.854776859283447, + "learning_rate": 7.599698815798481e-05, + "loss": 0.7984, + "step": 18284 + }, + { + "epoch": 1.2389050748695711, + "grad_norm": 5.357391357421875, + "learning_rate": 7.599561913888699e-05, + "loss": 0.6077, + "step": 18285 + }, + { + "epoch": 1.2389728301375431, + "grad_norm": 7.865238189697266, + "learning_rate": 7.599425011978918e-05, + "loss": 0.8676, + "step": 18286 + }, + { + "epoch": 1.2390405854055153, + "grad_norm": 6.712452411651611, + "learning_rate": 7.599288110069136e-05, + "loss": 0.9323, + "step": 18287 + }, + { + "epoch": 1.2391083406734873, + "grad_norm": 5.430103778839111, + "learning_rate": 7.599151208159354e-05, + "loss": 0.5833, + "step": 18288 + }, + { + "epoch": 1.2391760959414595, + "grad_norm": 5.1226959228515625, + "learning_rate": 7.599014306249572e-05, + "loss": 0.5945, + "step": 18289 + }, + { + "epoch": 1.2392438512094315, + "grad_norm": 5.239380836486816, + "learning_rate": 7.598877404339792e-05, + "loss": 0.5955, + "step": 18290 + }, + { + "epoch": 1.2393116064774037, + "grad_norm": 5.962562561035156, + "learning_rate": 7.59874050243001e-05, + "loss": 0.5801, + "step": 18291 + }, + { + "epoch": 1.2393793617453757, + "grad_norm": 5.283496856689453, + "learning_rate": 7.598603600520228e-05, + "loss": 0.7513, + "step": 18292 + }, + { + "epoch": 1.2394471170133479, + "grad_norm": 5.144606113433838, + "learning_rate": 7.598466698610446e-05, + "loss": 0.6134, + "step": 18293 + }, + { + "epoch": 1.2395148722813198, + "grad_norm": 5.917706489562988, + "learning_rate": 7.598329796700664e-05, + "loss": 0.6122, + "step": 18294 + }, + { + "epoch": 1.239582627549292, + "grad_norm": 6.156318187713623, + "learning_rate": 7.598192894790883e-05, + "loss": 0.7378, + "step": 18295 + }, + { + "epoch": 1.239650382817264, + "grad_norm": 5.589748859405518, + "learning_rate": 7.598055992881101e-05, + "loss": 0.7661, + "step": 18296 + }, + { + "epoch": 1.239718138085236, + "grad_norm": 5.590287685394287, + "learning_rate": 7.59791909097132e-05, + "loss": 0.6075, + "step": 18297 + }, + { + "epoch": 1.2397858933532082, + "grad_norm": 5.150818824768066, + "learning_rate": 7.597782189061537e-05, + "loss": 0.5828, + "step": 18298 + }, + { + "epoch": 1.2398536486211804, + "grad_norm": 6.356131553649902, + "learning_rate": 7.597645287151755e-05, + "loss": 0.735, + "step": 18299 + }, + { + "epoch": 1.2399214038891524, + "grad_norm": 6.432243824005127, + "learning_rate": 7.597508385241975e-05, + "loss": 0.7147, + "step": 18300 + }, + { + "epoch": 1.2399891591571244, + "grad_norm": 6.962221622467041, + "learning_rate": 7.597371483332193e-05, + "loss": 0.6691, + "step": 18301 + }, + { + "epoch": 1.2400569144250966, + "grad_norm": 5.084456443786621, + "learning_rate": 7.597234581422411e-05, + "loss": 0.5687, + "step": 18302 + }, + { + "epoch": 1.2401246696930686, + "grad_norm": 5.874925136566162, + "learning_rate": 7.597097679512629e-05, + "loss": 0.7533, + "step": 18303 + }, + { + "epoch": 1.2401924249610408, + "grad_norm": 7.665027618408203, + "learning_rate": 7.596960777602848e-05, + "loss": 0.706, + "step": 18304 + }, + { + "epoch": 1.2402601802290127, + "grad_norm": 6.6125030517578125, + "learning_rate": 7.596823875693066e-05, + "loss": 0.6882, + "step": 18305 + }, + { + "epoch": 1.240327935496985, + "grad_norm": 5.991425514221191, + "learning_rate": 7.596686973783284e-05, + "loss": 0.6645, + "step": 18306 + }, + { + "epoch": 1.240395690764957, + "grad_norm": 7.070034980773926, + "learning_rate": 7.596550071873502e-05, + "loss": 0.7398, + "step": 18307 + }, + { + "epoch": 1.2404634460329291, + "grad_norm": 4.521208763122559, + "learning_rate": 7.59641316996372e-05, + "loss": 0.498, + "step": 18308 + }, + { + "epoch": 1.240531201300901, + "grad_norm": 5.832149982452393, + "learning_rate": 7.59627626805394e-05, + "loss": 0.5592, + "step": 18309 + }, + { + "epoch": 1.2405989565688733, + "grad_norm": 6.671751976013184, + "learning_rate": 7.596139366144158e-05, + "loss": 0.6682, + "step": 18310 + }, + { + "epoch": 1.2406667118368453, + "grad_norm": 5.849062442779541, + "learning_rate": 7.596002464234376e-05, + "loss": 0.779, + "step": 18311 + }, + { + "epoch": 1.2407344671048175, + "grad_norm": 7.465084552764893, + "learning_rate": 7.595865562324594e-05, + "loss": 0.5536, + "step": 18312 + }, + { + "epoch": 1.2408022223727895, + "grad_norm": 5.540762901306152, + "learning_rate": 7.595728660414813e-05, + "loss": 0.7038, + "step": 18313 + }, + { + "epoch": 1.2408699776407617, + "grad_norm": 6.382152080535889, + "learning_rate": 7.595591758505031e-05, + "loss": 0.7971, + "step": 18314 + }, + { + "epoch": 1.2409377329087337, + "grad_norm": 4.793679237365723, + "learning_rate": 7.59545485659525e-05, + "loss": 0.7614, + "step": 18315 + }, + { + "epoch": 1.2410054881767056, + "grad_norm": 6.767176628112793, + "learning_rate": 7.595317954685467e-05, + "loss": 0.7421, + "step": 18316 + }, + { + "epoch": 1.2410732434446778, + "grad_norm": 6.34992790222168, + "learning_rate": 7.595181052775686e-05, + "loss": 0.7134, + "step": 18317 + }, + { + "epoch": 1.2411409987126498, + "grad_norm": 8.232148170471191, + "learning_rate": 7.595044150865905e-05, + "loss": 0.6995, + "step": 18318 + }, + { + "epoch": 1.241208753980622, + "grad_norm": 4.682238578796387, + "learning_rate": 7.594907248956123e-05, + "loss": 0.7234, + "step": 18319 + }, + { + "epoch": 1.241276509248594, + "grad_norm": 6.7042555809021, + "learning_rate": 7.594770347046341e-05, + "loss": 0.6465, + "step": 18320 + }, + { + "epoch": 1.2413442645165662, + "grad_norm": 4.933433532714844, + "learning_rate": 7.59463344513656e-05, + "loss": 0.5732, + "step": 18321 + }, + { + "epoch": 1.2414120197845382, + "grad_norm": 6.0281171798706055, + "learning_rate": 7.594496543226778e-05, + "loss": 0.8858, + "step": 18322 + }, + { + "epoch": 1.2414797750525104, + "grad_norm": 6.028160095214844, + "learning_rate": 7.594359641316996e-05, + "loss": 0.6607, + "step": 18323 + }, + { + "epoch": 1.2415475303204824, + "grad_norm": 5.355819225311279, + "learning_rate": 7.594222739407216e-05, + "loss": 0.5502, + "step": 18324 + }, + { + "epoch": 1.2416152855884546, + "grad_norm": 4.877525329589844, + "learning_rate": 7.594085837497434e-05, + "loss": 0.6762, + "step": 18325 + }, + { + "epoch": 1.2416830408564266, + "grad_norm": 7.059868335723877, + "learning_rate": 7.593948935587652e-05, + "loss": 0.7334, + "step": 18326 + }, + { + "epoch": 1.2417507961243988, + "grad_norm": 7.107666015625, + "learning_rate": 7.593812033677871e-05, + "loss": 0.7463, + "step": 18327 + }, + { + "epoch": 1.2418185513923707, + "grad_norm": 4.644011497497559, + "learning_rate": 7.593675131768089e-05, + "loss": 0.61, + "step": 18328 + }, + { + "epoch": 1.241886306660343, + "grad_norm": 4.502552509307861, + "learning_rate": 7.593538229858307e-05, + "loss": 0.527, + "step": 18329 + }, + { + "epoch": 1.241954061928315, + "grad_norm": 5.403695583343506, + "learning_rate": 7.593401327948525e-05, + "loss": 0.7903, + "step": 18330 + }, + { + "epoch": 1.242021817196287, + "grad_norm": 8.830596923828125, + "learning_rate": 7.593264426038743e-05, + "loss": 0.6086, + "step": 18331 + }, + { + "epoch": 1.242089572464259, + "grad_norm": 6.801039695739746, + "learning_rate": 7.593127524128963e-05, + "loss": 0.583, + "step": 18332 + }, + { + "epoch": 1.242157327732231, + "grad_norm": 6.03915548324585, + "learning_rate": 7.592990622219181e-05, + "loss": 0.8215, + "step": 18333 + }, + { + "epoch": 1.2422250830002033, + "grad_norm": 7.676496505737305, + "learning_rate": 7.592853720309399e-05, + "loss": 0.6051, + "step": 18334 + }, + { + "epoch": 1.2422928382681753, + "grad_norm": 5.050236701965332, + "learning_rate": 7.592716818399617e-05, + "loss": 0.5496, + "step": 18335 + }, + { + "epoch": 1.2423605935361475, + "grad_norm": 6.2748212814331055, + "learning_rate": 7.592579916489836e-05, + "loss": 0.9785, + "step": 18336 + }, + { + "epoch": 1.2424283488041195, + "grad_norm": 7.210531711578369, + "learning_rate": 7.592443014580054e-05, + "loss": 0.936, + "step": 18337 + }, + { + "epoch": 1.2424961040720917, + "grad_norm": 5.671183109283447, + "learning_rate": 7.592306112670272e-05, + "loss": 0.6438, + "step": 18338 + }, + { + "epoch": 1.2425638593400636, + "grad_norm": 5.4255690574646, + "learning_rate": 7.59216921076049e-05, + "loss": 0.614, + "step": 18339 + }, + { + "epoch": 1.2426316146080358, + "grad_norm": 9.136259078979492, + "learning_rate": 7.592032308850708e-05, + "loss": 0.863, + "step": 18340 + }, + { + "epoch": 1.2426993698760078, + "grad_norm": 9.439860343933105, + "learning_rate": 7.591895406940928e-05, + "loss": 0.5626, + "step": 18341 + }, + { + "epoch": 1.24276712514398, + "grad_norm": 5.82816743850708, + "learning_rate": 7.591758505031146e-05, + "loss": 0.8564, + "step": 18342 + }, + { + "epoch": 1.242834880411952, + "grad_norm": 8.167750358581543, + "learning_rate": 7.591621603121364e-05, + "loss": 0.5207, + "step": 18343 + }, + { + "epoch": 1.2429026356799242, + "grad_norm": 6.8161940574646, + "learning_rate": 7.591484701211582e-05, + "loss": 0.7033, + "step": 18344 + }, + { + "epoch": 1.2429703909478962, + "grad_norm": 6.037930965423584, + "learning_rate": 7.591347799301801e-05, + "loss": 0.9543, + "step": 18345 + }, + { + "epoch": 1.2430381462158682, + "grad_norm": 6.356756687164307, + "learning_rate": 7.59121089739202e-05, + "loss": 0.426, + "step": 18346 + }, + { + "epoch": 1.2431059014838404, + "grad_norm": 6.194242477416992, + "learning_rate": 7.591073995482237e-05, + "loss": 0.6365, + "step": 18347 + }, + { + "epoch": 1.2431736567518126, + "grad_norm": 6.263245105743408, + "learning_rate": 7.590937093572455e-05, + "loss": 0.7125, + "step": 18348 + }, + { + "epoch": 1.2432414120197846, + "grad_norm": 8.249999046325684, + "learning_rate": 7.590800191662673e-05, + "loss": 0.7486, + "step": 18349 + }, + { + "epoch": 1.2433091672877565, + "grad_norm": 6.465185165405273, + "learning_rate": 7.590663289752893e-05, + "loss": 0.588, + "step": 18350 + }, + { + "epoch": 1.2433769225557287, + "grad_norm": 6.615595817565918, + "learning_rate": 7.590526387843111e-05, + "loss": 0.8067, + "step": 18351 + }, + { + "epoch": 1.2434446778237007, + "grad_norm": 5.744843482971191, + "learning_rate": 7.590389485933329e-05, + "loss": 0.7306, + "step": 18352 + }, + { + "epoch": 1.243512433091673, + "grad_norm": 5.4917144775390625, + "learning_rate": 7.590252584023547e-05, + "loss": 0.6532, + "step": 18353 + }, + { + "epoch": 1.243580188359645, + "grad_norm": 4.359947204589844, + "learning_rate": 7.590115682113765e-05, + "loss": 0.7198, + "step": 18354 + }, + { + "epoch": 1.243647943627617, + "grad_norm": 5.287740707397461, + "learning_rate": 7.589978780203984e-05, + "loss": 0.6558, + "step": 18355 + }, + { + "epoch": 1.243715698895589, + "grad_norm": 5.208303928375244, + "learning_rate": 7.589841878294202e-05, + "loss": 0.7272, + "step": 18356 + }, + { + "epoch": 1.2437834541635613, + "grad_norm": 5.912323474884033, + "learning_rate": 7.58970497638442e-05, + "loss": 0.6226, + "step": 18357 + }, + { + "epoch": 1.2438512094315333, + "grad_norm": 4.994621753692627, + "learning_rate": 7.589568074474638e-05, + "loss": 0.6522, + "step": 18358 + }, + { + "epoch": 1.2439189646995055, + "grad_norm": 5.531447410583496, + "learning_rate": 7.589431172564858e-05, + "loss": 0.6913, + "step": 18359 + }, + { + "epoch": 1.2439867199674775, + "grad_norm": 6.614006996154785, + "learning_rate": 7.589294270655076e-05, + "loss": 0.5982, + "step": 18360 + }, + { + "epoch": 1.2440544752354497, + "grad_norm": 6.066467761993408, + "learning_rate": 7.589157368745294e-05, + "loss": 0.7662, + "step": 18361 + }, + { + "epoch": 1.2441222305034216, + "grad_norm": 4.1904826164245605, + "learning_rate": 7.589020466835512e-05, + "loss": 0.3991, + "step": 18362 + }, + { + "epoch": 1.2441899857713938, + "grad_norm": 6.30812931060791, + "learning_rate": 7.58888356492573e-05, + "loss": 0.6071, + "step": 18363 + }, + { + "epoch": 1.2442577410393658, + "grad_norm": 4.418458461761475, + "learning_rate": 7.58874666301595e-05, + "loss": 0.661, + "step": 18364 + }, + { + "epoch": 1.2443254963073378, + "grad_norm": 6.085812568664551, + "learning_rate": 7.588609761106167e-05, + "loss": 0.5891, + "step": 18365 + }, + { + "epoch": 1.24439325157531, + "grad_norm": 5.265291690826416, + "learning_rate": 7.588472859196385e-05, + "loss": 0.6936, + "step": 18366 + }, + { + "epoch": 1.244461006843282, + "grad_norm": 5.662139892578125, + "learning_rate": 7.588335957286605e-05, + "loss": 0.6085, + "step": 18367 + }, + { + "epoch": 1.2445287621112542, + "grad_norm": 4.7674241065979, + "learning_rate": 7.588199055376823e-05, + "loss": 0.5873, + "step": 18368 + }, + { + "epoch": 1.2445965173792262, + "grad_norm": 5.590237617492676, + "learning_rate": 7.588062153467041e-05, + "loss": 0.603, + "step": 18369 + }, + { + "epoch": 1.2446642726471984, + "grad_norm": 5.36829137802124, + "learning_rate": 7.58792525155726e-05, + "loss": 0.5697, + "step": 18370 + }, + { + "epoch": 1.2447320279151703, + "grad_norm": 4.962731838226318, + "learning_rate": 7.587788349647478e-05, + "loss": 0.5829, + "step": 18371 + }, + { + "epoch": 1.2447997831831425, + "grad_norm": 5.7452802658081055, + "learning_rate": 7.587651447737696e-05, + "loss": 0.6695, + "step": 18372 + }, + { + "epoch": 1.2448675384511145, + "grad_norm": 7.008640766143799, + "learning_rate": 7.587514545827916e-05, + "loss": 0.834, + "step": 18373 + }, + { + "epoch": 1.2449352937190867, + "grad_norm": 9.434866905212402, + "learning_rate": 7.587377643918134e-05, + "loss": 0.618, + "step": 18374 + }, + { + "epoch": 1.2450030489870587, + "grad_norm": 5.060585975646973, + "learning_rate": 7.587240742008352e-05, + "loss": 0.618, + "step": 18375 + }, + { + "epoch": 1.245070804255031, + "grad_norm": 5.655799865722656, + "learning_rate": 7.58710384009857e-05, + "loss": 0.8284, + "step": 18376 + }, + { + "epoch": 1.245138559523003, + "grad_norm": 7.925520420074463, + "learning_rate": 7.586966938188788e-05, + "loss": 0.7431, + "step": 18377 + }, + { + "epoch": 1.245206314790975, + "grad_norm": 5.230562686920166, + "learning_rate": 7.586830036279007e-05, + "loss": 0.761, + "step": 18378 + }, + { + "epoch": 1.245274070058947, + "grad_norm": 5.60720157623291, + "learning_rate": 7.586693134369225e-05, + "loss": 0.5065, + "step": 18379 + }, + { + "epoch": 1.245341825326919, + "grad_norm": 6.819098472595215, + "learning_rate": 7.586556232459443e-05, + "loss": 0.6631, + "step": 18380 + }, + { + "epoch": 1.2454095805948913, + "grad_norm": 7.610952377319336, + "learning_rate": 7.586419330549661e-05, + "loss": 0.8025, + "step": 18381 + }, + { + "epoch": 1.2454773358628632, + "grad_norm": 5.915439605712891, + "learning_rate": 7.586282428639881e-05, + "loss": 0.781, + "step": 18382 + }, + { + "epoch": 1.2455450911308354, + "grad_norm": 4.912126541137695, + "learning_rate": 7.586145526730099e-05, + "loss": 0.5188, + "step": 18383 + }, + { + "epoch": 1.2456128463988074, + "grad_norm": 5.676177024841309, + "learning_rate": 7.586008624820317e-05, + "loss": 0.5475, + "step": 18384 + }, + { + "epoch": 1.2456806016667796, + "grad_norm": 4.516726493835449, + "learning_rate": 7.585871722910535e-05, + "loss": 0.6143, + "step": 18385 + }, + { + "epoch": 1.2457483569347516, + "grad_norm": 8.950729370117188, + "learning_rate": 7.585734821000753e-05, + "loss": 0.5928, + "step": 18386 + }, + { + "epoch": 1.2458161122027238, + "grad_norm": 7.224480628967285, + "learning_rate": 7.585597919090972e-05, + "loss": 0.7095, + "step": 18387 + }, + { + "epoch": 1.2458838674706958, + "grad_norm": 6.877074241638184, + "learning_rate": 7.58546101718119e-05, + "loss": 0.7253, + "step": 18388 + }, + { + "epoch": 1.245951622738668, + "grad_norm": 7.589900970458984, + "learning_rate": 7.585324115271408e-05, + "loss": 0.8208, + "step": 18389 + }, + { + "epoch": 1.24601937800664, + "grad_norm": 6.306529998779297, + "learning_rate": 7.585187213361626e-05, + "loss": 0.5556, + "step": 18390 + }, + { + "epoch": 1.2460871332746122, + "grad_norm": 5.165809631347656, + "learning_rate": 7.585050311451846e-05, + "loss": 0.8348, + "step": 18391 + }, + { + "epoch": 1.2461548885425842, + "grad_norm": 6.17510986328125, + "learning_rate": 7.584913409542064e-05, + "loss": 0.7496, + "step": 18392 + }, + { + "epoch": 1.2462226438105564, + "grad_norm": 6.940537929534912, + "learning_rate": 7.584776507632282e-05, + "loss": 0.9307, + "step": 18393 + }, + { + "epoch": 1.2462903990785283, + "grad_norm": 5.258039951324463, + "learning_rate": 7.5846396057225e-05, + "loss": 0.5341, + "step": 18394 + }, + { + "epoch": 1.2463581543465003, + "grad_norm": 5.3323774337768555, + "learning_rate": 7.584502703812718e-05, + "loss": 0.5748, + "step": 18395 + }, + { + "epoch": 1.2464259096144725, + "grad_norm": 6.254161834716797, + "learning_rate": 7.584365801902937e-05, + "loss": 0.7838, + "step": 18396 + }, + { + "epoch": 1.2464936648824447, + "grad_norm": 6.0442214012146, + "learning_rate": 7.584228899993155e-05, + "loss": 0.7778, + "step": 18397 + }, + { + "epoch": 1.2465614201504167, + "grad_norm": 6.557648658752441, + "learning_rate": 7.584091998083373e-05, + "loss": 0.7761, + "step": 18398 + }, + { + "epoch": 1.2466291754183887, + "grad_norm": 6.024246692657471, + "learning_rate": 7.583955096173591e-05, + "loss": 0.8143, + "step": 18399 + }, + { + "epoch": 1.246696930686361, + "grad_norm": 6.548332691192627, + "learning_rate": 7.58381819426381e-05, + "loss": 0.7228, + "step": 18400 + }, + { + "epoch": 1.2467646859543329, + "grad_norm": 8.239317893981934, + "learning_rate": 7.583681292354029e-05, + "loss": 0.681, + "step": 18401 + }, + { + "epoch": 1.246832441222305, + "grad_norm": 6.653371810913086, + "learning_rate": 7.583544390444247e-05, + "loss": 0.5783, + "step": 18402 + }, + { + "epoch": 1.246900196490277, + "grad_norm": 5.489253044128418, + "learning_rate": 7.583407488534465e-05, + "loss": 0.7057, + "step": 18403 + }, + { + "epoch": 1.2469679517582493, + "grad_norm": 10.217595100402832, + "learning_rate": 7.583270586624683e-05, + "loss": 0.821, + "step": 18404 + }, + { + "epoch": 1.2470357070262212, + "grad_norm": 5.781734466552734, + "learning_rate": 7.583133684714902e-05, + "loss": 0.8124, + "step": 18405 + }, + { + "epoch": 1.2471034622941934, + "grad_norm": 5.485482692718506, + "learning_rate": 7.58299678280512e-05, + "loss": 0.3988, + "step": 18406 + }, + { + "epoch": 1.2471712175621654, + "grad_norm": 5.147676944732666, + "learning_rate": 7.582859880895338e-05, + "loss": 0.6227, + "step": 18407 + }, + { + "epoch": 1.2472389728301376, + "grad_norm": 5.972175121307373, + "learning_rate": 7.582722978985556e-05, + "loss": 0.7966, + "step": 18408 + }, + { + "epoch": 1.2473067280981096, + "grad_norm": 4.4100728034973145, + "learning_rate": 7.582586077075774e-05, + "loss": 0.5352, + "step": 18409 + }, + { + "epoch": 1.2473744833660818, + "grad_norm": 8.67353343963623, + "learning_rate": 7.582449175165994e-05, + "loss": 0.7738, + "step": 18410 + }, + { + "epoch": 1.2474422386340538, + "grad_norm": 7.108034133911133, + "learning_rate": 7.582312273256212e-05, + "loss": 0.5752, + "step": 18411 + }, + { + "epoch": 1.247509993902026, + "grad_norm": 5.655974388122559, + "learning_rate": 7.58217537134643e-05, + "loss": 0.7622, + "step": 18412 + }, + { + "epoch": 1.247577749169998, + "grad_norm": 6.0239996910095215, + "learning_rate": 7.582038469436649e-05, + "loss": 0.4806, + "step": 18413 + }, + { + "epoch": 1.24764550443797, + "grad_norm": 7.3847527503967285, + "learning_rate": 7.581901567526867e-05, + "loss": 0.8353, + "step": 18414 + }, + { + "epoch": 1.2477132597059422, + "grad_norm": 5.64340353012085, + "learning_rate": 7.581764665617085e-05, + "loss": 0.6279, + "step": 18415 + }, + { + "epoch": 1.2477810149739141, + "grad_norm": 6.056209564208984, + "learning_rate": 7.581627763707305e-05, + "loss": 0.7538, + "step": 18416 + }, + { + "epoch": 1.2478487702418863, + "grad_norm": 5.869894981384277, + "learning_rate": 7.581490861797523e-05, + "loss": 0.6976, + "step": 18417 + }, + { + "epoch": 1.2479165255098583, + "grad_norm": 5.403618335723877, + "learning_rate": 7.581353959887741e-05, + "loss": 0.618, + "step": 18418 + }, + { + "epoch": 1.2479842807778305, + "grad_norm": 4.804281234741211, + "learning_rate": 7.58121705797796e-05, + "loss": 0.4805, + "step": 18419 + }, + { + "epoch": 1.2480520360458025, + "grad_norm": 5.022444725036621, + "learning_rate": 7.581080156068178e-05, + "loss": 0.9231, + "step": 18420 + }, + { + "epoch": 1.2481197913137747, + "grad_norm": 5.900055885314941, + "learning_rate": 7.580943254158396e-05, + "loss": 0.6079, + "step": 18421 + }, + { + "epoch": 1.2481875465817467, + "grad_norm": 5.404339790344238, + "learning_rate": 7.580806352248614e-05, + "loss": 0.6443, + "step": 18422 + }, + { + "epoch": 1.248255301849719, + "grad_norm": 4.7197184562683105, + "learning_rate": 7.580669450338834e-05, + "loss": 0.6525, + "step": 18423 + }, + { + "epoch": 1.2483230571176909, + "grad_norm": 6.294169902801514, + "learning_rate": 7.580532548429052e-05, + "loss": 0.6796, + "step": 18424 + }, + { + "epoch": 1.248390812385663, + "grad_norm": 5.794606685638428, + "learning_rate": 7.58039564651927e-05, + "loss": 0.5467, + "step": 18425 + }, + { + "epoch": 1.248458567653635, + "grad_norm": 4.591580867767334, + "learning_rate": 7.580258744609488e-05, + "loss": 0.7601, + "step": 18426 + }, + { + "epoch": 1.2485263229216073, + "grad_norm": 4.168753147125244, + "learning_rate": 7.580121842699706e-05, + "loss": 0.5565, + "step": 18427 + }, + { + "epoch": 1.2485940781895792, + "grad_norm": 7.54253625869751, + "learning_rate": 7.579984940789925e-05, + "loss": 0.6358, + "step": 18428 + }, + { + "epoch": 1.2486618334575512, + "grad_norm": 4.858458518981934, + "learning_rate": 7.579848038880143e-05, + "loss": 0.5466, + "step": 18429 + }, + { + "epoch": 1.2487295887255234, + "grad_norm": 7.536377429962158, + "learning_rate": 7.579711136970361e-05, + "loss": 0.6683, + "step": 18430 + }, + { + "epoch": 1.2487973439934954, + "grad_norm": 6.619974613189697, + "learning_rate": 7.57957423506058e-05, + "loss": 0.5221, + "step": 18431 + }, + { + "epoch": 1.2488650992614676, + "grad_norm": 9.248383522033691, + "learning_rate": 7.579437333150797e-05, + "loss": 0.5517, + "step": 18432 + }, + { + "epoch": 1.2489328545294396, + "grad_norm": 7.319504261016846, + "learning_rate": 7.579300431241017e-05, + "loss": 0.7803, + "step": 18433 + }, + { + "epoch": 1.2490006097974118, + "grad_norm": 6.124282360076904, + "learning_rate": 7.579163529331235e-05, + "loss": 0.6649, + "step": 18434 + }, + { + "epoch": 1.2490683650653838, + "grad_norm": 4.615629196166992, + "learning_rate": 7.579026627421453e-05, + "loss": 0.6239, + "step": 18435 + }, + { + "epoch": 1.249136120333356, + "grad_norm": 7.09864616394043, + "learning_rate": 7.578889725511671e-05, + "loss": 0.7135, + "step": 18436 + }, + { + "epoch": 1.249203875601328, + "grad_norm": 5.639509677886963, + "learning_rate": 7.57875282360189e-05, + "loss": 0.8308, + "step": 18437 + }, + { + "epoch": 1.2492716308693002, + "grad_norm": 5.514218807220459, + "learning_rate": 7.578615921692108e-05, + "loss": 0.51, + "step": 18438 + }, + { + "epoch": 1.2493393861372721, + "grad_norm": 5.990070343017578, + "learning_rate": 7.578479019782326e-05, + "loss": 0.5471, + "step": 18439 + }, + { + "epoch": 1.2494071414052443, + "grad_norm": 5.667049407958984, + "learning_rate": 7.578342117872544e-05, + "loss": 0.5892, + "step": 18440 + }, + { + "epoch": 1.2494748966732163, + "grad_norm": 7.905735015869141, + "learning_rate": 7.578205215962762e-05, + "loss": 0.8953, + "step": 18441 + }, + { + "epoch": 1.2495426519411885, + "grad_norm": 6.189059734344482, + "learning_rate": 7.578068314052982e-05, + "loss": 1.0339, + "step": 18442 + }, + { + "epoch": 1.2496104072091605, + "grad_norm": 5.740088939666748, + "learning_rate": 7.5779314121432e-05, + "loss": 0.7114, + "step": 18443 + }, + { + "epoch": 1.2496781624771325, + "grad_norm": 6.6557698249816895, + "learning_rate": 7.577794510233418e-05, + "loss": 0.9342, + "step": 18444 + }, + { + "epoch": 1.2497459177451047, + "grad_norm": 4.974968910217285, + "learning_rate": 7.577657608323636e-05, + "loss": 0.736, + "step": 18445 + }, + { + "epoch": 1.2497459177451047, + "eval_loss": 0.7137033939361572, + "eval_noise_accuracy": 0.0, + "eval_runtime": 1472.2965, + "eval_samples_per_second": 3.49, + "eval_steps_per_second": 0.219, + "eval_wer": 67.47337717729287, + "step": 18445 + } + ], + "logging_steps": 1, + "max_steps": 73795, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 3689, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.26547613884416e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}