diff --git "a/checkpoint-9000/trainer_state.json" "b/checkpoint-9000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-9000/trainer_state.json" @@ -0,0 +1,63033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5016442784683128, + "eval_steps": 500, + "global_step": 9000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 5.573825316314587e-05, + "grad_norm": 0.5498427152633667, + "learning_rate": 3.3333333333333335e-07, + "loss": 1.7989, + "step": 1 + }, + { + "epoch": 0.00011147650632629174, + "grad_norm": 0.6303576827049255, + "learning_rate": 6.666666666666667e-07, + "loss": 1.996, + "step": 2 + }, + { + "epoch": 0.0001672147594894376, + "grad_norm": 0.5333236455917358, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.8613, + "step": 3 + }, + { + "epoch": 0.00022295301265258348, + "grad_norm": 0.5659189224243164, + "learning_rate": 1.3333333333333334e-06, + "loss": 1.8904, + "step": 4 + }, + { + "epoch": 0.0002786912658157293, + "grad_norm": 0.6221416592597961, + "learning_rate": 1.6666666666666667e-06, + "loss": 2.0151, + "step": 5 + }, + { + "epoch": 0.0003344295189788752, + "grad_norm": 0.6198977828025818, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.9774, + "step": 6 + }, + { + "epoch": 0.0003901677721420211, + "grad_norm": 0.6328762173652649, + "learning_rate": 2.3333333333333336e-06, + "loss": 1.8994, + "step": 7 + }, + { + "epoch": 0.00044590602530516696, + "grad_norm": 0.6075513362884521, + "learning_rate": 2.666666666666667e-06, + "loss": 1.894, + "step": 8 + }, + { + "epoch": 0.0005016442784683128, + "grad_norm": 0.6397244930267334, + "learning_rate": 3e-06, + "loss": 2.0865, + "step": 9 + }, + { + "epoch": 0.0005573825316314586, + "grad_norm": 0.6115519404411316, + "learning_rate": 3.3333333333333333e-06, + "loss": 1.9688, + "step": 10 + }, + { + "epoch": 0.0006131207847946045, + "grad_norm": 0.546791672706604, + "learning_rate": 3.666666666666667e-06, + "loss": 1.8239, + "step": 11 + }, + { + "epoch": 0.0006688590379577504, + "grad_norm": 0.690762996673584, + "learning_rate": 4.000000000000001e-06, + "loss": 2.0367, + "step": 12 + }, + { + "epoch": 0.0007245972911208963, + "grad_norm": 0.7190566062927246, + "learning_rate": 4.333333333333334e-06, + "loss": 1.9817, + "step": 13 + }, + { + "epoch": 0.0007803355442840422, + "grad_norm": 0.6093202233314514, + "learning_rate": 4.666666666666667e-06, + "loss": 2.01, + "step": 14 + }, + { + "epoch": 0.000836073797447188, + "grad_norm": 0.5230669975280762, + "learning_rate": 5e-06, + "loss": 1.8419, + "step": 15 + }, + { + "epoch": 0.0008918120506103339, + "grad_norm": 0.5391668677330017, + "learning_rate": 5.333333333333334e-06, + "loss": 1.8663, + "step": 16 + }, + { + "epoch": 0.0009475503037734797, + "grad_norm": 0.6359019875526428, + "learning_rate": 5.666666666666667e-06, + "loss": 2.2089, + "step": 17 + }, + { + "epoch": 0.0010032885569366257, + "grad_norm": 0.61967533826828, + "learning_rate": 6e-06, + "loss": 2.0842, + "step": 18 + }, + { + "epoch": 0.0010590268100997716, + "grad_norm": 0.491642028093338, + "learning_rate": 6.333333333333334e-06, + "loss": 1.755, + "step": 19 + }, + { + "epoch": 0.0011147650632629172, + "grad_norm": 0.7064740657806396, + "learning_rate": 6.666666666666667e-06, + "loss": 2.2494, + "step": 20 + }, + { + "epoch": 0.0011705033164260631, + "grad_norm": 0.5671775937080383, + "learning_rate": 7.000000000000001e-06, + "loss": 2.0236, + "step": 21 + }, + { + "epoch": 0.001226241569589209, + "grad_norm": 0.5698847770690918, + "learning_rate": 7.333333333333334e-06, + "loss": 1.8295, + "step": 22 + }, + { + "epoch": 0.001281979822752355, + "grad_norm": 0.5910470485687256, + "learning_rate": 7.666666666666667e-06, + "loss": 2.1311, + "step": 23 + }, + { + "epoch": 0.0013377180759155008, + "grad_norm": 0.567130446434021, + "learning_rate": 8.000000000000001e-06, + "loss": 1.888, + "step": 24 + }, + { + "epoch": 0.0013934563290786467, + "grad_norm": 0.5540428757667542, + "learning_rate": 8.333333333333334e-06, + "loss": 1.6625, + "step": 25 + }, + { + "epoch": 0.0014491945822417925, + "grad_norm": 0.5729663372039795, + "learning_rate": 8.666666666666668e-06, + "loss": 2.0062, + "step": 26 + }, + { + "epoch": 0.0015049328354049384, + "grad_norm": 0.5232088565826416, + "learning_rate": 9e-06, + "loss": 1.7991, + "step": 27 + }, + { + "epoch": 0.0015606710885680843, + "grad_norm": 0.5638092160224915, + "learning_rate": 9.333333333333334e-06, + "loss": 2.0728, + "step": 28 + }, + { + "epoch": 0.0016164093417312302, + "grad_norm": 0.5504807829856873, + "learning_rate": 9.666666666666667e-06, + "loss": 1.808, + "step": 29 + }, + { + "epoch": 0.001672147594894376, + "grad_norm": 0.5935587882995605, + "learning_rate": 1e-05, + "loss": 1.9738, + "step": 30 + }, + { + "epoch": 0.001727885848057522, + "grad_norm": 0.6431534886360168, + "learning_rate": 1.0333333333333333e-05, + "loss": 2.0967, + "step": 31 + }, + { + "epoch": 0.0017836241012206678, + "grad_norm": 0.5587693452835083, + "learning_rate": 1.0666666666666667e-05, + "loss": 1.6821, + "step": 32 + }, + { + "epoch": 0.0018393623543838135, + "grad_norm": 0.5473759174346924, + "learning_rate": 1.1000000000000001e-05, + "loss": 1.8442, + "step": 33 + }, + { + "epoch": 0.0018951006075469594, + "grad_norm": 0.6185194849967957, + "learning_rate": 1.1333333333333334e-05, + "loss": 2.0705, + "step": 34 + }, + { + "epoch": 0.0019508388607101053, + "grad_norm": 0.5253747701644897, + "learning_rate": 1.1666666666666668e-05, + "loss": 1.7944, + "step": 35 + }, + { + "epoch": 0.0020065771138732514, + "grad_norm": 0.5742389559745789, + "learning_rate": 1.2e-05, + "loss": 2.0, + "step": 36 + }, + { + "epoch": 0.0020623153670363973, + "grad_norm": 0.6290589570999146, + "learning_rate": 1.2333333333333334e-05, + "loss": 2.1365, + "step": 37 + }, + { + "epoch": 0.002118053620199543, + "grad_norm": 0.5194576382637024, + "learning_rate": 1.2666666666666668e-05, + "loss": 1.8569, + "step": 38 + }, + { + "epoch": 0.0021737918733626886, + "grad_norm": 0.5665763020515442, + "learning_rate": 1.3000000000000001e-05, + "loss": 1.9313, + "step": 39 + }, + { + "epoch": 0.0022295301265258345, + "grad_norm": 0.5268619060516357, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.8843, + "step": 40 + }, + { + "epoch": 0.0022852683796889804, + "grad_norm": 0.7840973734855652, + "learning_rate": 1.3666666666666666e-05, + "loss": 1.929, + "step": 41 + }, + { + "epoch": 0.0023410066328521262, + "grad_norm": 0.5785960555076599, + "learning_rate": 1.4000000000000001e-05, + "loss": 2.0276, + "step": 42 + }, + { + "epoch": 0.002396744886015272, + "grad_norm": 0.5202842354774475, + "learning_rate": 1.4333333333333334e-05, + "loss": 1.949, + "step": 43 + }, + { + "epoch": 0.002452483139178418, + "grad_norm": 0.72431480884552, + "learning_rate": 1.4666666666666668e-05, + "loss": 2.2978, + "step": 44 + }, + { + "epoch": 0.002508221392341564, + "grad_norm": 0.5558940768241882, + "learning_rate": 1.5e-05, + "loss": 1.9125, + "step": 45 + }, + { + "epoch": 0.00256395964550471, + "grad_norm": 0.5687503814697266, + "learning_rate": 1.5333333333333334e-05, + "loss": 1.8533, + "step": 46 + }, + { + "epoch": 0.0026196978986678557, + "grad_norm": 0.5703473091125488, + "learning_rate": 1.5666666666666667e-05, + "loss": 1.9015, + "step": 47 + }, + { + "epoch": 0.0026754361518310016, + "grad_norm": 0.5496488809585571, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.682, + "step": 48 + }, + { + "epoch": 0.0027311744049941474, + "grad_norm": 0.6371431946754456, + "learning_rate": 1.6333333333333335e-05, + "loss": 2.0425, + "step": 49 + }, + { + "epoch": 0.0027869126581572933, + "grad_norm": 0.6071433424949646, + "learning_rate": 1.6666666666666667e-05, + "loss": 1.8745, + "step": 50 + }, + { + "epoch": 0.002842650911320439, + "grad_norm": 0.5981681942939758, + "learning_rate": 1.7000000000000003e-05, + "loss": 1.8872, + "step": 51 + }, + { + "epoch": 0.002898389164483585, + "grad_norm": 0.6591808795928955, + "learning_rate": 1.7333333333333336e-05, + "loss": 2.0187, + "step": 52 + }, + { + "epoch": 0.002954127417646731, + "grad_norm": 0.6213610172271729, + "learning_rate": 1.7666666666666668e-05, + "loss": 2.0231, + "step": 53 + }, + { + "epoch": 0.003009865670809877, + "grad_norm": 0.6377214789390564, + "learning_rate": 1.8e-05, + "loss": 1.8641, + "step": 54 + }, + { + "epoch": 0.0030656039239730227, + "grad_norm": 0.675821840763092, + "learning_rate": 1.8333333333333333e-05, + "loss": 2.215, + "step": 55 + }, + { + "epoch": 0.0031213421771361686, + "grad_norm": 0.5989570021629333, + "learning_rate": 1.866666666666667e-05, + "loss": 1.9232, + "step": 56 + }, + { + "epoch": 0.0031770804302993145, + "grad_norm": 0.6279881596565247, + "learning_rate": 1.9e-05, + "loss": 1.8452, + "step": 57 + }, + { + "epoch": 0.0032328186834624604, + "grad_norm": 0.5670164227485657, + "learning_rate": 1.9333333333333333e-05, + "loss": 1.5623, + "step": 58 + }, + { + "epoch": 0.0032885569366256063, + "grad_norm": 0.5822334289550781, + "learning_rate": 1.9666666666666666e-05, + "loss": 1.7901, + "step": 59 + }, + { + "epoch": 0.003344295189788752, + "grad_norm": 0.6322411298751831, + "learning_rate": 2e-05, + "loss": 1.8802, + "step": 60 + }, + { + "epoch": 0.003400033442951898, + "grad_norm": 0.6066840291023254, + "learning_rate": 2.0333333333333334e-05, + "loss": 1.8334, + "step": 61 + }, + { + "epoch": 0.003455771696115044, + "grad_norm": 0.6801030039787292, + "learning_rate": 2.0666666666666666e-05, + "loss": 2.1029, + "step": 62 + }, + { + "epoch": 0.00351150994927819, + "grad_norm": 0.6445280909538269, + "learning_rate": 2.1e-05, + "loss": 2.0333, + "step": 63 + }, + { + "epoch": 0.0035672482024413357, + "grad_norm": 0.6259938478469849, + "learning_rate": 2.1333333333333335e-05, + "loss": 1.6012, + "step": 64 + }, + { + "epoch": 0.003622986455604481, + "grad_norm": 0.6786999702453613, + "learning_rate": 2.1666666666666667e-05, + "loss": 2.0818, + "step": 65 + }, + { + "epoch": 0.003678724708767627, + "grad_norm": 0.6728941202163696, + "learning_rate": 2.2000000000000003e-05, + "loss": 1.9022, + "step": 66 + }, + { + "epoch": 0.003734462961930773, + "grad_norm": 0.6992253661155701, + "learning_rate": 2.2333333333333335e-05, + "loss": 1.7435, + "step": 67 + }, + { + "epoch": 0.003790201215093919, + "grad_norm": 0.6083998084068298, + "learning_rate": 2.2666666666666668e-05, + "loss": 1.7816, + "step": 68 + }, + { + "epoch": 0.0038459394682570647, + "grad_norm": 0.6070435643196106, + "learning_rate": 2.3000000000000003e-05, + "loss": 1.672, + "step": 69 + }, + { + "epoch": 0.0039016777214202106, + "grad_norm": 0.6032823920249939, + "learning_rate": 2.3333333333333336e-05, + "loss": 1.771, + "step": 70 + }, + { + "epoch": 0.0039574159745833564, + "grad_norm": 0.689372181892395, + "learning_rate": 2.3666666666666668e-05, + "loss": 1.9594, + "step": 71 + }, + { + "epoch": 0.004013154227746503, + "grad_norm": 0.6333785653114319, + "learning_rate": 2.4e-05, + "loss": 1.8492, + "step": 72 + }, + { + "epoch": 0.004068892480909648, + "grad_norm": 0.638140857219696, + "learning_rate": 2.4333333333333336e-05, + "loss": 1.798, + "step": 73 + }, + { + "epoch": 0.0041246307340727945, + "grad_norm": 0.6000136137008667, + "learning_rate": 2.466666666666667e-05, + "loss": 1.6625, + "step": 74 + }, + { + "epoch": 0.00418036898723594, + "grad_norm": 0.7654765248298645, + "learning_rate": 2.5e-05, + "loss": 2.1015, + "step": 75 + }, + { + "epoch": 0.004236107240399086, + "grad_norm": 0.6845409870147705, + "learning_rate": 2.5333333333333337e-05, + "loss": 1.9176, + "step": 76 + }, + { + "epoch": 0.004291845493562232, + "grad_norm": 0.6557128429412842, + "learning_rate": 2.5666666666666666e-05, + "loss": 1.8244, + "step": 77 + }, + { + "epoch": 0.004347583746725377, + "grad_norm": 0.6574406027793884, + "learning_rate": 2.6000000000000002e-05, + "loss": 1.851, + "step": 78 + }, + { + "epoch": 0.0044033219998885235, + "grad_norm": 0.6624826192855835, + "learning_rate": 2.633333333333333e-05, + "loss": 1.8332, + "step": 79 + }, + { + "epoch": 0.004459060253051669, + "grad_norm": 0.7041051983833313, + "learning_rate": 2.6666666666666667e-05, + "loss": 1.8357, + "step": 80 + }, + { + "epoch": 0.004514798506214815, + "grad_norm": 0.6737162470817566, + "learning_rate": 2.7000000000000002e-05, + "loss": 1.8162, + "step": 81 + }, + { + "epoch": 0.004570536759377961, + "grad_norm": 0.6803858280181885, + "learning_rate": 2.733333333333333e-05, + "loss": 1.9187, + "step": 82 + }, + { + "epoch": 0.004626275012541107, + "grad_norm": 0.6441910862922668, + "learning_rate": 2.7666666666666667e-05, + "loss": 1.9235, + "step": 83 + }, + { + "epoch": 0.0046820132657042525, + "grad_norm": 0.6409979462623596, + "learning_rate": 2.8000000000000003e-05, + "loss": 1.9148, + "step": 84 + }, + { + "epoch": 0.004737751518867399, + "grad_norm": 0.722623348236084, + "learning_rate": 2.8333333333333335e-05, + "loss": 1.9738, + "step": 85 + }, + { + "epoch": 0.004793489772030544, + "grad_norm": 0.6637834310531616, + "learning_rate": 2.8666666666666668e-05, + "loss": 1.6872, + "step": 86 + }, + { + "epoch": 0.004849228025193691, + "grad_norm": 0.7143079042434692, + "learning_rate": 2.9e-05, + "loss": 1.9944, + "step": 87 + }, + { + "epoch": 0.004904966278356836, + "grad_norm": 0.7566176652908325, + "learning_rate": 2.9333333333333336e-05, + "loss": 1.7542, + "step": 88 + }, + { + "epoch": 0.004960704531519982, + "grad_norm": 0.6472474932670593, + "learning_rate": 2.9666666666666672e-05, + "loss": 1.9534, + "step": 89 + }, + { + "epoch": 0.005016442784683128, + "grad_norm": 0.6678224205970764, + "learning_rate": 3e-05, + "loss": 1.7684, + "step": 90 + }, + { + "epoch": 0.005072181037846274, + "grad_norm": 0.6665822267532349, + "learning_rate": 3.0333333333333337e-05, + "loss": 1.9028, + "step": 91 + }, + { + "epoch": 0.00512791929100942, + "grad_norm": 0.7907567620277405, + "learning_rate": 3.066666666666667e-05, + "loss": 1.8876, + "step": 92 + }, + { + "epoch": 0.005183657544172566, + "grad_norm": 0.6738147735595703, + "learning_rate": 3.1e-05, + "loss": 1.7623, + "step": 93 + }, + { + "epoch": 0.005239395797335711, + "grad_norm": 0.6898536086082458, + "learning_rate": 3.1333333333333334e-05, + "loss": 1.7103, + "step": 94 + }, + { + "epoch": 0.005295134050498858, + "grad_norm": 0.6961106061935425, + "learning_rate": 3.1666666666666666e-05, + "loss": 1.537, + "step": 95 + }, + { + "epoch": 0.005350872303662003, + "grad_norm": 0.6331319808959961, + "learning_rate": 3.2000000000000005e-05, + "loss": 1.6681, + "step": 96 + }, + { + "epoch": 0.005406610556825149, + "grad_norm": 0.7678634524345398, + "learning_rate": 3.233333333333333e-05, + "loss": 2.1339, + "step": 97 + }, + { + "epoch": 0.005462348809988295, + "grad_norm": 0.7012338638305664, + "learning_rate": 3.266666666666667e-05, + "loss": 1.7591, + "step": 98 + }, + { + "epoch": 0.005518087063151441, + "grad_norm": 0.7289243340492249, + "learning_rate": 3.3e-05, + "loss": 1.901, + "step": 99 + }, + { + "epoch": 0.005573825316314587, + "grad_norm": 0.6416298747062683, + "learning_rate": 3.3333333333333335e-05, + "loss": 1.5989, + "step": 100 + }, + { + "epoch": 0.005629563569477733, + "grad_norm": 0.6193853616714478, + "learning_rate": 3.366666666666667e-05, + "loss": 1.7429, + "step": 101 + }, + { + "epoch": 0.005685301822640878, + "grad_norm": 0.7283613681793213, + "learning_rate": 3.4000000000000007e-05, + "loss": 1.9885, + "step": 102 + }, + { + "epoch": 0.005741040075804025, + "grad_norm": 0.6713369488716125, + "learning_rate": 3.433333333333333e-05, + "loss": 1.8521, + "step": 103 + }, + { + "epoch": 0.00579677832896717, + "grad_norm": 0.6700227856636047, + "learning_rate": 3.466666666666667e-05, + "loss": 1.8404, + "step": 104 + }, + { + "epoch": 0.005852516582130316, + "grad_norm": 0.6885061860084534, + "learning_rate": 3.5e-05, + "loss": 1.8081, + "step": 105 + }, + { + "epoch": 0.005908254835293462, + "grad_norm": 0.6814194917678833, + "learning_rate": 3.5333333333333336e-05, + "loss": 1.8672, + "step": 106 + }, + { + "epoch": 0.005963993088456607, + "grad_norm": 0.6492342948913574, + "learning_rate": 3.566666666666667e-05, + "loss": 1.7029, + "step": 107 + }, + { + "epoch": 0.006019731341619754, + "grad_norm": 0.5920109748840332, + "learning_rate": 3.6e-05, + "loss": 1.5455, + "step": 108 + }, + { + "epoch": 0.006075469594782899, + "grad_norm": 0.6685107946395874, + "learning_rate": 3.633333333333333e-05, + "loss": 1.9576, + "step": 109 + }, + { + "epoch": 0.0061312078479460455, + "grad_norm": 0.6917557716369629, + "learning_rate": 3.6666666666666666e-05, + "loss": 1.9341, + "step": 110 + }, + { + "epoch": 0.006186946101109191, + "grad_norm": 0.730872631072998, + "learning_rate": 3.7e-05, + "loss": 1.9828, + "step": 111 + }, + { + "epoch": 0.006242684354272337, + "grad_norm": 0.7139527797698975, + "learning_rate": 3.733333333333334e-05, + "loss": 2.0277, + "step": 112 + }, + { + "epoch": 0.006298422607435483, + "grad_norm": 0.6276320219039917, + "learning_rate": 3.766666666666667e-05, + "loss": 1.7702, + "step": 113 + }, + { + "epoch": 0.006354160860598629, + "grad_norm": 0.6891281008720398, + "learning_rate": 3.8e-05, + "loss": 1.9062, + "step": 114 + }, + { + "epoch": 0.0064098991137617745, + "grad_norm": 0.7155683636665344, + "learning_rate": 3.8333333333333334e-05, + "loss": 1.8527, + "step": 115 + }, + { + "epoch": 0.006465637366924921, + "grad_norm": 0.6917515397071838, + "learning_rate": 3.866666666666667e-05, + "loss": 1.8439, + "step": 116 + }, + { + "epoch": 0.006521375620088066, + "grad_norm": 0.7216237783432007, + "learning_rate": 3.9000000000000006e-05, + "loss": 2.0114, + "step": 117 + }, + { + "epoch": 0.0065771138732512125, + "grad_norm": 0.6636412739753723, + "learning_rate": 3.933333333333333e-05, + "loss": 1.6951, + "step": 118 + }, + { + "epoch": 0.006632852126414358, + "grad_norm": 0.7715172171592712, + "learning_rate": 3.966666666666667e-05, + "loss": 1.9907, + "step": 119 + }, + { + "epoch": 0.006688590379577504, + "grad_norm": 0.6481485366821289, + "learning_rate": 4e-05, + "loss": 1.7934, + "step": 120 + }, + { + "epoch": 0.00674432863274065, + "grad_norm": 0.6104344725608826, + "learning_rate": 4.0333333333333336e-05, + "loss": 1.6549, + "step": 121 + }, + { + "epoch": 0.006800066885903796, + "grad_norm": 0.706912100315094, + "learning_rate": 4.066666666666667e-05, + "loss": 1.9666, + "step": 122 + }, + { + "epoch": 0.0068558051390669415, + "grad_norm": 0.7835676670074463, + "learning_rate": 4.1e-05, + "loss": 2.024, + "step": 123 + }, + { + "epoch": 0.006911543392230088, + "grad_norm": 0.6462398171424866, + "learning_rate": 4.133333333333333e-05, + "loss": 1.6993, + "step": 124 + }, + { + "epoch": 0.006967281645393233, + "grad_norm": 0.7756698727607727, + "learning_rate": 4.166666666666667e-05, + "loss": 2.0135, + "step": 125 + }, + { + "epoch": 0.00702301989855638, + "grad_norm": 0.6666940450668335, + "learning_rate": 4.2e-05, + "loss": 1.9444, + "step": 126 + }, + { + "epoch": 0.007078758151719525, + "grad_norm": 0.6363375782966614, + "learning_rate": 4.233333333333334e-05, + "loss": 1.6977, + "step": 127 + }, + { + "epoch": 0.007134496404882671, + "grad_norm": 0.6881687045097351, + "learning_rate": 4.266666666666667e-05, + "loss": 1.7938, + "step": 128 + }, + { + "epoch": 0.007190234658045817, + "grad_norm": 0.7950214147567749, + "learning_rate": 4.3e-05, + "loss": 2.1036, + "step": 129 + }, + { + "epoch": 0.007245972911208962, + "grad_norm": 0.6743674874305725, + "learning_rate": 4.3333333333333334e-05, + "loss": 2.0052, + "step": 130 + }, + { + "epoch": 0.007301711164372109, + "grad_norm": 0.7302188277244568, + "learning_rate": 4.3666666666666666e-05, + "loss": 1.7815, + "step": 131 + }, + { + "epoch": 0.007357449417535254, + "grad_norm": 0.691747784614563, + "learning_rate": 4.4000000000000006e-05, + "loss": 1.7225, + "step": 132 + }, + { + "epoch": 0.0074131876706984, + "grad_norm": 0.6021103262901306, + "learning_rate": 4.433333333333334e-05, + "loss": 1.5821, + "step": 133 + }, + { + "epoch": 0.007468925923861546, + "grad_norm": 0.7083866000175476, + "learning_rate": 4.466666666666667e-05, + "loss": 1.7831, + "step": 134 + }, + { + "epoch": 0.007524664177024692, + "grad_norm": 0.6396238207817078, + "learning_rate": 4.5e-05, + "loss": 1.7933, + "step": 135 + }, + { + "epoch": 0.007580402430187838, + "grad_norm": 0.6446027159690857, + "learning_rate": 4.5333333333333335e-05, + "loss": 1.697, + "step": 136 + }, + { + "epoch": 0.007636140683350984, + "grad_norm": 0.6570568084716797, + "learning_rate": 4.566666666666667e-05, + "loss": 1.8226, + "step": 137 + }, + { + "epoch": 0.007691878936514129, + "grad_norm": 0.7829813361167908, + "learning_rate": 4.600000000000001e-05, + "loss": 1.9071, + "step": 138 + }, + { + "epoch": 0.007747617189677276, + "grad_norm": 0.6894962787628174, + "learning_rate": 4.633333333333333e-05, + "loss": 1.8796, + "step": 139 + }, + { + "epoch": 0.007803355442840421, + "grad_norm": 0.6631702184677124, + "learning_rate": 4.666666666666667e-05, + "loss": 1.7765, + "step": 140 + }, + { + "epoch": 0.007859093696003567, + "grad_norm": 0.7325467467308044, + "learning_rate": 4.7e-05, + "loss": 1.9653, + "step": 141 + }, + { + "epoch": 0.007914831949166713, + "grad_norm": 0.7264820337295532, + "learning_rate": 4.7333333333333336e-05, + "loss": 1.9019, + "step": 142 + }, + { + "epoch": 0.00797057020232986, + "grad_norm": 0.6573049426078796, + "learning_rate": 4.766666666666667e-05, + "loss": 1.8028, + "step": 143 + }, + { + "epoch": 0.008026308455493006, + "grad_norm": 0.6475189328193665, + "learning_rate": 4.8e-05, + "loss": 1.8229, + "step": 144 + }, + { + "epoch": 0.00808204670865615, + "grad_norm": 0.6277217864990234, + "learning_rate": 4.8333333333333334e-05, + "loss": 1.8648, + "step": 145 + }, + { + "epoch": 0.008137784961819296, + "grad_norm": 0.6631461381912231, + "learning_rate": 4.866666666666667e-05, + "loss": 1.7499, + "step": 146 + }, + { + "epoch": 0.008193523214982443, + "grad_norm": 0.8212792873382568, + "learning_rate": 4.9e-05, + "loss": 1.9345, + "step": 147 + }, + { + "epoch": 0.008249261468145589, + "grad_norm": 0.6783550977706909, + "learning_rate": 4.933333333333334e-05, + "loss": 2.0028, + "step": 148 + }, + { + "epoch": 0.008304999721308734, + "grad_norm": 0.7066723704338074, + "learning_rate": 4.966666666666667e-05, + "loss": 2.0291, + "step": 149 + }, + { + "epoch": 0.00836073797447188, + "grad_norm": 0.772089958190918, + "learning_rate": 5e-05, + "loss": 2.0909, + "step": 150 + }, + { + "epoch": 0.008416476227635026, + "grad_norm": 0.6396070718765259, + "learning_rate": 5.0333333333333335e-05, + "loss": 1.75, + "step": 151 + }, + { + "epoch": 0.008472214480798173, + "grad_norm": 0.6549371480941772, + "learning_rate": 5.0666666666666674e-05, + "loss": 1.8499, + "step": 152 + }, + { + "epoch": 0.008527952733961317, + "grad_norm": 0.7041524648666382, + "learning_rate": 5.1000000000000006e-05, + "loss": 1.9604, + "step": 153 + }, + { + "epoch": 0.008583690987124463, + "grad_norm": 0.6144838929176331, + "learning_rate": 5.133333333333333e-05, + "loss": 1.813, + "step": 154 + }, + { + "epoch": 0.00863942924028761, + "grad_norm": 0.5433954000473022, + "learning_rate": 5.166666666666667e-05, + "loss": 1.7692, + "step": 155 + }, + { + "epoch": 0.008695167493450754, + "grad_norm": 0.6341120600700378, + "learning_rate": 5.2000000000000004e-05, + "loss": 1.8756, + "step": 156 + }, + { + "epoch": 0.0087509057466139, + "grad_norm": 0.6475428938865662, + "learning_rate": 5.2333333333333336e-05, + "loss": 2.0465, + "step": 157 + }, + { + "epoch": 0.008806643999777047, + "grad_norm": 0.6457498669624329, + "learning_rate": 5.266666666666666e-05, + "loss": 1.9387, + "step": 158 + }, + { + "epoch": 0.008862382252940193, + "grad_norm": 0.562533974647522, + "learning_rate": 5.300000000000001e-05, + "loss": 1.7746, + "step": 159 + }, + { + "epoch": 0.008918120506103338, + "grad_norm": 0.6415228247642517, + "learning_rate": 5.333333333333333e-05, + "loss": 1.7729, + "step": 160 + }, + { + "epoch": 0.008973858759266484, + "grad_norm": 0.6404130458831787, + "learning_rate": 5.3666666666666666e-05, + "loss": 1.7488, + "step": 161 + }, + { + "epoch": 0.00902959701242963, + "grad_norm": 0.6626627445220947, + "learning_rate": 5.4000000000000005e-05, + "loss": 1.8962, + "step": 162 + }, + { + "epoch": 0.009085335265592777, + "grad_norm": 0.6191387176513672, + "learning_rate": 5.433333333333334e-05, + "loss": 1.8141, + "step": 163 + }, + { + "epoch": 0.009141073518755921, + "grad_norm": 0.5454838871955872, + "learning_rate": 5.466666666666666e-05, + "loss": 1.5107, + "step": 164 + }, + { + "epoch": 0.009196811771919068, + "grad_norm": 0.6767019033432007, + "learning_rate": 5.500000000000001e-05, + "loss": 2.1324, + "step": 165 + }, + { + "epoch": 0.009252550025082214, + "grad_norm": 0.6267591714859009, + "learning_rate": 5.5333333333333334e-05, + "loss": 1.7378, + "step": 166 + }, + { + "epoch": 0.00930828827824536, + "grad_norm": 0.5743867754936218, + "learning_rate": 5.566666666666667e-05, + "loss": 1.7654, + "step": 167 + }, + { + "epoch": 0.009364026531408505, + "grad_norm": 0.5550642013549805, + "learning_rate": 5.6000000000000006e-05, + "loss": 1.8091, + "step": 168 + }, + { + "epoch": 0.009419764784571651, + "grad_norm": 0.5943305492401123, + "learning_rate": 5.633333333333334e-05, + "loss": 1.6823, + "step": 169 + }, + { + "epoch": 0.009475503037734798, + "grad_norm": 0.6027736663818359, + "learning_rate": 5.666666666666667e-05, + "loss": 1.7736, + "step": 170 + }, + { + "epoch": 0.009531241290897944, + "grad_norm": 0.6379444003105164, + "learning_rate": 5.6999999999999996e-05, + "loss": 2.0331, + "step": 171 + }, + { + "epoch": 0.009586979544061089, + "grad_norm": 0.6117588877677917, + "learning_rate": 5.7333333333333336e-05, + "loss": 1.8546, + "step": 172 + }, + { + "epoch": 0.009642717797224235, + "grad_norm": 0.6109329462051392, + "learning_rate": 5.766666666666667e-05, + "loss": 2.0427, + "step": 173 + }, + { + "epoch": 0.009698456050387381, + "grad_norm": 0.5530399084091187, + "learning_rate": 5.8e-05, + "loss": 1.7323, + "step": 174 + }, + { + "epoch": 0.009754194303550527, + "grad_norm": 0.7092908024787903, + "learning_rate": 5.833333333333334e-05, + "loss": 2.2, + "step": 175 + }, + { + "epoch": 0.009809932556713672, + "grad_norm": 0.5897237658500671, + "learning_rate": 5.866666666666667e-05, + "loss": 1.5879, + "step": 176 + }, + { + "epoch": 0.009865670809876818, + "grad_norm": 0.5485551357269287, + "learning_rate": 5.9e-05, + "loss": 1.6043, + "step": 177 + }, + { + "epoch": 0.009921409063039965, + "grad_norm": 0.5792586803436279, + "learning_rate": 5.9333333333333343e-05, + "loss": 1.8772, + "step": 178 + }, + { + "epoch": 0.009977147316203111, + "grad_norm": 0.6716285943984985, + "learning_rate": 5.966666666666667e-05, + "loss": 1.7887, + "step": 179 + }, + { + "epoch": 0.010032885569366256, + "grad_norm": 0.5866957902908325, + "learning_rate": 6e-05, + "loss": 1.7228, + "step": 180 + }, + { + "epoch": 0.010088623822529402, + "grad_norm": 0.6197178363800049, + "learning_rate": 6.033333333333334e-05, + "loss": 1.7767, + "step": 181 + }, + { + "epoch": 0.010144362075692548, + "grad_norm": 0.6811436414718628, + "learning_rate": 6.066666666666667e-05, + "loss": 2.002, + "step": 182 + }, + { + "epoch": 0.010200100328855693, + "grad_norm": 0.6519239544868469, + "learning_rate": 6.1e-05, + "loss": 1.7755, + "step": 183 + }, + { + "epoch": 0.01025583858201884, + "grad_norm": 0.5758973360061646, + "learning_rate": 6.133333333333334e-05, + "loss": 1.7244, + "step": 184 + }, + { + "epoch": 0.010311576835181985, + "grad_norm": 0.5882923007011414, + "learning_rate": 6.166666666666667e-05, + "loss": 1.8041, + "step": 185 + }, + { + "epoch": 0.010367315088345132, + "grad_norm": 0.5509873032569885, + "learning_rate": 6.2e-05, + "loss": 1.7813, + "step": 186 + }, + { + "epoch": 0.010423053341508276, + "grad_norm": 0.5870537757873535, + "learning_rate": 6.233333333333334e-05, + "loss": 1.9419, + "step": 187 + }, + { + "epoch": 0.010478791594671423, + "grad_norm": 0.5315700173377991, + "learning_rate": 6.266666666666667e-05, + "loss": 1.6804, + "step": 188 + }, + { + "epoch": 0.010534529847834569, + "grad_norm": 0.5694735646247864, + "learning_rate": 6.3e-05, + "loss": 1.8406, + "step": 189 + }, + { + "epoch": 0.010590268100997715, + "grad_norm": 0.5579227209091187, + "learning_rate": 6.333333333333333e-05, + "loss": 1.9451, + "step": 190 + }, + { + "epoch": 0.01064600635416086, + "grad_norm": 0.5777730941772461, + "learning_rate": 6.366666666666668e-05, + "loss": 1.7783, + "step": 191 + }, + { + "epoch": 0.010701744607324006, + "grad_norm": 0.5626804828643799, + "learning_rate": 6.400000000000001e-05, + "loss": 1.8944, + "step": 192 + }, + { + "epoch": 0.010757482860487153, + "grad_norm": 0.5726325511932373, + "learning_rate": 6.433333333333333e-05, + "loss": 1.8799, + "step": 193 + }, + { + "epoch": 0.010813221113650299, + "grad_norm": 0.6156812906265259, + "learning_rate": 6.466666666666666e-05, + "loss": 1.8651, + "step": 194 + }, + { + "epoch": 0.010868959366813443, + "grad_norm": 0.545893669128418, + "learning_rate": 6.500000000000001e-05, + "loss": 1.6938, + "step": 195 + }, + { + "epoch": 0.01092469761997659, + "grad_norm": 0.5374442934989929, + "learning_rate": 6.533333333333334e-05, + "loss": 1.756, + "step": 196 + }, + { + "epoch": 0.010980435873139736, + "grad_norm": 0.5943235754966736, + "learning_rate": 6.566666666666666e-05, + "loss": 1.8388, + "step": 197 + }, + { + "epoch": 0.011036174126302882, + "grad_norm": 0.7199476361274719, + "learning_rate": 6.6e-05, + "loss": 2.0311, + "step": 198 + }, + { + "epoch": 0.011091912379466027, + "grad_norm": 0.65143883228302, + "learning_rate": 6.633333333333334e-05, + "loss": 2.0285, + "step": 199 + }, + { + "epoch": 0.011147650632629173, + "grad_norm": 0.5984755754470825, + "learning_rate": 6.666666666666667e-05, + "loss": 1.7062, + "step": 200 + }, + { + "epoch": 0.01120338888579232, + "grad_norm": 0.5733404755592346, + "learning_rate": 6.7e-05, + "loss": 1.916, + "step": 201 + }, + { + "epoch": 0.011259127138955466, + "grad_norm": 0.5946204662322998, + "learning_rate": 6.733333333333333e-05, + "loss": 1.9394, + "step": 202 + }, + { + "epoch": 0.01131486539211861, + "grad_norm": 0.677741527557373, + "learning_rate": 6.766666666666667e-05, + "loss": 2.248, + "step": 203 + }, + { + "epoch": 0.011370603645281757, + "grad_norm": 0.5983121991157532, + "learning_rate": 6.800000000000001e-05, + "loss": 1.835, + "step": 204 + }, + { + "epoch": 0.011426341898444903, + "grad_norm": 0.5219351053237915, + "learning_rate": 6.833333333333333e-05, + "loss": 1.7373, + "step": 205 + }, + { + "epoch": 0.01148208015160805, + "grad_norm": 0.657131552696228, + "learning_rate": 6.866666666666666e-05, + "loss": 2.1801, + "step": 206 + }, + { + "epoch": 0.011537818404771194, + "grad_norm": 0.6068251132965088, + "learning_rate": 6.9e-05, + "loss": 1.7873, + "step": 207 + }, + { + "epoch": 0.01159355665793434, + "grad_norm": 0.5744972825050354, + "learning_rate": 6.933333333333334e-05, + "loss": 1.9491, + "step": 208 + }, + { + "epoch": 0.011649294911097487, + "grad_norm": 0.5395380854606628, + "learning_rate": 6.966666666666668e-05, + "loss": 1.7532, + "step": 209 + }, + { + "epoch": 0.011705033164260631, + "grad_norm": 0.5843316912651062, + "learning_rate": 7e-05, + "loss": 1.7694, + "step": 210 + }, + { + "epoch": 0.011760771417423778, + "grad_norm": 0.6699615716934204, + "learning_rate": 7.033333333333334e-05, + "loss": 2.2063, + "step": 211 + }, + { + "epoch": 0.011816509670586924, + "grad_norm": 0.5723788738250732, + "learning_rate": 7.066666666666667e-05, + "loss": 1.8842, + "step": 212 + }, + { + "epoch": 0.01187224792375007, + "grad_norm": 0.5478008985519409, + "learning_rate": 7.1e-05, + "loss": 1.7411, + "step": 213 + }, + { + "epoch": 0.011927986176913215, + "grad_norm": 0.567477285861969, + "learning_rate": 7.133333333333334e-05, + "loss": 1.8457, + "step": 214 + }, + { + "epoch": 0.011983724430076361, + "grad_norm": 0.5568417310714722, + "learning_rate": 7.166666666666667e-05, + "loss": 1.8425, + "step": 215 + }, + { + "epoch": 0.012039462683239507, + "grad_norm": 0.552416205406189, + "learning_rate": 7.2e-05, + "loss": 1.9535, + "step": 216 + }, + { + "epoch": 0.012095200936402654, + "grad_norm": 0.6089819073677063, + "learning_rate": 7.233333333333335e-05, + "loss": 1.8465, + "step": 217 + }, + { + "epoch": 0.012150939189565798, + "grad_norm": 0.6218812465667725, + "learning_rate": 7.266666666666667e-05, + "loss": 2.1711, + "step": 218 + }, + { + "epoch": 0.012206677442728945, + "grad_norm": 0.5704020261764526, + "learning_rate": 7.3e-05, + "loss": 1.7793, + "step": 219 + }, + { + "epoch": 0.012262415695892091, + "grad_norm": 0.5598061084747314, + "learning_rate": 7.333333333333333e-05, + "loss": 1.9454, + "step": 220 + }, + { + "epoch": 0.012318153949055237, + "grad_norm": 0.5439260601997375, + "learning_rate": 7.366666666666668e-05, + "loss": 1.8544, + "step": 221 + }, + { + "epoch": 0.012373892202218382, + "grad_norm": 0.5953371524810791, + "learning_rate": 7.4e-05, + "loss": 1.8335, + "step": 222 + }, + { + "epoch": 0.012429630455381528, + "grad_norm": 0.5699326395988464, + "learning_rate": 7.433333333333333e-05, + "loss": 1.6647, + "step": 223 + }, + { + "epoch": 0.012485368708544674, + "grad_norm": 0.5833302140235901, + "learning_rate": 7.466666666666667e-05, + "loss": 1.9092, + "step": 224 + }, + { + "epoch": 0.01254110696170782, + "grad_norm": 0.5663686394691467, + "learning_rate": 7.500000000000001e-05, + "loss": 1.7344, + "step": 225 + }, + { + "epoch": 0.012596845214870965, + "grad_norm": 0.5459832549095154, + "learning_rate": 7.533333333333334e-05, + "loss": 1.6805, + "step": 226 + }, + { + "epoch": 0.012652583468034112, + "grad_norm": 0.6193357110023499, + "learning_rate": 7.566666666666667e-05, + "loss": 1.6711, + "step": 227 + }, + { + "epoch": 0.012708321721197258, + "grad_norm": 0.6414167284965515, + "learning_rate": 7.6e-05, + "loss": 1.9194, + "step": 228 + }, + { + "epoch": 0.012764059974360404, + "grad_norm": 0.541812539100647, + "learning_rate": 7.633333333333334e-05, + "loss": 1.9374, + "step": 229 + }, + { + "epoch": 0.012819798227523549, + "grad_norm": 0.5368767976760864, + "learning_rate": 7.666666666666667e-05, + "loss": 1.605, + "step": 230 + }, + { + "epoch": 0.012875536480686695, + "grad_norm": 0.622112512588501, + "learning_rate": 7.7e-05, + "loss": 1.804, + "step": 231 + }, + { + "epoch": 0.012931274733849842, + "grad_norm": 0.5820221900939941, + "learning_rate": 7.733333333333333e-05, + "loss": 1.796, + "step": 232 + }, + { + "epoch": 0.012987012987012988, + "grad_norm": 0.5530866980552673, + "learning_rate": 7.766666666666667e-05, + "loss": 1.704, + "step": 233 + }, + { + "epoch": 0.013042751240176132, + "grad_norm": 0.5967001914978027, + "learning_rate": 7.800000000000001e-05, + "loss": 2.0598, + "step": 234 + }, + { + "epoch": 0.013098489493339279, + "grad_norm": 0.5761673450469971, + "learning_rate": 7.833333333333333e-05, + "loss": 1.9391, + "step": 235 + }, + { + "epoch": 0.013154227746502425, + "grad_norm": 0.582139253616333, + "learning_rate": 7.866666666666666e-05, + "loss": 1.851, + "step": 236 + }, + { + "epoch": 0.01320996599966557, + "grad_norm": 0.6047868132591248, + "learning_rate": 7.900000000000001e-05, + "loss": 1.9757, + "step": 237 + }, + { + "epoch": 0.013265704252828716, + "grad_norm": 0.6394466757774353, + "learning_rate": 7.933333333333334e-05, + "loss": 2.2063, + "step": 238 + }, + { + "epoch": 0.013321442505991862, + "grad_norm": 0.6129965782165527, + "learning_rate": 7.966666666666666e-05, + "loss": 1.8813, + "step": 239 + }, + { + "epoch": 0.013377180759155009, + "grad_norm": 0.5982023477554321, + "learning_rate": 8e-05, + "loss": 1.928, + "step": 240 + }, + { + "epoch": 0.013432919012318153, + "grad_norm": 0.515180230140686, + "learning_rate": 8.033333333333334e-05, + "loss": 1.5582, + "step": 241 + }, + { + "epoch": 0.0134886572654813, + "grad_norm": 0.669916033744812, + "learning_rate": 8.066666666666667e-05, + "loss": 2.1044, + "step": 242 + }, + { + "epoch": 0.013544395518644446, + "grad_norm": 0.5825132131576538, + "learning_rate": 8.1e-05, + "loss": 1.7521, + "step": 243 + }, + { + "epoch": 0.013600133771807592, + "grad_norm": 0.6118985414505005, + "learning_rate": 8.133333333333334e-05, + "loss": 1.9605, + "step": 244 + }, + { + "epoch": 0.013655872024970737, + "grad_norm": 0.5747547745704651, + "learning_rate": 8.166666666666667e-05, + "loss": 1.8198, + "step": 245 + }, + { + "epoch": 0.013711610278133883, + "grad_norm": 0.609553337097168, + "learning_rate": 8.2e-05, + "loss": 2.0001, + "step": 246 + }, + { + "epoch": 0.01376734853129703, + "grad_norm": 0.5751491189002991, + "learning_rate": 8.233333333333333e-05, + "loss": 1.9317, + "step": 247 + }, + { + "epoch": 0.013823086784460176, + "grad_norm": 0.599029541015625, + "learning_rate": 8.266666666666667e-05, + "loss": 1.7716, + "step": 248 + }, + { + "epoch": 0.01387882503762332, + "grad_norm": 0.5347121953964233, + "learning_rate": 8.3e-05, + "loss": 1.82, + "step": 249 + }, + { + "epoch": 0.013934563290786467, + "grad_norm": 0.5724605917930603, + "learning_rate": 8.333333333333334e-05, + "loss": 1.8309, + "step": 250 + }, + { + "epoch": 0.013990301543949613, + "grad_norm": 0.531136691570282, + "learning_rate": 8.366666666666668e-05, + "loss": 1.682, + "step": 251 + }, + { + "epoch": 0.01404603979711276, + "grad_norm": 0.5464481115341187, + "learning_rate": 8.4e-05, + "loss": 2.001, + "step": 252 + }, + { + "epoch": 0.014101778050275904, + "grad_norm": 0.5945254564285278, + "learning_rate": 8.433333333333334e-05, + "loss": 1.7766, + "step": 253 + }, + { + "epoch": 0.01415751630343905, + "grad_norm": 0.5452976226806641, + "learning_rate": 8.466666666666667e-05, + "loss": 1.6948, + "step": 254 + }, + { + "epoch": 0.014213254556602196, + "grad_norm": 0.5722144842147827, + "learning_rate": 8.5e-05, + "loss": 1.8978, + "step": 255 + }, + { + "epoch": 0.014268992809765343, + "grad_norm": 0.5629029870033264, + "learning_rate": 8.533333333333334e-05, + "loss": 1.7381, + "step": 256 + }, + { + "epoch": 0.014324731062928487, + "grad_norm": 0.584661066532135, + "learning_rate": 8.566666666666667e-05, + "loss": 1.7016, + "step": 257 + }, + { + "epoch": 0.014380469316091634, + "grad_norm": 0.544104814529419, + "learning_rate": 8.6e-05, + "loss": 1.8649, + "step": 258 + }, + { + "epoch": 0.01443620756925478, + "grad_norm": 0.5734279751777649, + "learning_rate": 8.633333333333334e-05, + "loss": 1.7844, + "step": 259 + }, + { + "epoch": 0.014491945822417925, + "grad_norm": 0.5523878335952759, + "learning_rate": 8.666666666666667e-05, + "loss": 2.0572, + "step": 260 + }, + { + "epoch": 0.014547684075581071, + "grad_norm": 0.5634390115737915, + "learning_rate": 8.7e-05, + "loss": 1.8073, + "step": 261 + }, + { + "epoch": 0.014603422328744217, + "grad_norm": 0.5875604152679443, + "learning_rate": 8.733333333333333e-05, + "loss": 1.9706, + "step": 262 + }, + { + "epoch": 0.014659160581907364, + "grad_norm": 0.534288227558136, + "learning_rate": 8.766666666666668e-05, + "loss": 1.7742, + "step": 263 + }, + { + "epoch": 0.014714898835070508, + "grad_norm": 0.5286023020744324, + "learning_rate": 8.800000000000001e-05, + "loss": 1.6763, + "step": 264 + }, + { + "epoch": 0.014770637088233654, + "grad_norm": 0.5768111944198608, + "learning_rate": 8.833333333333333e-05, + "loss": 1.5731, + "step": 265 + }, + { + "epoch": 0.0148263753413968, + "grad_norm": 0.552629292011261, + "learning_rate": 8.866666666666668e-05, + "loss": 1.9837, + "step": 266 + }, + { + "epoch": 0.014882113594559947, + "grad_norm": 0.5081507563591003, + "learning_rate": 8.900000000000001e-05, + "loss": 1.8844, + "step": 267 + }, + { + "epoch": 0.014937851847723092, + "grad_norm": 0.563845694065094, + "learning_rate": 8.933333333333334e-05, + "loss": 1.9141, + "step": 268 + }, + { + "epoch": 0.014993590100886238, + "grad_norm": 0.5855246186256409, + "learning_rate": 8.966666666666666e-05, + "loss": 2.1101, + "step": 269 + }, + { + "epoch": 0.015049328354049384, + "grad_norm": 0.5010532736778259, + "learning_rate": 9e-05, + "loss": 1.8388, + "step": 270 + }, + { + "epoch": 0.01510506660721253, + "grad_norm": 0.5565475225448608, + "learning_rate": 9.033333333333334e-05, + "loss": 1.8648, + "step": 271 + }, + { + "epoch": 0.015160804860375675, + "grad_norm": 0.5293692350387573, + "learning_rate": 9.066666666666667e-05, + "loss": 1.7059, + "step": 272 + }, + { + "epoch": 0.015216543113538821, + "grad_norm": 0.5180760025978088, + "learning_rate": 9.1e-05, + "loss": 1.8659, + "step": 273 + }, + { + "epoch": 0.015272281366701968, + "grad_norm": 0.5416427254676819, + "learning_rate": 9.133333333333334e-05, + "loss": 1.6187, + "step": 274 + }, + { + "epoch": 0.015328019619865114, + "grad_norm": 0.603060781955719, + "learning_rate": 9.166666666666667e-05, + "loss": 1.8554, + "step": 275 + }, + { + "epoch": 0.015383757873028259, + "grad_norm": 0.5260182023048401, + "learning_rate": 9.200000000000001e-05, + "loss": 1.8108, + "step": 276 + }, + { + "epoch": 0.015439496126191405, + "grad_norm": 0.5307485461235046, + "learning_rate": 9.233333333333333e-05, + "loss": 1.7369, + "step": 277 + }, + { + "epoch": 0.015495234379354551, + "grad_norm": 0.5671928524971008, + "learning_rate": 9.266666666666666e-05, + "loss": 1.7879, + "step": 278 + }, + { + "epoch": 0.015550972632517698, + "grad_norm": 0.5482888221740723, + "learning_rate": 9.300000000000001e-05, + "loss": 1.8687, + "step": 279 + }, + { + "epoch": 0.015606710885680842, + "grad_norm": 0.5492271184921265, + "learning_rate": 9.333333333333334e-05, + "loss": 2.0486, + "step": 280 + }, + { + "epoch": 0.01566244913884399, + "grad_norm": 0.5533493757247925, + "learning_rate": 9.366666666666668e-05, + "loss": 1.8764, + "step": 281 + }, + { + "epoch": 0.015718187392007133, + "grad_norm": 0.5373388528823853, + "learning_rate": 9.4e-05, + "loss": 1.8098, + "step": 282 + }, + { + "epoch": 0.01577392564517028, + "grad_norm": 0.5737355351448059, + "learning_rate": 9.433333333333334e-05, + "loss": 1.8023, + "step": 283 + }, + { + "epoch": 0.015829663898333426, + "grad_norm": 0.6059421896934509, + "learning_rate": 9.466666666666667e-05, + "loss": 1.9003, + "step": 284 + }, + { + "epoch": 0.015885402151496572, + "grad_norm": 0.545070230960846, + "learning_rate": 9.5e-05, + "loss": 1.6793, + "step": 285 + }, + { + "epoch": 0.01594114040465972, + "grad_norm": 0.5391154885292053, + "learning_rate": 9.533333333333334e-05, + "loss": 1.7691, + "step": 286 + }, + { + "epoch": 0.015996878657822865, + "grad_norm": 0.5233768820762634, + "learning_rate": 9.566666666666667e-05, + "loss": 1.8312, + "step": 287 + }, + { + "epoch": 0.01605261691098601, + "grad_norm": 0.5520955920219421, + "learning_rate": 9.6e-05, + "loss": 1.9652, + "step": 288 + }, + { + "epoch": 0.016108355164149154, + "grad_norm": 0.5521306991577148, + "learning_rate": 9.633333333333335e-05, + "loss": 1.8264, + "step": 289 + }, + { + "epoch": 0.0161640934173123, + "grad_norm": 0.5325077176094055, + "learning_rate": 9.666666666666667e-05, + "loss": 1.9074, + "step": 290 + }, + { + "epoch": 0.016219831670475447, + "grad_norm": 0.5402048230171204, + "learning_rate": 9.7e-05, + "loss": 1.9993, + "step": 291 + }, + { + "epoch": 0.016275569923638593, + "grad_norm": 0.5164310336112976, + "learning_rate": 9.733333333333335e-05, + "loss": 1.6385, + "step": 292 + }, + { + "epoch": 0.01633130817680174, + "grad_norm": 0.5265329480171204, + "learning_rate": 9.766666666666668e-05, + "loss": 1.8513, + "step": 293 + }, + { + "epoch": 0.016387046429964885, + "grad_norm": 0.5051769614219666, + "learning_rate": 9.8e-05, + "loss": 1.7628, + "step": 294 + }, + { + "epoch": 0.016442784683128032, + "grad_norm": 0.5061401128768921, + "learning_rate": 9.833333333333333e-05, + "loss": 1.8406, + "step": 295 + }, + { + "epoch": 0.016498522936291178, + "grad_norm": 0.6622328162193298, + "learning_rate": 9.866666666666668e-05, + "loss": 1.9504, + "step": 296 + }, + { + "epoch": 0.01655426118945432, + "grad_norm": 0.5525157451629639, + "learning_rate": 9.900000000000001e-05, + "loss": 1.9845, + "step": 297 + }, + { + "epoch": 0.016609999442617467, + "grad_norm": 0.5412437319755554, + "learning_rate": 9.933333333333334e-05, + "loss": 1.8234, + "step": 298 + }, + { + "epoch": 0.016665737695780614, + "grad_norm": 0.53217613697052, + "learning_rate": 9.966666666666667e-05, + "loss": 1.6132, + "step": 299 + }, + { + "epoch": 0.01672147594894376, + "grad_norm": 0.6531130075454712, + "learning_rate": 0.0001, + "loss": 2.0395, + "step": 300 + }, + { + "epoch": 0.016777214202106906, + "grad_norm": 0.49301308393478394, + "learning_rate": 9.999999920714576e-05, + "loss": 1.6945, + "step": 301 + }, + { + "epoch": 0.016832952455270053, + "grad_norm": 0.49394482374191284, + "learning_rate": 9.999999682858307e-05, + "loss": 1.6877, + "step": 302 + }, + { + "epoch": 0.0168886907084332, + "grad_norm": 0.504688024520874, + "learning_rate": 9.9999992864312e-05, + "loss": 1.6779, + "step": 303 + }, + { + "epoch": 0.016944428961596345, + "grad_norm": 0.5286409258842468, + "learning_rate": 9.999998731433267e-05, + "loss": 1.64, + "step": 304 + }, + { + "epoch": 0.017000167214759488, + "grad_norm": 0.4911554157733917, + "learning_rate": 9.999998017864527e-05, + "loss": 1.66, + "step": 305 + }, + { + "epoch": 0.017055905467922634, + "grad_norm": 0.4851885735988617, + "learning_rate": 9.999997145725001e-05, + "loss": 1.8884, + "step": 306 + }, + { + "epoch": 0.01711164372108578, + "grad_norm": 0.521120011806488, + "learning_rate": 9.999996115014719e-05, + "loss": 1.6844, + "step": 307 + }, + { + "epoch": 0.017167381974248927, + "grad_norm": 0.5494885444641113, + "learning_rate": 9.99999492573371e-05, + "loss": 1.7733, + "step": 308 + }, + { + "epoch": 0.017223120227412073, + "grad_norm": 0.4475904703140259, + "learning_rate": 9.999993577882016e-05, + "loss": 1.6295, + "step": 309 + }, + { + "epoch": 0.01727885848057522, + "grad_norm": 0.4610547721385956, + "learning_rate": 9.999992071459676e-05, + "loss": 1.6118, + "step": 310 + }, + { + "epoch": 0.017334596733738366, + "grad_norm": 0.49445369839668274, + "learning_rate": 9.999990406466741e-05, + "loss": 1.594, + "step": 311 + }, + { + "epoch": 0.01739033498690151, + "grad_norm": 0.5013507008552551, + "learning_rate": 9.999988582903262e-05, + "loss": 1.6829, + "step": 312 + }, + { + "epoch": 0.017446073240064655, + "grad_norm": 0.5492314100265503, + "learning_rate": 9.999986600769295e-05, + "loss": 1.662, + "step": 313 + }, + { + "epoch": 0.0175018114932278, + "grad_norm": 0.49456071853637695, + "learning_rate": 9.999984460064908e-05, + "loss": 1.7087, + "step": 314 + }, + { + "epoch": 0.017557549746390948, + "grad_norm": 0.587954580783844, + "learning_rate": 9.999982160790164e-05, + "loss": 1.8628, + "step": 315 + }, + { + "epoch": 0.017613287999554094, + "grad_norm": 0.6061418652534485, + "learning_rate": 9.999979702945138e-05, + "loss": 2.143, + "step": 316 + }, + { + "epoch": 0.01766902625271724, + "grad_norm": 0.52556973695755, + "learning_rate": 9.999977086529909e-05, + "loss": 1.6862, + "step": 317 + }, + { + "epoch": 0.017724764505880387, + "grad_norm": 0.5804201364517212, + "learning_rate": 9.999974311544556e-05, + "loss": 1.8495, + "step": 318 + }, + { + "epoch": 0.017780502759043533, + "grad_norm": 0.5533789396286011, + "learning_rate": 9.999971377989172e-05, + "loss": 1.9501, + "step": 319 + }, + { + "epoch": 0.017836241012206676, + "grad_norm": 0.5596528649330139, + "learning_rate": 9.999968285863848e-05, + "loss": 1.981, + "step": 320 + }, + { + "epoch": 0.017891979265369822, + "grad_norm": 0.538735568523407, + "learning_rate": 9.99996503516868e-05, + "loss": 1.9126, + "step": 321 + }, + { + "epoch": 0.01794771751853297, + "grad_norm": 0.48604801297187805, + "learning_rate": 9.999961625903774e-05, + "loss": 1.7568, + "step": 322 + }, + { + "epoch": 0.018003455771696115, + "grad_norm": 0.5091099143028259, + "learning_rate": 9.999958058069237e-05, + "loss": 1.9625, + "step": 323 + }, + { + "epoch": 0.01805919402485926, + "grad_norm": 0.4944256842136383, + "learning_rate": 9.999954331665182e-05, + "loss": 1.6326, + "step": 324 + }, + { + "epoch": 0.018114932278022407, + "grad_norm": 0.5379263162612915, + "learning_rate": 9.999950446691728e-05, + "loss": 1.8484, + "step": 325 + }, + { + "epoch": 0.018170670531185554, + "grad_norm": 0.5548909306526184, + "learning_rate": 9.999946403148997e-05, + "loss": 1.8855, + "step": 326 + }, + { + "epoch": 0.0182264087843487, + "grad_norm": 0.5878908634185791, + "learning_rate": 9.999942201037118e-05, + "loss": 1.8222, + "step": 327 + }, + { + "epoch": 0.018282147037511843, + "grad_norm": 0.48953092098236084, + "learning_rate": 9.999937840356224e-05, + "loss": 1.4395, + "step": 328 + }, + { + "epoch": 0.01833788529067499, + "grad_norm": 0.503923237323761, + "learning_rate": 9.999933321106452e-05, + "loss": 1.7122, + "step": 329 + }, + { + "epoch": 0.018393623543838136, + "grad_norm": 0.5150753855705261, + "learning_rate": 9.999928643287948e-05, + "loss": 1.8863, + "step": 330 + }, + { + "epoch": 0.018449361797001282, + "grad_norm": 0.5160688757896423, + "learning_rate": 9.999923806900859e-05, + "loss": 1.8184, + "step": 331 + }, + { + "epoch": 0.018505100050164428, + "grad_norm": 0.5423057079315186, + "learning_rate": 9.99991881194534e-05, + "loss": 1.843, + "step": 332 + }, + { + "epoch": 0.018560838303327575, + "grad_norm": 0.5026907324790955, + "learning_rate": 9.999913658421544e-05, + "loss": 1.7728, + "step": 333 + }, + { + "epoch": 0.01861657655649072, + "grad_norm": 0.5391967296600342, + "learning_rate": 9.999908346329642e-05, + "loss": 1.9225, + "step": 334 + }, + { + "epoch": 0.018672314809653867, + "grad_norm": 0.5050860047340393, + "learning_rate": 9.999902875669797e-05, + "loss": 1.7579, + "step": 335 + }, + { + "epoch": 0.01872805306281701, + "grad_norm": 0.48109737038612366, + "learning_rate": 9.999897246442184e-05, + "loss": 1.8859, + "step": 336 + }, + { + "epoch": 0.018783791315980156, + "grad_norm": 0.5002635717391968, + "learning_rate": 9.999891458646983e-05, + "loss": 1.6809, + "step": 337 + }, + { + "epoch": 0.018839529569143303, + "grad_norm": 0.5138371586799622, + "learning_rate": 9.999885512284375e-05, + "loss": 1.7961, + "step": 338 + }, + { + "epoch": 0.01889526782230645, + "grad_norm": 0.47246232628822327, + "learning_rate": 9.999879407354551e-05, + "loss": 1.6943, + "step": 339 + }, + { + "epoch": 0.018951006075469595, + "grad_norm": 0.47807106375694275, + "learning_rate": 9.999873143857704e-05, + "loss": 1.7652, + "step": 340 + }, + { + "epoch": 0.01900674432863274, + "grad_norm": 0.4725436270236969, + "learning_rate": 9.99986672179403e-05, + "loss": 1.7483, + "step": 341 + }, + { + "epoch": 0.019062482581795888, + "grad_norm": 0.5131480693817139, + "learning_rate": 9.999860141163736e-05, + "loss": 1.8883, + "step": 342 + }, + { + "epoch": 0.01911822083495903, + "grad_norm": 0.6150394678115845, + "learning_rate": 9.99985340196703e-05, + "loss": 2.1536, + "step": 343 + }, + { + "epoch": 0.019173959088122177, + "grad_norm": 0.5729528069496155, + "learning_rate": 9.999846504204124e-05, + "loss": 1.9443, + "step": 344 + }, + { + "epoch": 0.019229697341285323, + "grad_norm": 0.4936676323413849, + "learning_rate": 9.999839447875238e-05, + "loss": 1.7273, + "step": 345 + }, + { + "epoch": 0.01928543559444847, + "grad_norm": 0.5480337738990784, + "learning_rate": 9.999832232980597e-05, + "loss": 1.8024, + "step": 346 + }, + { + "epoch": 0.019341173847611616, + "grad_norm": 0.4883441925048828, + "learning_rate": 9.999824859520428e-05, + "loss": 1.6531, + "step": 347 + }, + { + "epoch": 0.019396912100774762, + "grad_norm": 0.6438686847686768, + "learning_rate": 9.999817327494967e-05, + "loss": 2.1477, + "step": 348 + }, + { + "epoch": 0.01945265035393791, + "grad_norm": 0.540684700012207, + "learning_rate": 9.999809636904449e-05, + "loss": 2.0333, + "step": 349 + }, + { + "epoch": 0.019508388607101055, + "grad_norm": 0.5322266221046448, + "learning_rate": 9.999801787749121e-05, + "loss": 1.7542, + "step": 350 + }, + { + "epoch": 0.019564126860264198, + "grad_norm": 0.5497377514839172, + "learning_rate": 9.999793780029232e-05, + "loss": 1.9207, + "step": 351 + }, + { + "epoch": 0.019619865113427344, + "grad_norm": 0.5375553369522095, + "learning_rate": 9.999785613745035e-05, + "loss": 1.8293, + "step": 352 + }, + { + "epoch": 0.01967560336659049, + "grad_norm": 0.5242462754249573, + "learning_rate": 9.999777288896787e-05, + "loss": 1.8176, + "step": 353 + }, + { + "epoch": 0.019731341619753637, + "grad_norm": 0.5194500088691711, + "learning_rate": 9.999768805484757e-05, + "loss": 1.961, + "step": 354 + }, + { + "epoch": 0.019787079872916783, + "grad_norm": 0.4952162504196167, + "learning_rate": 9.999760163509209e-05, + "loss": 1.6902, + "step": 355 + }, + { + "epoch": 0.01984281812607993, + "grad_norm": 0.4688204824924469, + "learning_rate": 9.99975136297042e-05, + "loss": 1.352, + "step": 356 + }, + { + "epoch": 0.019898556379243076, + "grad_norm": 0.5171904563903809, + "learning_rate": 9.999742403868668e-05, + "loss": 1.952, + "step": 357 + }, + { + "epoch": 0.019954294632406222, + "grad_norm": 0.542300283908844, + "learning_rate": 9.999733286204238e-05, + "loss": 1.8768, + "step": 358 + }, + { + "epoch": 0.020010032885569365, + "grad_norm": 0.5278236865997314, + "learning_rate": 9.99972400997742e-05, + "loss": 1.8014, + "step": 359 + }, + { + "epoch": 0.02006577113873251, + "grad_norm": 0.587790846824646, + "learning_rate": 9.999714575188505e-05, + "loss": 1.9884, + "step": 360 + }, + { + "epoch": 0.020121509391895658, + "grad_norm": 0.5114203095436096, + "learning_rate": 9.999704981837794e-05, + "loss": 1.9038, + "step": 361 + }, + { + "epoch": 0.020177247645058804, + "grad_norm": 0.538783609867096, + "learning_rate": 9.999695229925591e-05, + "loss": 1.9049, + "step": 362 + }, + { + "epoch": 0.02023298589822195, + "grad_norm": 0.5289005637168884, + "learning_rate": 9.999685319452208e-05, + "loss": 1.7111, + "step": 363 + }, + { + "epoch": 0.020288724151385096, + "grad_norm": 0.5257157683372498, + "learning_rate": 9.999675250417954e-05, + "loss": 1.6416, + "step": 364 + }, + { + "epoch": 0.020344462404548243, + "grad_norm": 0.480473130941391, + "learning_rate": 9.999665022823152e-05, + "loss": 1.7197, + "step": 365 + }, + { + "epoch": 0.020400200657711386, + "grad_norm": 0.5564152598381042, + "learning_rate": 9.999654636668125e-05, + "loss": 1.8762, + "step": 366 + }, + { + "epoch": 0.020455938910874532, + "grad_norm": 0.6517108082771301, + "learning_rate": 9.999644091953204e-05, + "loss": 2.4684, + "step": 367 + }, + { + "epoch": 0.02051167716403768, + "grad_norm": 0.5357886552810669, + "learning_rate": 9.999633388678723e-05, + "loss": 1.8079, + "step": 368 + }, + { + "epoch": 0.020567415417200825, + "grad_norm": 0.498740553855896, + "learning_rate": 9.999622526845021e-05, + "loss": 1.6885, + "step": 369 + }, + { + "epoch": 0.02062315367036397, + "grad_norm": 0.49749207496643066, + "learning_rate": 9.999611506452439e-05, + "loss": 1.8686, + "step": 370 + }, + { + "epoch": 0.020678891923527117, + "grad_norm": 0.5339593291282654, + "learning_rate": 9.999600327501333e-05, + "loss": 1.8592, + "step": 371 + }, + { + "epoch": 0.020734630176690264, + "grad_norm": 0.5533782839775085, + "learning_rate": 9.999588989992052e-05, + "loss": 1.8752, + "step": 372 + }, + { + "epoch": 0.02079036842985341, + "grad_norm": 0.459504634141922, + "learning_rate": 9.99957749392496e-05, + "loss": 1.7596, + "step": 373 + }, + { + "epoch": 0.020846106683016553, + "grad_norm": 0.4722179174423218, + "learning_rate": 9.999565839300419e-05, + "loss": 1.7573, + "step": 374 + }, + { + "epoch": 0.0209018449361797, + "grad_norm": 0.49677354097366333, + "learning_rate": 9.999554026118798e-05, + "loss": 1.9692, + "step": 375 + }, + { + "epoch": 0.020957583189342845, + "grad_norm": 0.49444639682769775, + "learning_rate": 9.999542054380473e-05, + "loss": 1.8881, + "step": 376 + }, + { + "epoch": 0.02101332144250599, + "grad_norm": 0.4882863461971283, + "learning_rate": 9.999529924085824e-05, + "loss": 1.8369, + "step": 377 + }, + { + "epoch": 0.021069059695669138, + "grad_norm": 0.475211501121521, + "learning_rate": 9.999517635235237e-05, + "loss": 1.3352, + "step": 378 + }, + { + "epoch": 0.021124797948832284, + "grad_norm": 0.5699715614318848, + "learning_rate": 9.999505187829096e-05, + "loss": 1.763, + "step": 379 + }, + { + "epoch": 0.02118053620199543, + "grad_norm": 0.5538257360458374, + "learning_rate": 9.9994925818678e-05, + "loss": 1.7431, + "step": 380 + }, + { + "epoch": 0.021236274455158577, + "grad_norm": 0.48163720965385437, + "learning_rate": 9.99947981735175e-05, + "loss": 1.7356, + "step": 381 + }, + { + "epoch": 0.02129201270832172, + "grad_norm": 0.5482640266418457, + "learning_rate": 9.99946689428135e-05, + "loss": 1.861, + "step": 382 + }, + { + "epoch": 0.021347750961484866, + "grad_norm": 0.5083199739456177, + "learning_rate": 9.999453812657007e-05, + "loss": 1.9594, + "step": 383 + }, + { + "epoch": 0.021403489214648012, + "grad_norm": 0.513034999370575, + "learning_rate": 9.99944057247914e-05, + "loss": 2.0073, + "step": 384 + }, + { + "epoch": 0.02145922746781116, + "grad_norm": 0.5045239329338074, + "learning_rate": 9.999427173748164e-05, + "loss": 1.6862, + "step": 385 + }, + { + "epoch": 0.021514965720974305, + "grad_norm": 0.5097934603691101, + "learning_rate": 9.999413616464508e-05, + "loss": 1.8631, + "step": 386 + }, + { + "epoch": 0.02157070397413745, + "grad_norm": 0.522888720035553, + "learning_rate": 9.999399900628601e-05, + "loss": 1.8636, + "step": 387 + }, + { + "epoch": 0.021626442227300598, + "grad_norm": 0.49189141392707825, + "learning_rate": 9.999386026240878e-05, + "loss": 1.7465, + "step": 388 + }, + { + "epoch": 0.021682180480463744, + "grad_norm": 0.5114362239837646, + "learning_rate": 9.999371993301779e-05, + "loss": 1.6336, + "step": 389 + }, + { + "epoch": 0.021737918733626887, + "grad_norm": 0.4647996723651886, + "learning_rate": 9.999357801811748e-05, + "loss": 1.6755, + "step": 390 + }, + { + "epoch": 0.021793656986790033, + "grad_norm": 0.5380472540855408, + "learning_rate": 9.999343451771234e-05, + "loss": 1.9477, + "step": 391 + }, + { + "epoch": 0.02184939523995318, + "grad_norm": 0.4583854377269745, + "learning_rate": 9.999328943180697e-05, + "loss": 1.7902, + "step": 392 + }, + { + "epoch": 0.021905133493116326, + "grad_norm": 0.45304641127586365, + "learning_rate": 9.999314276040592e-05, + "loss": 1.6744, + "step": 393 + }, + { + "epoch": 0.021960871746279472, + "grad_norm": 0.49699023365974426, + "learning_rate": 9.999299450351387e-05, + "loss": 1.8258, + "step": 394 + }, + { + "epoch": 0.02201660999944262, + "grad_norm": 0.49681130051612854, + "learning_rate": 9.999284466113552e-05, + "loss": 1.8488, + "step": 395 + }, + { + "epoch": 0.022072348252605765, + "grad_norm": 0.5959085822105408, + "learning_rate": 9.999269323327561e-05, + "loss": 2.1775, + "step": 396 + }, + { + "epoch": 0.022128086505768908, + "grad_norm": 0.5063357949256897, + "learning_rate": 9.999254021993895e-05, + "loss": 1.6503, + "step": 397 + }, + { + "epoch": 0.022183824758932054, + "grad_norm": 0.5273301005363464, + "learning_rate": 9.999238562113038e-05, + "loss": 1.8169, + "step": 398 + }, + { + "epoch": 0.0222395630120952, + "grad_norm": 0.5033614635467529, + "learning_rate": 9.999222943685482e-05, + "loss": 1.647, + "step": 399 + }, + { + "epoch": 0.022295301265258347, + "grad_norm": 0.5118756890296936, + "learning_rate": 9.999207166711723e-05, + "loss": 1.6712, + "step": 400 + }, + { + "epoch": 0.022351039518421493, + "grad_norm": 0.5338667035102844, + "learning_rate": 9.999191231192258e-05, + "loss": 1.8125, + "step": 401 + }, + { + "epoch": 0.02240677777158464, + "grad_norm": 0.5460575819015503, + "learning_rate": 9.999175137127596e-05, + "loss": 1.8486, + "step": 402 + }, + { + "epoch": 0.022462516024747785, + "grad_norm": 0.4892098009586334, + "learning_rate": 9.999158884518245e-05, + "loss": 1.6692, + "step": 403 + }, + { + "epoch": 0.022518254277910932, + "grad_norm": 0.4894774258136749, + "learning_rate": 9.999142473364722e-05, + "loss": 1.5916, + "step": 404 + }, + { + "epoch": 0.022573992531074075, + "grad_norm": 0.4909743070602417, + "learning_rate": 9.999125903667545e-05, + "loss": 1.646, + "step": 405 + }, + { + "epoch": 0.02262973078423722, + "grad_norm": 0.48369649052619934, + "learning_rate": 9.999109175427243e-05, + "loss": 1.6874, + "step": 406 + }, + { + "epoch": 0.022685469037400367, + "grad_norm": 0.4719717502593994, + "learning_rate": 9.999092288644345e-05, + "loss": 1.9116, + "step": 407 + }, + { + "epoch": 0.022741207290563514, + "grad_norm": 0.4719882309436798, + "learning_rate": 9.999075243319386e-05, + "loss": 1.4898, + "step": 408 + }, + { + "epoch": 0.02279694554372666, + "grad_norm": 0.5169988870620728, + "learning_rate": 9.999058039452906e-05, + "loss": 1.7671, + "step": 409 + }, + { + "epoch": 0.022852683796889806, + "grad_norm": 0.4469069540500641, + "learning_rate": 9.999040677045453e-05, + "loss": 1.7068, + "step": 410 + }, + { + "epoch": 0.022908422050052953, + "grad_norm": 0.508651077747345, + "learning_rate": 9.999023156097575e-05, + "loss": 1.912, + "step": 411 + }, + { + "epoch": 0.0229641603032161, + "grad_norm": 0.48365309834480286, + "learning_rate": 9.99900547660983e-05, + "loss": 1.7907, + "step": 412 + }, + { + "epoch": 0.02301989855637924, + "grad_norm": 0.5189946889877319, + "learning_rate": 9.998987638582775e-05, + "loss": 1.8333, + "step": 413 + }, + { + "epoch": 0.023075636809542388, + "grad_norm": 0.5238891839981079, + "learning_rate": 9.99896964201698e-05, + "loss": 2.0069, + "step": 414 + }, + { + "epoch": 0.023131375062705534, + "grad_norm": 0.5390001535415649, + "learning_rate": 9.998951486913015e-05, + "loss": 1.8571, + "step": 415 + }, + { + "epoch": 0.02318711331586868, + "grad_norm": 0.5339745283126831, + "learning_rate": 9.998933173271453e-05, + "loss": 1.6536, + "step": 416 + }, + { + "epoch": 0.023242851569031827, + "grad_norm": 0.48661404848098755, + "learning_rate": 9.998914701092877e-05, + "loss": 1.8969, + "step": 417 + }, + { + "epoch": 0.023298589822194973, + "grad_norm": 0.5701104402542114, + "learning_rate": 9.998896070377873e-05, + "loss": 1.9305, + "step": 418 + }, + { + "epoch": 0.02335432807535812, + "grad_norm": 0.5289365649223328, + "learning_rate": 9.99887728112703e-05, + "loss": 1.9801, + "step": 419 + }, + { + "epoch": 0.023410066328521262, + "grad_norm": 0.4870493412017822, + "learning_rate": 9.998858333340945e-05, + "loss": 1.879, + "step": 420 + }, + { + "epoch": 0.02346580458168441, + "grad_norm": 0.46179860830307007, + "learning_rate": 9.998839227020221e-05, + "loss": 1.6029, + "step": 421 + }, + { + "epoch": 0.023521542834847555, + "grad_norm": 0.5245276689529419, + "learning_rate": 9.998819962165462e-05, + "loss": 1.9165, + "step": 422 + }, + { + "epoch": 0.0235772810880107, + "grad_norm": 0.4952642321586609, + "learning_rate": 9.998800538777278e-05, + "loss": 1.6276, + "step": 423 + }, + { + "epoch": 0.023633019341173848, + "grad_norm": 0.48968929052352905, + "learning_rate": 9.998780956856285e-05, + "loss": 1.5287, + "step": 424 + }, + { + "epoch": 0.023688757594336994, + "grad_norm": 0.4968630373477936, + "learning_rate": 9.998761216403106e-05, + "loss": 1.8008, + "step": 425 + }, + { + "epoch": 0.02374449584750014, + "grad_norm": 0.5983918309211731, + "learning_rate": 9.998741317418366e-05, + "loss": 2.0055, + "step": 426 + }, + { + "epoch": 0.023800234100663287, + "grad_norm": 0.49322110414505005, + "learning_rate": 9.998721259902694e-05, + "loss": 1.6324, + "step": 427 + }, + { + "epoch": 0.02385597235382643, + "grad_norm": 0.4888675808906555, + "learning_rate": 9.99870104385673e-05, + "loss": 1.6075, + "step": 428 + }, + { + "epoch": 0.023911710606989576, + "grad_norm": 0.4783425033092499, + "learning_rate": 9.998680669281116e-05, + "loss": 1.6517, + "step": 429 + }, + { + "epoch": 0.023967448860152722, + "grad_norm": 0.5173685550689697, + "learning_rate": 9.998660136176492e-05, + "loss": 1.6884, + "step": 430 + }, + { + "epoch": 0.02402318711331587, + "grad_norm": 0.518741250038147, + "learning_rate": 9.998639444543514e-05, + "loss": 1.7113, + "step": 431 + }, + { + "epoch": 0.024078925366479015, + "grad_norm": 0.446850448846817, + "learning_rate": 9.998618594382836e-05, + "loss": 1.5067, + "step": 432 + }, + { + "epoch": 0.02413466361964216, + "grad_norm": 0.46661272644996643, + "learning_rate": 9.99859758569512e-05, + "loss": 1.6967, + "step": 433 + }, + { + "epoch": 0.024190401872805307, + "grad_norm": 0.5824592709541321, + "learning_rate": 9.998576418481033e-05, + "loss": 2.0151, + "step": 434 + }, + { + "epoch": 0.024246140125968454, + "grad_norm": 0.4715226888656616, + "learning_rate": 9.998555092741247e-05, + "loss": 1.6199, + "step": 435 + }, + { + "epoch": 0.024301878379131597, + "grad_norm": 0.5396628975868225, + "learning_rate": 9.998533608476435e-05, + "loss": 1.8874, + "step": 436 + }, + { + "epoch": 0.024357616632294743, + "grad_norm": 0.4999384582042694, + "learning_rate": 9.99851196568728e-05, + "loss": 1.8761, + "step": 437 + }, + { + "epoch": 0.02441335488545789, + "grad_norm": 0.4719383418560028, + "learning_rate": 9.998490164374472e-05, + "loss": 1.6399, + "step": 438 + }, + { + "epoch": 0.024469093138621036, + "grad_norm": 0.49223801493644714, + "learning_rate": 9.998468204538696e-05, + "loss": 1.8343, + "step": 439 + }, + { + "epoch": 0.024524831391784182, + "grad_norm": 0.5116458535194397, + "learning_rate": 9.998446086180653e-05, + "loss": 2.0423, + "step": 440 + }, + { + "epoch": 0.024580569644947328, + "grad_norm": 0.48448118567466736, + "learning_rate": 9.998423809301043e-05, + "loss": 1.5796, + "step": 441 + }, + { + "epoch": 0.024636307898110475, + "grad_norm": 0.48682916164398193, + "learning_rate": 9.998401373900573e-05, + "loss": 1.661, + "step": 442 + }, + { + "epoch": 0.024692046151273617, + "grad_norm": 0.5474771857261658, + "learning_rate": 9.998378779979954e-05, + "loss": 1.9646, + "step": 443 + }, + { + "epoch": 0.024747784404436764, + "grad_norm": 0.48878610134124756, + "learning_rate": 9.998356027539901e-05, + "loss": 1.7896, + "step": 444 + }, + { + "epoch": 0.02480352265759991, + "grad_norm": 0.49135512113571167, + "learning_rate": 9.99833311658114e-05, + "loss": 1.7329, + "step": 445 + }, + { + "epoch": 0.024859260910763056, + "grad_norm": 0.5220357775688171, + "learning_rate": 9.998310047104393e-05, + "loss": 2.0303, + "step": 446 + }, + { + "epoch": 0.024914999163926203, + "grad_norm": 0.4597051739692688, + "learning_rate": 9.998286819110394e-05, + "loss": 1.6114, + "step": 447 + }, + { + "epoch": 0.02497073741708935, + "grad_norm": 0.5005029439926147, + "learning_rate": 9.99826343259988e-05, + "loss": 1.8658, + "step": 448 + }, + { + "epoch": 0.025026475670252495, + "grad_norm": 0.5835437774658203, + "learning_rate": 9.99823988757359e-05, + "loss": 1.8958, + "step": 449 + }, + { + "epoch": 0.02508221392341564, + "grad_norm": 0.4960596263408661, + "learning_rate": 9.998216184032274e-05, + "loss": 1.7768, + "step": 450 + }, + { + "epoch": 0.025137952176578784, + "grad_norm": 0.4787440299987793, + "learning_rate": 9.99819232197668e-05, + "loss": 1.7367, + "step": 451 + }, + { + "epoch": 0.02519369042974193, + "grad_norm": 0.4575479030609131, + "learning_rate": 9.99816830140757e-05, + "loss": 1.6027, + "step": 452 + }, + { + "epoch": 0.025249428682905077, + "grad_norm": 0.5182919502258301, + "learning_rate": 9.998144122325702e-05, + "loss": 1.8879, + "step": 453 + }, + { + "epoch": 0.025305166936068223, + "grad_norm": 0.49592286348342896, + "learning_rate": 9.998119784731843e-05, + "loss": 1.954, + "step": 454 + }, + { + "epoch": 0.02536090518923137, + "grad_norm": 0.4686327576637268, + "learning_rate": 9.998095288626765e-05, + "loss": 1.6971, + "step": 455 + }, + { + "epoch": 0.025416643442394516, + "grad_norm": 0.5634790658950806, + "learning_rate": 9.998070634011246e-05, + "loss": 1.8801, + "step": 456 + }, + { + "epoch": 0.025472381695557662, + "grad_norm": 0.49380773305892944, + "learning_rate": 9.998045820886068e-05, + "loss": 1.8882, + "step": 457 + }, + { + "epoch": 0.02552811994872081, + "grad_norm": 0.5319178104400635, + "learning_rate": 9.998020849252017e-05, + "loss": 1.7204, + "step": 458 + }, + { + "epoch": 0.02558385820188395, + "grad_norm": 0.4578639268875122, + "learning_rate": 9.997995719109884e-05, + "loss": 1.6934, + "step": 459 + }, + { + "epoch": 0.025639596455047098, + "grad_norm": 0.4672851264476776, + "learning_rate": 9.997970430460468e-05, + "loss": 1.5534, + "step": 460 + }, + { + "epoch": 0.025695334708210244, + "grad_norm": 0.4967419505119324, + "learning_rate": 9.99794498330457e-05, + "loss": 1.7817, + "step": 461 + }, + { + "epoch": 0.02575107296137339, + "grad_norm": 0.494781494140625, + "learning_rate": 9.997919377642997e-05, + "loss": 1.759, + "step": 462 + }, + { + "epoch": 0.025806811214536537, + "grad_norm": 0.47715312242507935, + "learning_rate": 9.997893613476561e-05, + "loss": 1.6342, + "step": 463 + }, + { + "epoch": 0.025862549467699683, + "grad_norm": 0.5014367699623108, + "learning_rate": 9.99786769080608e-05, + "loss": 1.7754, + "step": 464 + }, + { + "epoch": 0.02591828772086283, + "grad_norm": 0.503808319568634, + "learning_rate": 9.997841609632375e-05, + "loss": 1.9323, + "step": 465 + }, + { + "epoch": 0.025974025974025976, + "grad_norm": 0.4935349225997925, + "learning_rate": 9.997815369956273e-05, + "loss": 1.945, + "step": 466 + }, + { + "epoch": 0.02602976422718912, + "grad_norm": 0.45313507318496704, + "learning_rate": 9.997788971778608e-05, + "loss": 1.5908, + "step": 467 + }, + { + "epoch": 0.026085502480352265, + "grad_norm": 0.48407676815986633, + "learning_rate": 9.997762415100214e-05, + "loss": 1.449, + "step": 468 + }, + { + "epoch": 0.02614124073351541, + "grad_norm": 0.4917304813861847, + "learning_rate": 9.997735699921938e-05, + "loss": 1.7667, + "step": 469 + }, + { + "epoch": 0.026196978986678558, + "grad_norm": 0.5684965252876282, + "learning_rate": 9.997708826244623e-05, + "loss": 2.0801, + "step": 470 + }, + { + "epoch": 0.026252717239841704, + "grad_norm": 0.5034363865852356, + "learning_rate": 9.997681794069123e-05, + "loss": 1.9385, + "step": 471 + }, + { + "epoch": 0.02630845549300485, + "grad_norm": 0.5185155272483826, + "learning_rate": 9.997654603396294e-05, + "loss": 1.9021, + "step": 472 + }, + { + "epoch": 0.026364193746167996, + "grad_norm": 0.4756320118904114, + "learning_rate": 9.997627254227e-05, + "loss": 1.7698, + "step": 473 + }, + { + "epoch": 0.02641993199933114, + "grad_norm": 0.47013306617736816, + "learning_rate": 9.997599746562108e-05, + "loss": 1.6786, + "step": 474 + }, + { + "epoch": 0.026475670252494286, + "grad_norm": 0.4797370731830597, + "learning_rate": 9.997572080402488e-05, + "loss": 1.8663, + "step": 475 + }, + { + "epoch": 0.026531408505657432, + "grad_norm": 0.4647987186908722, + "learning_rate": 9.997544255749021e-05, + "loss": 1.6064, + "step": 476 + }, + { + "epoch": 0.02658714675882058, + "grad_norm": 0.5362509489059448, + "learning_rate": 9.99751627260259e-05, + "loss": 2.035, + "step": 477 + }, + { + "epoch": 0.026642885011983725, + "grad_norm": 0.501615047454834, + "learning_rate": 9.997488130964077e-05, + "loss": 1.7838, + "step": 478 + }, + { + "epoch": 0.02669862326514687, + "grad_norm": 0.48956695199012756, + "learning_rate": 9.997459830834379e-05, + "loss": 1.7242, + "step": 479 + }, + { + "epoch": 0.026754361518310017, + "grad_norm": 0.518091082572937, + "learning_rate": 9.997431372214394e-05, + "loss": 1.8634, + "step": 480 + }, + { + "epoch": 0.026810099771473164, + "grad_norm": 0.5070821642875671, + "learning_rate": 9.997402755105022e-05, + "loss": 1.678, + "step": 481 + }, + { + "epoch": 0.026865838024636306, + "grad_norm": 0.49108657240867615, + "learning_rate": 9.997373979507169e-05, + "loss": 1.6952, + "step": 482 + }, + { + "epoch": 0.026921576277799453, + "grad_norm": 0.4824698269367218, + "learning_rate": 9.997345045421753e-05, + "loss": 1.6948, + "step": 483 + }, + { + "epoch": 0.0269773145309626, + "grad_norm": 0.537356972694397, + "learning_rate": 9.997315952849688e-05, + "loss": 1.9746, + "step": 484 + }, + { + "epoch": 0.027033052784125745, + "grad_norm": 0.5354846119880676, + "learning_rate": 9.997286701791896e-05, + "loss": 1.9413, + "step": 485 + }, + { + "epoch": 0.02708879103728889, + "grad_norm": 0.49684658646583557, + "learning_rate": 9.99725729224931e-05, + "loss": 1.7646, + "step": 486 + }, + { + "epoch": 0.027144529290452038, + "grad_norm": 0.5149616599082947, + "learning_rate": 9.997227724222855e-05, + "loss": 1.6941, + "step": 487 + }, + { + "epoch": 0.027200267543615184, + "grad_norm": 0.48285308480262756, + "learning_rate": 9.997197997713473e-05, + "loss": 1.6994, + "step": 488 + }, + { + "epoch": 0.02725600579677833, + "grad_norm": 0.47129902243614197, + "learning_rate": 9.997168112722107e-05, + "loss": 1.8408, + "step": 489 + }, + { + "epoch": 0.027311744049941473, + "grad_norm": 0.44259312748908997, + "learning_rate": 9.997138069249703e-05, + "loss": 1.636, + "step": 490 + }, + { + "epoch": 0.02736748230310462, + "grad_norm": 0.4475281238555908, + "learning_rate": 9.997107867297216e-05, + "loss": 1.5011, + "step": 491 + }, + { + "epoch": 0.027423220556267766, + "grad_norm": 0.5637838244438171, + "learning_rate": 9.997077506865602e-05, + "loss": 2.0265, + "step": 492 + }, + { + "epoch": 0.027478958809430912, + "grad_norm": 0.5333039164543152, + "learning_rate": 9.997046987955824e-05, + "loss": 2.0372, + "step": 493 + }, + { + "epoch": 0.02753469706259406, + "grad_norm": 0.49768728017807007, + "learning_rate": 9.997016310568851e-05, + "loss": 1.8226, + "step": 494 + }, + { + "epoch": 0.027590435315757205, + "grad_norm": 0.5524271130561829, + "learning_rate": 9.996985474705654e-05, + "loss": 1.7598, + "step": 495 + }, + { + "epoch": 0.02764617356892035, + "grad_norm": 0.5334012508392334, + "learning_rate": 9.996954480367214e-05, + "loss": 1.9021, + "step": 496 + }, + { + "epoch": 0.027701911822083494, + "grad_norm": 0.5297475457191467, + "learning_rate": 9.996923327554511e-05, + "loss": 1.7989, + "step": 497 + }, + { + "epoch": 0.02775765007524664, + "grad_norm": 0.5096792578697205, + "learning_rate": 9.996892016268535e-05, + "loss": 1.7904, + "step": 498 + }, + { + "epoch": 0.027813388328409787, + "grad_norm": 0.47295787930488586, + "learning_rate": 9.996860546510278e-05, + "loss": 1.5494, + "step": 499 + }, + { + "epoch": 0.027869126581572933, + "grad_norm": 0.48092177510261536, + "learning_rate": 9.996828918280737e-05, + "loss": 1.6759, + "step": 500 + }, + { + "epoch": 0.02792486483473608, + "grad_norm": 0.4752250611782074, + "learning_rate": 9.996797131580917e-05, + "loss": 1.7032, + "step": 501 + }, + { + "epoch": 0.027980603087899226, + "grad_norm": 0.49519795179367065, + "learning_rate": 9.996765186411827e-05, + "loss": 1.7786, + "step": 502 + }, + { + "epoch": 0.028036341341062372, + "grad_norm": 0.5053145289421082, + "learning_rate": 9.996733082774477e-05, + "loss": 1.9493, + "step": 503 + }, + { + "epoch": 0.02809207959422552, + "grad_norm": 0.5514931678771973, + "learning_rate": 9.996700820669886e-05, + "loss": 2.0257, + "step": 504 + }, + { + "epoch": 0.02814781784738866, + "grad_norm": 0.5103058218955994, + "learning_rate": 9.996668400099077e-05, + "loss": 1.8291, + "step": 505 + }, + { + "epoch": 0.028203556100551808, + "grad_norm": 0.4987359941005707, + "learning_rate": 9.99663582106308e-05, + "loss": 1.6841, + "step": 506 + }, + { + "epoch": 0.028259294353714954, + "grad_norm": 0.570788562297821, + "learning_rate": 9.996603083562928e-05, + "loss": 2.1915, + "step": 507 + }, + { + "epoch": 0.0283150326068781, + "grad_norm": 0.4610704481601715, + "learning_rate": 9.996570187599658e-05, + "loss": 1.6893, + "step": 508 + }, + { + "epoch": 0.028370770860041247, + "grad_norm": 0.4623680114746094, + "learning_rate": 9.996537133174313e-05, + "loss": 1.5927, + "step": 509 + }, + { + "epoch": 0.028426509113204393, + "grad_norm": 0.4911310076713562, + "learning_rate": 9.996503920287942e-05, + "loss": 1.6685, + "step": 510 + }, + { + "epoch": 0.02848224736636754, + "grad_norm": 0.4995778799057007, + "learning_rate": 9.996470548941598e-05, + "loss": 1.8294, + "step": 511 + }, + { + "epoch": 0.028537985619530686, + "grad_norm": 0.518905758857727, + "learning_rate": 9.996437019136342e-05, + "loss": 1.6819, + "step": 512 + }, + { + "epoch": 0.02859372387269383, + "grad_norm": 0.5348454117774963, + "learning_rate": 9.996403330873233e-05, + "loss": 1.8129, + "step": 513 + }, + { + "epoch": 0.028649462125856975, + "grad_norm": 0.49906015396118164, + "learning_rate": 9.996369484153342e-05, + "loss": 1.8961, + "step": 514 + }, + { + "epoch": 0.02870520037902012, + "grad_norm": 0.5471760034561157, + "learning_rate": 9.996335478977741e-05, + "loss": 1.7716, + "step": 515 + }, + { + "epoch": 0.028760938632183267, + "grad_norm": 0.4836637079715729, + "learning_rate": 9.99630131534751e-05, + "loss": 1.7395, + "step": 516 + }, + { + "epoch": 0.028816676885346414, + "grad_norm": 0.4034901261329651, + "learning_rate": 9.996266993263732e-05, + "loss": 0.9524, + "step": 517 + }, + { + "epoch": 0.02887241513850956, + "grad_norm": 0.5080105662345886, + "learning_rate": 9.996232512727495e-05, + "loss": 1.5957, + "step": 518 + }, + { + "epoch": 0.028928153391672706, + "grad_norm": 0.4828059673309326, + "learning_rate": 9.996197873739892e-05, + "loss": 1.8356, + "step": 519 + }, + { + "epoch": 0.02898389164483585, + "grad_norm": 0.47908416390419006, + "learning_rate": 9.996163076302023e-05, + "loss": 1.7832, + "step": 520 + }, + { + "epoch": 0.029039629897998995, + "grad_norm": 0.5064157247543335, + "learning_rate": 9.996128120414989e-05, + "loss": 1.696, + "step": 521 + }, + { + "epoch": 0.029095368151162142, + "grad_norm": 0.5058413147926331, + "learning_rate": 9.996093006079903e-05, + "loss": 1.8185, + "step": 522 + }, + { + "epoch": 0.029151106404325288, + "grad_norm": 0.5816233158111572, + "learning_rate": 9.996057733297876e-05, + "loss": 2.0013, + "step": 523 + }, + { + "epoch": 0.029206844657488434, + "grad_norm": 0.506596028804779, + "learning_rate": 9.996022302070025e-05, + "loss": 1.7923, + "step": 524 + }, + { + "epoch": 0.02926258291065158, + "grad_norm": 0.48481589555740356, + "learning_rate": 9.995986712397477e-05, + "loss": 1.674, + "step": 525 + }, + { + "epoch": 0.029318321163814727, + "grad_norm": 0.6215664148330688, + "learning_rate": 9.995950964281357e-05, + "loss": 2.041, + "step": 526 + }, + { + "epoch": 0.029374059416977873, + "grad_norm": 0.5243876576423645, + "learning_rate": 9.995915057722804e-05, + "loss": 1.9253, + "step": 527 + }, + { + "epoch": 0.029429797670141016, + "grad_norm": 0.4525597393512726, + "learning_rate": 9.995878992722951e-05, + "loss": 1.5032, + "step": 528 + }, + { + "epoch": 0.029485535923304163, + "grad_norm": 0.5035833716392517, + "learning_rate": 9.995842769282946e-05, + "loss": 1.8901, + "step": 529 + }, + { + "epoch": 0.02954127417646731, + "grad_norm": 0.5944721698760986, + "learning_rate": 9.995806387403934e-05, + "loss": 2.1208, + "step": 530 + }, + { + "epoch": 0.029597012429630455, + "grad_norm": 0.5121837854385376, + "learning_rate": 9.995769847087073e-05, + "loss": 1.9563, + "step": 531 + }, + { + "epoch": 0.0296527506827936, + "grad_norm": 0.5083540678024292, + "learning_rate": 9.99573314833352e-05, + "loss": 2.0126, + "step": 532 + }, + { + "epoch": 0.029708488935956748, + "grad_norm": 0.4877237379550934, + "learning_rate": 9.995696291144438e-05, + "loss": 1.92, + "step": 533 + }, + { + "epoch": 0.029764227189119894, + "grad_norm": 0.4935770034790039, + "learning_rate": 9.995659275520995e-05, + "loss": 1.5072, + "step": 534 + }, + { + "epoch": 0.02981996544228304, + "grad_norm": 0.5800178050994873, + "learning_rate": 9.995622101464368e-05, + "loss": 2.0751, + "step": 535 + }, + { + "epoch": 0.029875703695446183, + "grad_norm": 0.5653755068778992, + "learning_rate": 9.995584768975734e-05, + "loss": 2.0538, + "step": 536 + }, + { + "epoch": 0.02993144194860933, + "grad_norm": 0.463131844997406, + "learning_rate": 9.995547278056279e-05, + "loss": 1.6813, + "step": 537 + }, + { + "epoch": 0.029987180201772476, + "grad_norm": 0.5227254629135132, + "learning_rate": 9.995509628707189e-05, + "loss": 1.9213, + "step": 538 + }, + { + "epoch": 0.030042918454935622, + "grad_norm": 0.49530157446861267, + "learning_rate": 9.99547182092966e-05, + "loss": 1.7977, + "step": 539 + }, + { + "epoch": 0.03009865670809877, + "grad_norm": 0.5396206974983215, + "learning_rate": 9.99543385472489e-05, + "loss": 1.9346, + "step": 540 + }, + { + "epoch": 0.030154394961261915, + "grad_norm": 0.517638087272644, + "learning_rate": 9.995395730094083e-05, + "loss": 1.7214, + "step": 541 + }, + { + "epoch": 0.03021013321442506, + "grad_norm": 0.5086343884468079, + "learning_rate": 9.99535744703845e-05, + "loss": 1.6459, + "step": 542 + }, + { + "epoch": 0.030265871467588207, + "grad_norm": 0.49579426646232605, + "learning_rate": 9.995319005559202e-05, + "loss": 1.7781, + "step": 543 + }, + { + "epoch": 0.03032160972075135, + "grad_norm": 0.500481128692627, + "learning_rate": 9.995280405657561e-05, + "loss": 1.8662, + "step": 544 + }, + { + "epoch": 0.030377347973914497, + "grad_norm": 0.47389981150627136, + "learning_rate": 9.99524164733475e-05, + "loss": 1.7803, + "step": 545 + }, + { + "epoch": 0.030433086227077643, + "grad_norm": 0.4981273114681244, + "learning_rate": 9.995202730591997e-05, + "loss": 1.7344, + "step": 546 + }, + { + "epoch": 0.03048882448024079, + "grad_norm": 0.507570207118988, + "learning_rate": 9.995163655430539e-05, + "loss": 1.864, + "step": 547 + }, + { + "epoch": 0.030544562733403936, + "grad_norm": 0.4923110008239746, + "learning_rate": 9.995124421851614e-05, + "loss": 1.711, + "step": 548 + }, + { + "epoch": 0.030600300986567082, + "grad_norm": 0.42948779463768005, + "learning_rate": 9.995085029856464e-05, + "loss": 1.4136, + "step": 549 + }, + { + "epoch": 0.030656039239730228, + "grad_norm": 0.5023720264434814, + "learning_rate": 9.99504547944634e-05, + "loss": 1.8524, + "step": 550 + }, + { + "epoch": 0.03071177749289337, + "grad_norm": 0.4656638205051422, + "learning_rate": 9.995005770622499e-05, + "loss": 1.5452, + "step": 551 + }, + { + "epoch": 0.030767515746056517, + "grad_norm": 0.49939560890197754, + "learning_rate": 9.994965903386198e-05, + "loss": 1.8935, + "step": 552 + }, + { + "epoch": 0.030823253999219664, + "grad_norm": 0.5469990372657776, + "learning_rate": 9.994925877738698e-05, + "loss": 1.9558, + "step": 553 + }, + { + "epoch": 0.03087899225238281, + "grad_norm": 0.46579065918922424, + "learning_rate": 9.994885693681274e-05, + "loss": 1.6339, + "step": 554 + }, + { + "epoch": 0.030934730505545956, + "grad_norm": 0.4826100468635559, + "learning_rate": 9.994845351215199e-05, + "loss": 1.6943, + "step": 555 + }, + { + "epoch": 0.030990468758709103, + "grad_norm": 0.527716338634491, + "learning_rate": 9.994804850341748e-05, + "loss": 1.9641, + "step": 556 + }, + { + "epoch": 0.03104620701187225, + "grad_norm": 0.4857400059700012, + "learning_rate": 9.994764191062212e-05, + "loss": 1.9041, + "step": 557 + }, + { + "epoch": 0.031101945265035395, + "grad_norm": 0.483614057302475, + "learning_rate": 9.994723373377876e-05, + "loss": 1.6671, + "step": 558 + }, + { + "epoch": 0.031157683518198538, + "grad_norm": 0.46863991022109985, + "learning_rate": 9.994682397290036e-05, + "loss": 1.6415, + "step": 559 + }, + { + "epoch": 0.031213421771361684, + "grad_norm": 0.5118616223335266, + "learning_rate": 9.99464126279999e-05, + "loss": 1.9253, + "step": 560 + }, + { + "epoch": 0.03126916002452483, + "grad_norm": 0.4958517849445343, + "learning_rate": 9.994599969909047e-05, + "loss": 1.5449, + "step": 561 + }, + { + "epoch": 0.03132489827768798, + "grad_norm": 0.513558030128479, + "learning_rate": 9.99455851861851e-05, + "loss": 1.8665, + "step": 562 + }, + { + "epoch": 0.03138063653085112, + "grad_norm": 0.49571189284324646, + "learning_rate": 9.9945169089297e-05, + "loss": 1.8442, + "step": 563 + }, + { + "epoch": 0.031436374784014266, + "grad_norm": 0.550983190536499, + "learning_rate": 9.994475140843933e-05, + "loss": 1.8436, + "step": 564 + }, + { + "epoch": 0.031492113037177416, + "grad_norm": 0.4547099173069, + "learning_rate": 9.994433214362532e-05, + "loss": 1.7172, + "step": 565 + }, + { + "epoch": 0.03154785129034056, + "grad_norm": 0.4933796525001526, + "learning_rate": 9.994391129486833e-05, + "loss": 1.6919, + "step": 566 + }, + { + "epoch": 0.03160358954350371, + "grad_norm": 0.5890671610832214, + "learning_rate": 9.994348886218163e-05, + "loss": 2.1026, + "step": 567 + }, + { + "epoch": 0.03165932779666685, + "grad_norm": 0.5334300398826599, + "learning_rate": 9.994306484557868e-05, + "loss": 1.9232, + "step": 568 + }, + { + "epoch": 0.03171506604983, + "grad_norm": 0.4899601340293884, + "learning_rate": 9.99426392450729e-05, + "loss": 1.6408, + "step": 569 + }, + { + "epoch": 0.031770804302993144, + "grad_norm": 0.5135582089424133, + "learning_rate": 9.994221206067777e-05, + "loss": 1.8562, + "step": 570 + }, + { + "epoch": 0.03182654255615629, + "grad_norm": 0.5050702095031738, + "learning_rate": 9.994178329240686e-05, + "loss": 1.7045, + "step": 571 + }, + { + "epoch": 0.03188228080931944, + "grad_norm": 0.4874882102012634, + "learning_rate": 9.994135294027378e-05, + "loss": 1.8015, + "step": 572 + }, + { + "epoch": 0.03193801906248258, + "grad_norm": 0.6017099022865295, + "learning_rate": 9.994092100429215e-05, + "loss": 2.1681, + "step": 573 + }, + { + "epoch": 0.03199375731564573, + "grad_norm": 0.4922308325767517, + "learning_rate": 9.994048748447569e-05, + "loss": 1.6771, + "step": 574 + }, + { + "epoch": 0.03204949556880887, + "grad_norm": 0.5013367533683777, + "learning_rate": 9.994005238083815e-05, + "loss": 1.7157, + "step": 575 + }, + { + "epoch": 0.03210523382197202, + "grad_norm": 0.47761455178260803, + "learning_rate": 9.99396156933933e-05, + "loss": 1.8095, + "step": 576 + }, + { + "epoch": 0.032160972075135165, + "grad_norm": 0.5500997304916382, + "learning_rate": 9.993917742215502e-05, + "loss": 2.2013, + "step": 577 + }, + { + "epoch": 0.03221671032829831, + "grad_norm": 0.5222569108009338, + "learning_rate": 9.993873756713719e-05, + "loss": 1.9967, + "step": 578 + }, + { + "epoch": 0.03227244858146146, + "grad_norm": 0.520000696182251, + "learning_rate": 9.993829612835378e-05, + "loss": 1.6328, + "step": 579 + }, + { + "epoch": 0.0323281868346246, + "grad_norm": 0.501677930355072, + "learning_rate": 9.993785310581875e-05, + "loss": 1.9793, + "step": 580 + }, + { + "epoch": 0.03238392508778775, + "grad_norm": 0.4832457900047302, + "learning_rate": 9.993740849954619e-05, + "loss": 1.7687, + "step": 581 + }, + { + "epoch": 0.03243966334095089, + "grad_norm": 0.4854641556739807, + "learning_rate": 9.99369623095502e-05, + "loss": 1.8983, + "step": 582 + }, + { + "epoch": 0.03249540159411404, + "grad_norm": 0.48794299364089966, + "learning_rate": 9.993651453584491e-05, + "loss": 1.6625, + "step": 583 + }, + { + "epoch": 0.032551139847277186, + "grad_norm": 0.4691779911518097, + "learning_rate": 9.993606517844452e-05, + "loss": 1.7413, + "step": 584 + }, + { + "epoch": 0.032606878100440335, + "grad_norm": 0.531639039516449, + "learning_rate": 9.993561423736331e-05, + "loss": 1.875, + "step": 585 + }, + { + "epoch": 0.03266261635360348, + "grad_norm": 0.5259484648704529, + "learning_rate": 9.993516171261555e-05, + "loss": 1.9669, + "step": 586 + }, + { + "epoch": 0.03271835460676662, + "grad_norm": 0.4976826012134552, + "learning_rate": 9.993470760421559e-05, + "loss": 1.808, + "step": 587 + }, + { + "epoch": 0.03277409285992977, + "grad_norm": 0.4722268283367157, + "learning_rate": 9.993425191217787e-05, + "loss": 1.7654, + "step": 588 + }, + { + "epoch": 0.032829831113092914, + "grad_norm": 0.4951403737068176, + "learning_rate": 9.993379463651679e-05, + "loss": 1.8282, + "step": 589 + }, + { + "epoch": 0.032885569366256064, + "grad_norm": 0.4893924295902252, + "learning_rate": 9.99333357772469e-05, + "loss": 1.6477, + "step": 590 + }, + { + "epoch": 0.032941307619419206, + "grad_norm": 0.4877261519432068, + "learning_rate": 9.993287533438273e-05, + "loss": 1.6518, + "step": 591 + }, + { + "epoch": 0.032997045872582356, + "grad_norm": 0.48906272649765015, + "learning_rate": 9.993241330793888e-05, + "loss": 1.6485, + "step": 592 + }, + { + "epoch": 0.0330527841257455, + "grad_norm": 0.5735100507736206, + "learning_rate": 9.993194969792999e-05, + "loss": 2.0397, + "step": 593 + }, + { + "epoch": 0.03310852237890864, + "grad_norm": 0.45156189799308777, + "learning_rate": 9.99314845043708e-05, + "loss": 1.6368, + "step": 594 + }, + { + "epoch": 0.03316426063207179, + "grad_norm": 0.4821372628211975, + "learning_rate": 9.993101772727602e-05, + "loss": 1.6886, + "step": 595 + }, + { + "epoch": 0.033219998885234935, + "grad_norm": 0.501278817653656, + "learning_rate": 9.993054936666048e-05, + "loss": 1.7587, + "step": 596 + }, + { + "epoch": 0.033275737138398084, + "grad_norm": 0.5598791241645813, + "learning_rate": 9.993007942253905e-05, + "loss": 1.8861, + "step": 597 + }, + { + "epoch": 0.03333147539156123, + "grad_norm": 0.48821693658828735, + "learning_rate": 9.99296078949266e-05, + "loss": 1.6563, + "step": 598 + }, + { + "epoch": 0.03338721364472438, + "grad_norm": 0.4853152632713318, + "learning_rate": 9.99291347838381e-05, + "loss": 1.5493, + "step": 599 + }, + { + "epoch": 0.03344295189788752, + "grad_norm": 0.5629671812057495, + "learning_rate": 9.992866008928855e-05, + "loss": 2.1359, + "step": 600 + }, + { + "epoch": 0.03349869015105066, + "grad_norm": 0.5176377892494202, + "learning_rate": 9.9928183811293e-05, + "loss": 2.0139, + "step": 601 + }, + { + "epoch": 0.03355442840421381, + "grad_norm": 0.46964964270591736, + "learning_rate": 9.992770594986658e-05, + "loss": 1.6594, + "step": 602 + }, + { + "epoch": 0.033610166657376955, + "grad_norm": 0.49720609188079834, + "learning_rate": 9.992722650502442e-05, + "loss": 1.8432, + "step": 603 + }, + { + "epoch": 0.033665904910540105, + "grad_norm": 0.4787680506706238, + "learning_rate": 9.992674547678171e-05, + "loss": 1.8071, + "step": 604 + }, + { + "epoch": 0.03372164316370325, + "grad_norm": 0.4432480037212372, + "learning_rate": 9.992626286515373e-05, + "loss": 1.6391, + "step": 605 + }, + { + "epoch": 0.0337773814168664, + "grad_norm": 0.5781794786453247, + "learning_rate": 9.992577867015581e-05, + "loss": 2.0711, + "step": 606 + }, + { + "epoch": 0.03383311967002954, + "grad_norm": 0.45807138085365295, + "learning_rate": 9.992529289180326e-05, + "loss": 1.5886, + "step": 607 + }, + { + "epoch": 0.03388885792319269, + "grad_norm": 0.5234102606773376, + "learning_rate": 9.992480553011151e-05, + "loss": 1.9211, + "step": 608 + }, + { + "epoch": 0.03394459617635583, + "grad_norm": 0.5202253460884094, + "learning_rate": 9.9924316585096e-05, + "loss": 1.819, + "step": 609 + }, + { + "epoch": 0.034000334429518976, + "grad_norm": 0.4516846537590027, + "learning_rate": 9.992382605677226e-05, + "loss": 1.6631, + "step": 610 + }, + { + "epoch": 0.034056072682682126, + "grad_norm": 0.5501968860626221, + "learning_rate": 9.992333394515583e-05, + "loss": 2.0759, + "step": 611 + }, + { + "epoch": 0.03411181093584527, + "grad_norm": 0.4812159836292267, + "learning_rate": 9.992284025026231e-05, + "loss": 1.6721, + "step": 612 + }, + { + "epoch": 0.03416754918900842, + "grad_norm": 0.5236145257949829, + "learning_rate": 9.992234497210737e-05, + "loss": 1.807, + "step": 613 + }, + { + "epoch": 0.03422328744217156, + "grad_norm": 0.5123412609100342, + "learning_rate": 9.992184811070673e-05, + "loss": 1.9095, + "step": 614 + }, + { + "epoch": 0.03427902569533471, + "grad_norm": 0.49797573685646057, + "learning_rate": 9.992134966607612e-05, + "loss": 1.7303, + "step": 615 + }, + { + "epoch": 0.034334763948497854, + "grad_norm": 0.48441436886787415, + "learning_rate": 9.992084963823136e-05, + "loss": 1.6339, + "step": 616 + }, + { + "epoch": 0.034390502201661, + "grad_norm": 0.5459060668945312, + "learning_rate": 9.992034802718832e-05, + "loss": 1.8881, + "step": 617 + }, + { + "epoch": 0.03444624045482415, + "grad_norm": 0.5051499009132385, + "learning_rate": 9.991984483296288e-05, + "loss": 1.9386, + "step": 618 + }, + { + "epoch": 0.03450197870798729, + "grad_norm": 0.5421403050422668, + "learning_rate": 9.991934005557103e-05, + "loss": 2.0836, + "step": 619 + }, + { + "epoch": 0.03455771696115044, + "grad_norm": 0.4838196933269501, + "learning_rate": 9.991883369502874e-05, + "loss": 1.6526, + "step": 620 + }, + { + "epoch": 0.03461345521431358, + "grad_norm": 0.49810105562210083, + "learning_rate": 9.991832575135211e-05, + "loss": 1.7326, + "step": 621 + }, + { + "epoch": 0.03466919346747673, + "grad_norm": 0.46195507049560547, + "learning_rate": 9.991781622455723e-05, + "loss": 1.6398, + "step": 622 + }, + { + "epoch": 0.034724931720639875, + "grad_norm": 0.46615251898765564, + "learning_rate": 9.991730511466026e-05, + "loss": 1.7927, + "step": 623 + }, + { + "epoch": 0.03478066997380302, + "grad_norm": 0.5302008390426636, + "learning_rate": 9.991679242167741e-05, + "loss": 1.8047, + "step": 624 + }, + { + "epoch": 0.03483640822696617, + "grad_norm": 0.49787190556526184, + "learning_rate": 9.991627814562494e-05, + "loss": 1.9146, + "step": 625 + }, + { + "epoch": 0.03489214648012931, + "grad_norm": 0.5156252384185791, + "learning_rate": 9.991576228651915e-05, + "loss": 1.9453, + "step": 626 + }, + { + "epoch": 0.03494788473329246, + "grad_norm": 0.45635107159614563, + "learning_rate": 9.991524484437642e-05, + "loss": 1.7143, + "step": 627 + }, + { + "epoch": 0.0350036229864556, + "grad_norm": 0.48797038197517395, + "learning_rate": 9.991472581921316e-05, + "loss": 1.7371, + "step": 628 + }, + { + "epoch": 0.03505936123961875, + "grad_norm": 0.549708366394043, + "learning_rate": 9.99142052110458e-05, + "loss": 1.9569, + "step": 629 + }, + { + "epoch": 0.035115099492781895, + "grad_norm": 0.4693654179573059, + "learning_rate": 9.991368301989088e-05, + "loss": 1.4609, + "step": 630 + }, + { + "epoch": 0.035170837745945045, + "grad_norm": 0.5259846448898315, + "learning_rate": 9.991315924576495e-05, + "loss": 1.7577, + "step": 631 + }, + { + "epoch": 0.03522657599910819, + "grad_norm": 0.49805745482444763, + "learning_rate": 9.991263388868461e-05, + "loss": 1.8534, + "step": 632 + }, + { + "epoch": 0.03528231425227133, + "grad_norm": 0.4565132260322571, + "learning_rate": 9.991210694866654e-05, + "loss": 1.6853, + "step": 633 + }, + { + "epoch": 0.03533805250543448, + "grad_norm": 0.5158933401107788, + "learning_rate": 9.991157842572747e-05, + "loss": 1.8088, + "step": 634 + }, + { + "epoch": 0.035393790758597624, + "grad_norm": 0.49667277932167053, + "learning_rate": 9.991104831988412e-05, + "loss": 1.9148, + "step": 635 + }, + { + "epoch": 0.03544952901176077, + "grad_norm": 0.48701363801956177, + "learning_rate": 9.991051663115331e-05, + "loss": 1.7816, + "step": 636 + }, + { + "epoch": 0.035505267264923916, + "grad_norm": 0.5608890056610107, + "learning_rate": 9.990998335955193e-05, + "loss": 1.8764, + "step": 637 + }, + { + "epoch": 0.035561005518087066, + "grad_norm": 0.49871060252189636, + "learning_rate": 9.990944850509685e-05, + "loss": 1.6103, + "step": 638 + }, + { + "epoch": 0.03561674377125021, + "grad_norm": 0.46610593795776367, + "learning_rate": 9.990891206780506e-05, + "loss": 1.7798, + "step": 639 + }, + { + "epoch": 0.03567248202441335, + "grad_norm": 0.5284513831138611, + "learning_rate": 9.990837404769358e-05, + "loss": 1.8771, + "step": 640 + }, + { + "epoch": 0.0357282202775765, + "grad_norm": 0.5929260849952698, + "learning_rate": 9.990783444477946e-05, + "loss": 2.0712, + "step": 641 + }, + { + "epoch": 0.035783958530739644, + "grad_norm": 0.5146616697311401, + "learning_rate": 9.990729325907981e-05, + "loss": 1.7693, + "step": 642 + }, + { + "epoch": 0.035839696783902794, + "grad_norm": 0.5243765711784363, + "learning_rate": 9.99067504906118e-05, + "loss": 1.8675, + "step": 643 + }, + { + "epoch": 0.03589543503706594, + "grad_norm": 0.48738136887550354, + "learning_rate": 9.990620613939263e-05, + "loss": 1.7557, + "step": 644 + }, + { + "epoch": 0.03595117329022909, + "grad_norm": 0.5006791353225708, + "learning_rate": 9.990566020543959e-05, + "loss": 1.7199, + "step": 645 + }, + { + "epoch": 0.03600691154339223, + "grad_norm": 0.5283340811729431, + "learning_rate": 9.990511268876998e-05, + "loss": 1.9156, + "step": 646 + }, + { + "epoch": 0.03606264979655538, + "grad_norm": 0.47615885734558105, + "learning_rate": 9.990456358940115e-05, + "loss": 1.6183, + "step": 647 + }, + { + "epoch": 0.03611838804971852, + "grad_norm": 0.48326513171195984, + "learning_rate": 9.990401290735053e-05, + "loss": 1.8159, + "step": 648 + }, + { + "epoch": 0.036174126302881665, + "grad_norm": 0.489183247089386, + "learning_rate": 9.990346064263558e-05, + "loss": 1.9306, + "step": 649 + }, + { + "epoch": 0.036229864556044815, + "grad_norm": 0.44880211353302, + "learning_rate": 9.990290679527382e-05, + "loss": 1.4257, + "step": 650 + }, + { + "epoch": 0.03628560280920796, + "grad_norm": 0.49666327238082886, + "learning_rate": 9.990235136528281e-05, + "loss": 1.6587, + "step": 651 + }, + { + "epoch": 0.03634134106237111, + "grad_norm": 0.5396116971969604, + "learning_rate": 9.990179435268017e-05, + "loss": 1.9138, + "step": 652 + }, + { + "epoch": 0.03639707931553425, + "grad_norm": 0.512506902217865, + "learning_rate": 9.990123575748355e-05, + "loss": 2.0153, + "step": 653 + }, + { + "epoch": 0.0364528175686974, + "grad_norm": 0.48785391449928284, + "learning_rate": 9.990067557971068e-05, + "loss": 1.9489, + "step": 654 + }, + { + "epoch": 0.03650855582186054, + "grad_norm": 0.49123311042785645, + "learning_rate": 9.990011381937933e-05, + "loss": 1.6926, + "step": 655 + }, + { + "epoch": 0.036564294075023686, + "grad_norm": 0.4744409918785095, + "learning_rate": 9.98995504765073e-05, + "loss": 1.7961, + "step": 656 + }, + { + "epoch": 0.036620032328186836, + "grad_norm": 0.5175344944000244, + "learning_rate": 9.989898555111245e-05, + "loss": 1.8846, + "step": 657 + }, + { + "epoch": 0.03667577058134998, + "grad_norm": 0.4825249910354614, + "learning_rate": 9.989841904321274e-05, + "loss": 1.7094, + "step": 658 + }, + { + "epoch": 0.03673150883451313, + "grad_norm": 0.5392758250236511, + "learning_rate": 9.989785095282609e-05, + "loss": 1.8777, + "step": 659 + }, + { + "epoch": 0.03678724708767627, + "grad_norm": 0.5122122764587402, + "learning_rate": 9.989728127997052e-05, + "loss": 1.8686, + "step": 660 + }, + { + "epoch": 0.03684298534083942, + "grad_norm": 0.4976766109466553, + "learning_rate": 9.989671002466412e-05, + "loss": 1.7542, + "step": 661 + }, + { + "epoch": 0.036898723594002564, + "grad_norm": 0.4618877172470093, + "learning_rate": 9.989613718692501e-05, + "loss": 1.4741, + "step": 662 + }, + { + "epoch": 0.03695446184716571, + "grad_norm": 0.4870270788669586, + "learning_rate": 9.989556276677133e-05, + "loss": 1.6816, + "step": 663 + }, + { + "epoch": 0.037010200100328856, + "grad_norm": 0.5549145936965942, + "learning_rate": 9.989498676422131e-05, + "loss": 1.8716, + "step": 664 + }, + { + "epoch": 0.037065938353492, + "grad_norm": 0.501438319683075, + "learning_rate": 9.989440917929321e-05, + "loss": 1.7686, + "step": 665 + }, + { + "epoch": 0.03712167660665515, + "grad_norm": 0.5713873505592346, + "learning_rate": 9.989383001200536e-05, + "loss": 2.116, + "step": 666 + }, + { + "epoch": 0.03717741485981829, + "grad_norm": 0.4839586615562439, + "learning_rate": 9.989324926237613e-05, + "loss": 1.8245, + "step": 667 + }, + { + "epoch": 0.03723315311298144, + "grad_norm": 0.5154809355735779, + "learning_rate": 9.989266693042394e-05, + "loss": 1.661, + "step": 668 + }, + { + "epoch": 0.037288891366144584, + "grad_norm": 0.4965420365333557, + "learning_rate": 9.989208301616724e-05, + "loss": 1.6531, + "step": 669 + }, + { + "epoch": 0.037344629619307734, + "grad_norm": 0.4850505292415619, + "learning_rate": 9.989149751962455e-05, + "loss": 1.8691, + "step": 670 + }, + { + "epoch": 0.03740036787247088, + "grad_norm": 0.47275611758232117, + "learning_rate": 9.989091044081445e-05, + "loss": 1.7718, + "step": 671 + }, + { + "epoch": 0.03745610612563402, + "grad_norm": 0.5606955885887146, + "learning_rate": 9.989032177975554e-05, + "loss": 2.2129, + "step": 672 + }, + { + "epoch": 0.03751184437879717, + "grad_norm": 0.49657538533210754, + "learning_rate": 9.988973153646654e-05, + "loss": 1.9084, + "step": 673 + }, + { + "epoch": 0.03756758263196031, + "grad_norm": 0.5135958790779114, + "learning_rate": 9.988913971096611e-05, + "loss": 1.9491, + "step": 674 + }, + { + "epoch": 0.03762332088512346, + "grad_norm": 0.48900923132896423, + "learning_rate": 9.988854630327305e-05, + "loss": 1.7176, + "step": 675 + }, + { + "epoch": 0.037679059138286605, + "grad_norm": 0.463521808385849, + "learning_rate": 9.988795131340616e-05, + "loss": 1.5625, + "step": 676 + }, + { + "epoch": 0.037734797391449755, + "grad_norm": 0.48082444071769714, + "learning_rate": 9.988735474138433e-05, + "loss": 1.7208, + "step": 677 + }, + { + "epoch": 0.0377905356446129, + "grad_norm": 0.5012754201889038, + "learning_rate": 9.988675658722648e-05, + "loss": 1.9678, + "step": 678 + }, + { + "epoch": 0.03784627389777604, + "grad_norm": 0.5888019800186157, + "learning_rate": 9.988615685095155e-05, + "loss": 2.2239, + "step": 679 + }, + { + "epoch": 0.03790201215093919, + "grad_norm": 0.47830748558044434, + "learning_rate": 9.98855555325786e-05, + "loss": 1.6574, + "step": 680 + }, + { + "epoch": 0.03795775040410233, + "grad_norm": 0.47648170590400696, + "learning_rate": 9.988495263212667e-05, + "loss": 1.6185, + "step": 681 + }, + { + "epoch": 0.03801348865726548, + "grad_norm": 0.5321143269538879, + "learning_rate": 9.98843481496149e-05, + "loss": 2.0788, + "step": 682 + }, + { + "epoch": 0.038069226910428626, + "grad_norm": 0.4451909363269806, + "learning_rate": 9.988374208506243e-05, + "loss": 1.7213, + "step": 683 + }, + { + "epoch": 0.038124965163591776, + "grad_norm": 0.4888899028301239, + "learning_rate": 9.988313443848853e-05, + "loss": 1.9524, + "step": 684 + }, + { + "epoch": 0.03818070341675492, + "grad_norm": 0.5075884461402893, + "learning_rate": 9.988252520991244e-05, + "loss": 1.9489, + "step": 685 + }, + { + "epoch": 0.03823644166991806, + "grad_norm": 0.5244428515434265, + "learning_rate": 9.988191439935348e-05, + "loss": 1.8805, + "step": 686 + }, + { + "epoch": 0.03829217992308121, + "grad_norm": 0.5269452333450317, + "learning_rate": 9.988130200683103e-05, + "loss": 1.916, + "step": 687 + }, + { + "epoch": 0.038347918176244354, + "grad_norm": 0.40096086263656616, + "learning_rate": 9.98806880323645e-05, + "loss": 1.3248, + "step": 688 + }, + { + "epoch": 0.038403656429407504, + "grad_norm": 0.555325984954834, + "learning_rate": 9.988007247597337e-05, + "loss": 1.945, + "step": 689 + }, + { + "epoch": 0.03845939468257065, + "grad_norm": 0.4987097382545471, + "learning_rate": 9.987945533767717e-05, + "loss": 1.9159, + "step": 690 + }, + { + "epoch": 0.0385151329357338, + "grad_norm": 0.46860477328300476, + "learning_rate": 9.987883661749548e-05, + "loss": 1.7105, + "step": 691 + }, + { + "epoch": 0.03857087118889694, + "grad_norm": 0.4867911636829376, + "learning_rate": 9.987821631544789e-05, + "loss": 1.6607, + "step": 692 + }, + { + "epoch": 0.03862660944206009, + "grad_norm": 0.5149185061454773, + "learning_rate": 9.987759443155409e-05, + "loss": 1.8422, + "step": 693 + }, + { + "epoch": 0.03868234769522323, + "grad_norm": 0.508399248123169, + "learning_rate": 9.98769709658338e-05, + "loss": 1.8393, + "step": 694 + }, + { + "epoch": 0.038738085948386375, + "grad_norm": 0.4841381907463074, + "learning_rate": 9.987634591830679e-05, + "loss": 1.8819, + "step": 695 + }, + { + "epoch": 0.038793824201549525, + "grad_norm": 0.4869403541088104, + "learning_rate": 9.987571928899288e-05, + "loss": 1.7872, + "step": 696 + }, + { + "epoch": 0.03884956245471267, + "grad_norm": 0.49572715163230896, + "learning_rate": 9.987509107791196e-05, + "loss": 1.8078, + "step": 697 + }, + { + "epoch": 0.03890530070787582, + "grad_norm": 0.5188158750534058, + "learning_rate": 9.987446128508396e-05, + "loss": 1.7838, + "step": 698 + }, + { + "epoch": 0.03896103896103896, + "grad_norm": 0.4589369595050812, + "learning_rate": 9.98738299105288e-05, + "loss": 1.7299, + "step": 699 + }, + { + "epoch": 0.03901677721420211, + "grad_norm": 0.5023289322853088, + "learning_rate": 9.987319695426657e-05, + "loss": 1.7414, + "step": 700 + }, + { + "epoch": 0.03907251546736525, + "grad_norm": 0.5241897702217102, + "learning_rate": 9.98725624163173e-05, + "loss": 1.8223, + "step": 701 + }, + { + "epoch": 0.039128253720528396, + "grad_norm": 0.4720919728279114, + "learning_rate": 9.987192629670112e-05, + "loss": 1.791, + "step": 702 + }, + { + "epoch": 0.039183991973691545, + "grad_norm": 0.5045210719108582, + "learning_rate": 9.987128859543824e-05, + "loss": 1.7428, + "step": 703 + }, + { + "epoch": 0.03923973022685469, + "grad_norm": 0.5130773782730103, + "learning_rate": 9.987064931254884e-05, + "loss": 1.6701, + "step": 704 + }, + { + "epoch": 0.03929546848001784, + "grad_norm": 0.5155162215232849, + "learning_rate": 9.987000844805319e-05, + "loss": 1.9592, + "step": 705 + }, + { + "epoch": 0.03935120673318098, + "grad_norm": 0.46410509943962097, + "learning_rate": 9.986936600197165e-05, + "loss": 1.786, + "step": 706 + }, + { + "epoch": 0.03940694498634413, + "grad_norm": 0.5000941157341003, + "learning_rate": 9.986872197432459e-05, + "loss": 1.7937, + "step": 707 + }, + { + "epoch": 0.039462683239507274, + "grad_norm": 0.4663851261138916, + "learning_rate": 9.986807636513241e-05, + "loss": 1.8019, + "step": 708 + }, + { + "epoch": 0.039518421492670416, + "grad_norm": 0.5445390343666077, + "learning_rate": 9.986742917441561e-05, + "loss": 1.9214, + "step": 709 + }, + { + "epoch": 0.039574159745833566, + "grad_norm": 0.49968406558036804, + "learning_rate": 9.986678040219469e-05, + "loss": 1.7621, + "step": 710 + }, + { + "epoch": 0.03962989799899671, + "grad_norm": 0.514168381690979, + "learning_rate": 9.986613004849024e-05, + "loss": 1.7435, + "step": 711 + }, + { + "epoch": 0.03968563625215986, + "grad_norm": 0.4899461269378662, + "learning_rate": 9.986547811332289e-05, + "loss": 1.7199, + "step": 712 + }, + { + "epoch": 0.039741374505323, + "grad_norm": 0.5172072052955627, + "learning_rate": 9.986482459671332e-05, + "loss": 1.9435, + "step": 713 + }, + { + "epoch": 0.03979711275848615, + "grad_norm": 0.5198094844818115, + "learning_rate": 9.986416949868223e-05, + "loss": 1.799, + "step": 714 + }, + { + "epoch": 0.039852851011649294, + "grad_norm": 0.47976863384246826, + "learning_rate": 9.986351281925042e-05, + "loss": 1.8455, + "step": 715 + }, + { + "epoch": 0.039908589264812444, + "grad_norm": 0.4702402949333191, + "learning_rate": 9.986285455843872e-05, + "loss": 1.5848, + "step": 716 + }, + { + "epoch": 0.03996432751797559, + "grad_norm": 0.4698415994644165, + "learning_rate": 9.986219471626797e-05, + "loss": 1.6527, + "step": 717 + }, + { + "epoch": 0.04002006577113873, + "grad_norm": 0.5518625974655151, + "learning_rate": 9.986153329275913e-05, + "loss": 1.8773, + "step": 718 + }, + { + "epoch": 0.04007580402430188, + "grad_norm": 0.5149457454681396, + "learning_rate": 9.986087028793316e-05, + "loss": 1.8737, + "step": 719 + }, + { + "epoch": 0.04013154227746502, + "grad_norm": 0.527282178401947, + "learning_rate": 9.98602057018111e-05, + "loss": 1.9581, + "step": 720 + }, + { + "epoch": 0.04018728053062817, + "grad_norm": 0.48371025919914246, + "learning_rate": 9.985953953441402e-05, + "loss": 1.887, + "step": 721 + }, + { + "epoch": 0.040243018783791315, + "grad_norm": 0.5474866032600403, + "learning_rate": 9.985887178576305e-05, + "loss": 1.9981, + "step": 722 + }, + { + "epoch": 0.040298757036954465, + "grad_norm": 0.5417437553405762, + "learning_rate": 9.985820245587936e-05, + "loss": 2.0195, + "step": 723 + }, + { + "epoch": 0.04035449529011761, + "grad_norm": 0.458363801240921, + "learning_rate": 9.985753154478418e-05, + "loss": 1.6134, + "step": 724 + }, + { + "epoch": 0.04041023354328075, + "grad_norm": 0.49649447202682495, + "learning_rate": 9.98568590524988e-05, + "loss": 1.7501, + "step": 725 + }, + { + "epoch": 0.0404659717964439, + "grad_norm": 0.5304057002067566, + "learning_rate": 9.985618497904453e-05, + "loss": 1.9164, + "step": 726 + }, + { + "epoch": 0.04052171004960704, + "grad_norm": 0.4757838249206543, + "learning_rate": 9.985550932444275e-05, + "loss": 1.8159, + "step": 727 + }, + { + "epoch": 0.04057744830277019, + "grad_norm": 0.48324036598205566, + "learning_rate": 9.98548320887149e-05, + "loss": 1.6184, + "step": 728 + }, + { + "epoch": 0.040633186555933336, + "grad_norm": 0.5059638023376465, + "learning_rate": 9.985415327188245e-05, + "loss": 1.8383, + "step": 729 + }, + { + "epoch": 0.040688924809096486, + "grad_norm": 0.4717106819152832, + "learning_rate": 9.985347287396692e-05, + "loss": 1.67, + "step": 730 + }, + { + "epoch": 0.04074466306225963, + "grad_norm": 0.4953088164329529, + "learning_rate": 9.98527908949899e-05, + "loss": 1.8185, + "step": 731 + }, + { + "epoch": 0.04080040131542277, + "grad_norm": 0.49030283093452454, + "learning_rate": 9.985210733497301e-05, + "loss": 1.7909, + "step": 732 + }, + { + "epoch": 0.04085613956858592, + "grad_norm": 0.5224010944366455, + "learning_rate": 9.985142219393795e-05, + "loss": 1.8615, + "step": 733 + }, + { + "epoch": 0.040911877821749064, + "grad_norm": 0.5008676648139954, + "learning_rate": 9.985073547190641e-05, + "loss": 1.9337, + "step": 734 + }, + { + "epoch": 0.040967616074912214, + "grad_norm": 0.4777420163154602, + "learning_rate": 9.98500471689002e-05, + "loss": 1.8345, + "step": 735 + }, + { + "epoch": 0.04102335432807536, + "grad_norm": 0.4995800852775574, + "learning_rate": 9.984935728494113e-05, + "loss": 1.843, + "step": 736 + }, + { + "epoch": 0.041079092581238506, + "grad_norm": 0.5097813010215759, + "learning_rate": 9.984866582005111e-05, + "loss": 1.9642, + "step": 737 + }, + { + "epoch": 0.04113483083440165, + "grad_norm": 0.4956590533256531, + "learning_rate": 9.984797277425204e-05, + "loss": 1.8874, + "step": 738 + }, + { + "epoch": 0.0411905690875648, + "grad_norm": 0.5304232239723206, + "learning_rate": 9.98472781475659e-05, + "loss": 1.9269, + "step": 739 + }, + { + "epoch": 0.04124630734072794, + "grad_norm": 0.5134212374687195, + "learning_rate": 9.984658194001474e-05, + "loss": 1.5059, + "step": 740 + }, + { + "epoch": 0.041302045593891085, + "grad_norm": 0.4551413953304291, + "learning_rate": 9.984588415162061e-05, + "loss": 1.7386, + "step": 741 + }, + { + "epoch": 0.041357783847054234, + "grad_norm": 0.5477944612503052, + "learning_rate": 9.984518478240568e-05, + "loss": 1.9075, + "step": 742 + }, + { + "epoch": 0.04141352210021738, + "grad_norm": 0.4997386038303375, + "learning_rate": 9.98444838323921e-05, + "loss": 1.7812, + "step": 743 + }, + { + "epoch": 0.04146926035338053, + "grad_norm": 0.5239866971969604, + "learning_rate": 9.984378130160208e-05, + "loss": 1.9155, + "step": 744 + }, + { + "epoch": 0.04152499860654367, + "grad_norm": 0.46206948161125183, + "learning_rate": 9.984307719005795e-05, + "loss": 1.6661, + "step": 745 + }, + { + "epoch": 0.04158073685970682, + "grad_norm": 0.4978305399417877, + "learning_rate": 9.984237149778201e-05, + "loss": 1.8456, + "step": 746 + }, + { + "epoch": 0.04163647511286996, + "grad_norm": 0.50936359167099, + "learning_rate": 9.984166422479663e-05, + "loss": 1.9118, + "step": 747 + }, + { + "epoch": 0.041692213366033105, + "grad_norm": 0.49744611978530884, + "learning_rate": 9.984095537112429e-05, + "loss": 1.7721, + "step": 748 + }, + { + "epoch": 0.041747951619196255, + "grad_norm": 0.536056637763977, + "learning_rate": 9.984024493678743e-05, + "loss": 1.7968, + "step": 749 + }, + { + "epoch": 0.0418036898723594, + "grad_norm": 0.5262266993522644, + "learning_rate": 9.983953292180857e-05, + "loss": 1.858, + "step": 750 + }, + { + "epoch": 0.04185942812552255, + "grad_norm": 0.5085186958312988, + "learning_rate": 9.983881932621033e-05, + "loss": 1.751, + "step": 751 + }, + { + "epoch": 0.04191516637868569, + "grad_norm": 0.4641915261745453, + "learning_rate": 9.983810415001531e-05, + "loss": 1.5998, + "step": 752 + }, + { + "epoch": 0.04197090463184884, + "grad_norm": 0.5268242955207825, + "learning_rate": 9.983738739324621e-05, + "loss": 1.7263, + "step": 753 + }, + { + "epoch": 0.04202664288501198, + "grad_norm": 0.5283384919166565, + "learning_rate": 9.983666905592576e-05, + "loss": 1.9334, + "step": 754 + }, + { + "epoch": 0.042082381138175126, + "grad_norm": 0.5007447600364685, + "learning_rate": 9.983594913807672e-05, + "loss": 1.6944, + "step": 755 + }, + { + "epoch": 0.042138119391338276, + "grad_norm": 0.5626598596572876, + "learning_rate": 9.983522763972196e-05, + "loss": 2.042, + "step": 756 + }, + { + "epoch": 0.04219385764450142, + "grad_norm": 0.46739470958709717, + "learning_rate": 9.983450456088432e-05, + "loss": 1.6733, + "step": 757 + }, + { + "epoch": 0.04224959589766457, + "grad_norm": 0.5124320983886719, + "learning_rate": 9.983377990158676e-05, + "loss": 1.8463, + "step": 758 + }, + { + "epoch": 0.04230533415082771, + "grad_norm": 0.4762093722820282, + "learning_rate": 9.983305366185223e-05, + "loss": 1.7602, + "step": 759 + }, + { + "epoch": 0.04236107240399086, + "grad_norm": 0.5182420015335083, + "learning_rate": 9.983232584170381e-05, + "loss": 1.8644, + "step": 760 + }, + { + "epoch": 0.042416810657154004, + "grad_norm": 0.4640427231788635, + "learning_rate": 9.983159644116454e-05, + "loss": 1.6919, + "step": 761 + }, + { + "epoch": 0.042472548910317154, + "grad_norm": 0.4894956946372986, + "learning_rate": 9.983086546025759e-05, + "loss": 1.9491, + "step": 762 + }, + { + "epoch": 0.0425282871634803, + "grad_norm": 0.49869638681411743, + "learning_rate": 9.98301328990061e-05, + "loss": 1.9184, + "step": 763 + }, + { + "epoch": 0.04258402541664344, + "grad_norm": 0.5161083936691284, + "learning_rate": 9.982939875743333e-05, + "loss": 1.826, + "step": 764 + }, + { + "epoch": 0.04263976366980659, + "grad_norm": 0.4913845956325531, + "learning_rate": 9.982866303556258e-05, + "loss": 1.7675, + "step": 765 + }, + { + "epoch": 0.04269550192296973, + "grad_norm": 0.49277618527412415, + "learning_rate": 9.982792573341713e-05, + "loss": 1.8539, + "step": 766 + }, + { + "epoch": 0.04275124017613288, + "grad_norm": 0.5222828388214111, + "learning_rate": 9.982718685102039e-05, + "loss": 1.9196, + "step": 767 + }, + { + "epoch": 0.042806978429296025, + "grad_norm": 0.5137212872505188, + "learning_rate": 9.982644638839583e-05, + "loss": 1.8719, + "step": 768 + }, + { + "epoch": 0.042862716682459175, + "grad_norm": 0.646440327167511, + "learning_rate": 9.982570434556686e-05, + "loss": 1.9678, + "step": 769 + }, + { + "epoch": 0.04291845493562232, + "grad_norm": 0.4992925524711609, + "learning_rate": 9.982496072255708e-05, + "loss": 1.7078, + "step": 770 + }, + { + "epoch": 0.04297419318878546, + "grad_norm": 0.4863613247871399, + "learning_rate": 9.982421551939003e-05, + "loss": 1.8064, + "step": 771 + }, + { + "epoch": 0.04302993144194861, + "grad_norm": 0.4646783769130707, + "learning_rate": 9.982346873608937e-05, + "loss": 1.6427, + "step": 772 + }, + { + "epoch": 0.04308566969511175, + "grad_norm": 0.5143455266952515, + "learning_rate": 9.982272037267877e-05, + "loss": 1.7367, + "step": 773 + }, + { + "epoch": 0.0431414079482749, + "grad_norm": 0.4936600923538208, + "learning_rate": 9.982197042918195e-05, + "loss": 1.6834, + "step": 774 + }, + { + "epoch": 0.043197146201438046, + "grad_norm": 0.4923505485057831, + "learning_rate": 9.982121890562273e-05, + "loss": 1.7545, + "step": 775 + }, + { + "epoch": 0.043252884454601195, + "grad_norm": 0.5399130582809448, + "learning_rate": 9.982046580202493e-05, + "loss": 1.784, + "step": 776 + }, + { + "epoch": 0.04330862270776434, + "grad_norm": 0.49087220430374146, + "learning_rate": 9.98197111184124e-05, + "loss": 1.8328, + "step": 777 + }, + { + "epoch": 0.04336436096092749, + "grad_norm": 0.5504277348518372, + "learning_rate": 9.981895485480912e-05, + "loss": 1.8808, + "step": 778 + }, + { + "epoch": 0.04342009921409063, + "grad_norm": 0.45953568816185, + "learning_rate": 9.981819701123907e-05, + "loss": 1.577, + "step": 779 + }, + { + "epoch": 0.043475837467253774, + "grad_norm": 0.4762939214706421, + "learning_rate": 9.981743758772625e-05, + "loss": 1.6959, + "step": 780 + }, + { + "epoch": 0.043531575720416923, + "grad_norm": 0.4667057991027832, + "learning_rate": 9.981667658429477e-05, + "loss": 1.6635, + "step": 781 + }, + { + "epoch": 0.043587313973580066, + "grad_norm": 0.5036124587059021, + "learning_rate": 9.981591400096877e-05, + "loss": 1.854, + "step": 782 + }, + { + "epoch": 0.043643052226743216, + "grad_norm": 0.48234641551971436, + "learning_rate": 9.98151498377724e-05, + "loss": 1.781, + "step": 783 + }, + { + "epoch": 0.04369879047990636, + "grad_norm": 0.4990682005882263, + "learning_rate": 9.981438409472994e-05, + "loss": 1.6629, + "step": 784 + }, + { + "epoch": 0.04375452873306951, + "grad_norm": 0.4655357599258423, + "learning_rate": 9.981361677186566e-05, + "loss": 1.7694, + "step": 785 + }, + { + "epoch": 0.04381026698623265, + "grad_norm": 0.4690426290035248, + "learning_rate": 9.981284786920388e-05, + "loss": 1.7242, + "step": 786 + }, + { + "epoch": 0.043866005239395794, + "grad_norm": 0.46350887417793274, + "learning_rate": 9.981207738676899e-05, + "loss": 1.6032, + "step": 787 + }, + { + "epoch": 0.043921743492558944, + "grad_norm": 0.5220307111740112, + "learning_rate": 9.981130532458544e-05, + "loss": 1.8624, + "step": 788 + }, + { + "epoch": 0.04397748174572209, + "grad_norm": 0.465497761964798, + "learning_rate": 9.98105316826777e-05, + "loss": 1.6831, + "step": 789 + }, + { + "epoch": 0.04403321999888524, + "grad_norm": 0.4893016219139099, + "learning_rate": 9.980975646107032e-05, + "loss": 1.7933, + "step": 790 + }, + { + "epoch": 0.04408895825204838, + "grad_norm": 0.4457073509693146, + "learning_rate": 9.980897965978787e-05, + "loss": 1.6383, + "step": 791 + }, + { + "epoch": 0.04414469650521153, + "grad_norm": 0.5064904093742371, + "learning_rate": 9.980820127885497e-05, + "loss": 1.8771, + "step": 792 + }, + { + "epoch": 0.04420043475837467, + "grad_norm": 0.5663847327232361, + "learning_rate": 9.980742131829635e-05, + "loss": 2.0977, + "step": 793 + }, + { + "epoch": 0.044256173011537815, + "grad_norm": 0.558462381362915, + "learning_rate": 9.980663977813672e-05, + "loss": 1.9813, + "step": 794 + }, + { + "epoch": 0.044311911264700965, + "grad_norm": 0.5043233633041382, + "learning_rate": 9.980585665840087e-05, + "loss": 1.7362, + "step": 795 + }, + { + "epoch": 0.04436764951786411, + "grad_norm": 0.5110850930213928, + "learning_rate": 9.980507195911363e-05, + "loss": 1.5489, + "step": 796 + }, + { + "epoch": 0.04442338777102726, + "grad_norm": 0.5611404180526733, + "learning_rate": 9.980428568029989e-05, + "loss": 1.9545, + "step": 797 + }, + { + "epoch": 0.0444791260241904, + "grad_norm": 0.44059324264526367, + "learning_rate": 9.98034978219846e-05, + "loss": 1.6321, + "step": 798 + }, + { + "epoch": 0.04453486427735355, + "grad_norm": 0.5034955143928528, + "learning_rate": 9.980270838419273e-05, + "loss": 1.7045, + "step": 799 + }, + { + "epoch": 0.04459060253051669, + "grad_norm": 0.49383604526519775, + "learning_rate": 9.98019173669493e-05, + "loss": 1.6414, + "step": 800 + }, + { + "epoch": 0.04464634078367984, + "grad_norm": 0.5035958290100098, + "learning_rate": 9.980112477027942e-05, + "loss": 1.8683, + "step": 801 + }, + { + "epoch": 0.044702079036842986, + "grad_norm": 0.4942208230495453, + "learning_rate": 9.980033059420826e-05, + "loss": 1.7773, + "step": 802 + }, + { + "epoch": 0.04475781729000613, + "grad_norm": 0.5211103558540344, + "learning_rate": 9.979953483876095e-05, + "loss": 2.0631, + "step": 803 + }, + { + "epoch": 0.04481355554316928, + "grad_norm": 0.5940659046173096, + "learning_rate": 9.979873750396273e-05, + "loss": 2.0601, + "step": 804 + }, + { + "epoch": 0.04486929379633242, + "grad_norm": 0.5211898684501648, + "learning_rate": 9.979793858983891e-05, + "loss": 1.7687, + "step": 805 + }, + { + "epoch": 0.04492503204949557, + "grad_norm": 0.5175243020057678, + "learning_rate": 9.979713809641482e-05, + "loss": 1.9662, + "step": 806 + }, + { + "epoch": 0.044980770302658714, + "grad_norm": 0.5139010548591614, + "learning_rate": 9.979633602371586e-05, + "loss": 1.7011, + "step": 807 + }, + { + "epoch": 0.045036508555821864, + "grad_norm": 0.4817015826702118, + "learning_rate": 9.979553237176744e-05, + "loss": 1.7632, + "step": 808 + }, + { + "epoch": 0.045092246808985006, + "grad_norm": 0.49766993522644043, + "learning_rate": 9.979472714059506e-05, + "loss": 1.917, + "step": 809 + }, + { + "epoch": 0.04514798506214815, + "grad_norm": 0.5208562612533569, + "learning_rate": 9.979392033022427e-05, + "loss": 1.946, + "step": 810 + }, + { + "epoch": 0.0452037233153113, + "grad_norm": 0.4790688753128052, + "learning_rate": 9.979311194068064e-05, + "loss": 1.8072, + "step": 811 + }, + { + "epoch": 0.04525946156847444, + "grad_norm": 0.46075010299682617, + "learning_rate": 9.979230197198981e-05, + "loss": 1.6243, + "step": 812 + }, + { + "epoch": 0.04531519982163759, + "grad_norm": 0.488349974155426, + "learning_rate": 9.979149042417749e-05, + "loss": 1.7733, + "step": 813 + }, + { + "epoch": 0.045370938074800735, + "grad_norm": 0.4905661940574646, + "learning_rate": 9.979067729726938e-05, + "loss": 1.821, + "step": 814 + }, + { + "epoch": 0.045426676327963884, + "grad_norm": 0.5073617696762085, + "learning_rate": 9.978986259129129e-05, + "loss": 1.8286, + "step": 815 + }, + { + "epoch": 0.04548241458112703, + "grad_norm": 0.5074631571769714, + "learning_rate": 9.978904630626904e-05, + "loss": 1.7967, + "step": 816 + }, + { + "epoch": 0.04553815283429017, + "grad_norm": 0.5455936193466187, + "learning_rate": 9.978822844222855e-05, + "loss": 1.9883, + "step": 817 + }, + { + "epoch": 0.04559389108745332, + "grad_norm": 0.5111860632896423, + "learning_rate": 9.978740899919574e-05, + "loss": 1.8694, + "step": 818 + }, + { + "epoch": 0.04564962934061646, + "grad_norm": 0.4975983202457428, + "learning_rate": 9.978658797719658e-05, + "loss": 1.714, + "step": 819 + }, + { + "epoch": 0.04570536759377961, + "grad_norm": 0.4770795702934265, + "learning_rate": 9.978576537625714e-05, + "loss": 1.8288, + "step": 820 + }, + { + "epoch": 0.045761105846942755, + "grad_norm": 0.5559741854667664, + "learning_rate": 9.97849411964035e-05, + "loss": 2.093, + "step": 821 + }, + { + "epoch": 0.045816844100105905, + "grad_norm": 0.4961313307285309, + "learning_rate": 9.978411543766177e-05, + "loss": 1.6607, + "step": 822 + }, + { + "epoch": 0.04587258235326905, + "grad_norm": 0.5356935262680054, + "learning_rate": 9.978328810005816e-05, + "loss": 1.9762, + "step": 823 + }, + { + "epoch": 0.0459283206064322, + "grad_norm": 0.4933258295059204, + "learning_rate": 9.978245918361893e-05, + "loss": 1.6018, + "step": 824 + }, + { + "epoch": 0.04598405885959534, + "grad_norm": 0.5278127193450928, + "learning_rate": 9.978162868837034e-05, + "loss": 1.8532, + "step": 825 + }, + { + "epoch": 0.04603979711275848, + "grad_norm": 0.4802572429180145, + "learning_rate": 9.978079661433873e-05, + "loss": 1.7551, + "step": 826 + }, + { + "epoch": 0.04609553536592163, + "grad_norm": 0.4906105101108551, + "learning_rate": 9.977996296155049e-05, + "loss": 1.7463, + "step": 827 + }, + { + "epoch": 0.046151273619084776, + "grad_norm": 0.43020668625831604, + "learning_rate": 9.977912773003206e-05, + "loss": 1.6216, + "step": 828 + }, + { + "epoch": 0.046207011872247926, + "grad_norm": 0.49433162808418274, + "learning_rate": 9.977829091980995e-05, + "loss": 1.9011, + "step": 829 + }, + { + "epoch": 0.04626275012541107, + "grad_norm": 0.45222243666648865, + "learning_rate": 9.977745253091067e-05, + "loss": 1.3583, + "step": 830 + }, + { + "epoch": 0.04631848837857422, + "grad_norm": 0.4955357015132904, + "learning_rate": 9.977661256336081e-05, + "loss": 1.7256, + "step": 831 + }, + { + "epoch": 0.04637422663173736, + "grad_norm": 0.5137125253677368, + "learning_rate": 9.977577101718701e-05, + "loss": 1.8484, + "step": 832 + }, + { + "epoch": 0.046429964884900504, + "grad_norm": 0.49741753935813904, + "learning_rate": 9.977492789241598e-05, + "loss": 1.6564, + "step": 833 + }, + { + "epoch": 0.046485703138063654, + "grad_norm": 0.4994182586669922, + "learning_rate": 9.977408318907444e-05, + "loss": 1.721, + "step": 834 + }, + { + "epoch": 0.0465414413912268, + "grad_norm": 0.539135754108429, + "learning_rate": 9.97732369071892e-05, + "loss": 2.0474, + "step": 835 + }, + { + "epoch": 0.04659717964438995, + "grad_norm": 0.49502313137054443, + "learning_rate": 9.977238904678707e-05, + "loss": 1.4078, + "step": 836 + }, + { + "epoch": 0.04665291789755309, + "grad_norm": 0.4542715549468994, + "learning_rate": 9.977153960789497e-05, + "loss": 1.5402, + "step": 837 + }, + { + "epoch": 0.04670865615071624, + "grad_norm": 0.48588764667510986, + "learning_rate": 9.97706885905398e-05, + "loss": 1.8641, + "step": 838 + }, + { + "epoch": 0.04676439440387938, + "grad_norm": 0.529255211353302, + "learning_rate": 9.976983599474857e-05, + "loss": 1.8055, + "step": 839 + }, + { + "epoch": 0.046820132657042525, + "grad_norm": 0.4630698561668396, + "learning_rate": 9.976898182054832e-05, + "loss": 1.5263, + "step": 840 + }, + { + "epoch": 0.046875870910205675, + "grad_norm": 0.5334575176239014, + "learning_rate": 9.976812606796615e-05, + "loss": 1.7926, + "step": 841 + }, + { + "epoch": 0.04693160916336882, + "grad_norm": 0.49275916814804077, + "learning_rate": 9.976726873702918e-05, + "loss": 1.6341, + "step": 842 + }, + { + "epoch": 0.04698734741653197, + "grad_norm": 0.5276961326599121, + "learning_rate": 9.976640982776461e-05, + "loss": 1.882, + "step": 843 + }, + { + "epoch": 0.04704308566969511, + "grad_norm": 0.49929726123809814, + "learning_rate": 9.97655493401997e-05, + "loss": 1.6004, + "step": 844 + }, + { + "epoch": 0.04709882392285826, + "grad_norm": 0.4716168940067291, + "learning_rate": 9.97646872743617e-05, + "loss": 1.7355, + "step": 845 + }, + { + "epoch": 0.0471545621760214, + "grad_norm": 0.5293796062469482, + "learning_rate": 9.976382363027797e-05, + "loss": 1.9073, + "step": 846 + }, + { + "epoch": 0.04721030042918455, + "grad_norm": 0.47008490562438965, + "learning_rate": 9.976295840797589e-05, + "loss": 1.6875, + "step": 847 + }, + { + "epoch": 0.047266038682347696, + "grad_norm": 0.48457372188568115, + "learning_rate": 9.976209160748292e-05, + "loss": 1.6172, + "step": 848 + }, + { + "epoch": 0.04732177693551084, + "grad_norm": 0.500151515007019, + "learning_rate": 9.976122322882653e-05, + "loss": 1.6371, + "step": 849 + }, + { + "epoch": 0.04737751518867399, + "grad_norm": 0.5459775924682617, + "learning_rate": 9.976035327203427e-05, + "loss": 1.9283, + "step": 850 + }, + { + "epoch": 0.04743325344183713, + "grad_norm": 0.5352368950843811, + "learning_rate": 9.975948173713374e-05, + "loss": 2.0407, + "step": 851 + }, + { + "epoch": 0.04748899169500028, + "grad_norm": 0.5491572618484497, + "learning_rate": 9.975860862415254e-05, + "loss": 1.7475, + "step": 852 + }, + { + "epoch": 0.047544729948163424, + "grad_norm": 0.49011510610580444, + "learning_rate": 9.975773393311841e-05, + "loss": 1.7922, + "step": 853 + }, + { + "epoch": 0.04760046820132657, + "grad_norm": 0.5197030305862427, + "learning_rate": 9.975685766405906e-05, + "loss": 1.7012, + "step": 854 + }, + { + "epoch": 0.047656206454489716, + "grad_norm": 0.487704336643219, + "learning_rate": 9.975597981700228e-05, + "loss": 1.6647, + "step": 855 + }, + { + "epoch": 0.04771194470765286, + "grad_norm": 0.4743403196334839, + "learning_rate": 9.975510039197592e-05, + "loss": 1.5522, + "step": 856 + }, + { + "epoch": 0.04776768296081601, + "grad_norm": 0.46670085191726685, + "learning_rate": 9.975421938900789e-05, + "loss": 1.5235, + "step": 857 + }, + { + "epoch": 0.04782342121397915, + "grad_norm": 0.48920536041259766, + "learning_rate": 9.975333680812609e-05, + "loss": 1.8876, + "step": 858 + }, + { + "epoch": 0.0478791594671423, + "grad_norm": 0.5793198943138123, + "learning_rate": 9.975245264935852e-05, + "loss": 1.8422, + "step": 859 + }, + { + "epoch": 0.047934897720305444, + "grad_norm": 0.49111589789390564, + "learning_rate": 9.975156691273324e-05, + "loss": 1.7702, + "step": 860 + }, + { + "epoch": 0.047990635973468594, + "grad_norm": 0.5276595950126648, + "learning_rate": 9.975067959827833e-05, + "loss": 1.9332, + "step": 861 + }, + { + "epoch": 0.04804637422663174, + "grad_norm": 0.4866962134838104, + "learning_rate": 9.974979070602192e-05, + "loss": 1.7497, + "step": 862 + }, + { + "epoch": 0.04810211247979488, + "grad_norm": 0.5197125673294067, + "learning_rate": 9.974890023599222e-05, + "loss": 2.0405, + "step": 863 + }, + { + "epoch": 0.04815785073295803, + "grad_norm": 0.49782440066337585, + "learning_rate": 9.974800818821746e-05, + "loss": 1.7609, + "step": 864 + }, + { + "epoch": 0.04821358898612117, + "grad_norm": 0.52313232421875, + "learning_rate": 9.974711456272593e-05, + "loss": 1.9515, + "step": 865 + }, + { + "epoch": 0.04826932723928432, + "grad_norm": 0.4546637237071991, + "learning_rate": 9.974621935954597e-05, + "loss": 1.645, + "step": 866 + }, + { + "epoch": 0.048325065492447465, + "grad_norm": 0.47760143876075745, + "learning_rate": 9.974532257870596e-05, + "loss": 1.7104, + "step": 867 + }, + { + "epoch": 0.048380803745610615, + "grad_norm": 0.4868486225605011, + "learning_rate": 9.974442422023438e-05, + "loss": 1.8043, + "step": 868 + }, + { + "epoch": 0.04843654199877376, + "grad_norm": 0.5107572078704834, + "learning_rate": 9.974352428415968e-05, + "loss": 1.9662, + "step": 869 + }, + { + "epoch": 0.04849228025193691, + "grad_norm": 0.5269783139228821, + "learning_rate": 9.974262277051041e-05, + "loss": 1.8876, + "step": 870 + }, + { + "epoch": 0.04854801850510005, + "grad_norm": 0.48782503604888916, + "learning_rate": 9.974171967931519e-05, + "loss": 1.5996, + "step": 871 + }, + { + "epoch": 0.04860375675826319, + "grad_norm": 0.5057775974273682, + "learning_rate": 9.974081501060259e-05, + "loss": 1.6907, + "step": 872 + }, + { + "epoch": 0.04865949501142634, + "grad_norm": 0.4904307723045349, + "learning_rate": 9.973990876440138e-05, + "loss": 1.7377, + "step": 873 + }, + { + "epoch": 0.048715233264589486, + "grad_norm": 0.4725581407546997, + "learning_rate": 9.973900094074027e-05, + "loss": 1.8001, + "step": 874 + }, + { + "epoch": 0.048770971517752636, + "grad_norm": 0.527885913848877, + "learning_rate": 9.973809153964804e-05, + "loss": 1.8128, + "step": 875 + }, + { + "epoch": 0.04882670977091578, + "grad_norm": 0.5520697236061096, + "learning_rate": 9.973718056115354e-05, + "loss": 2.0648, + "step": 876 + }, + { + "epoch": 0.04888244802407893, + "grad_norm": 0.4812840223312378, + "learning_rate": 9.973626800528566e-05, + "loss": 1.8552, + "step": 877 + }, + { + "epoch": 0.04893818627724207, + "grad_norm": 0.46856966614723206, + "learning_rate": 9.973535387207333e-05, + "loss": 1.577, + "step": 878 + }, + { + "epoch": 0.048993924530405214, + "grad_norm": 0.4921995997428894, + "learning_rate": 9.973443816154557e-05, + "loss": 1.66, + "step": 879 + }, + { + "epoch": 0.049049662783568364, + "grad_norm": 0.5018383264541626, + "learning_rate": 9.97335208737314e-05, + "loss": 1.7623, + "step": 880 + }, + { + "epoch": 0.04910540103673151, + "grad_norm": 0.5345847010612488, + "learning_rate": 9.973260200865991e-05, + "loss": 1.8681, + "step": 881 + }, + { + "epoch": 0.049161139289894656, + "grad_norm": 0.5296522974967957, + "learning_rate": 9.973168156636025e-05, + "loss": 1.9225, + "step": 882 + }, + { + "epoch": 0.0492168775430578, + "grad_norm": 0.5303376317024231, + "learning_rate": 9.97307595468616e-05, + "loss": 1.8308, + "step": 883 + }, + { + "epoch": 0.04927261579622095, + "grad_norm": 0.45620301365852356, + "learning_rate": 9.97298359501932e-05, + "loss": 1.5791, + "step": 884 + }, + { + "epoch": 0.04932835404938409, + "grad_norm": 0.5314328074455261, + "learning_rate": 9.972891077638438e-05, + "loss": 1.7279, + "step": 885 + }, + { + "epoch": 0.049384092302547235, + "grad_norm": 0.4765213429927826, + "learning_rate": 9.972798402546441e-05, + "loss": 1.5131, + "step": 886 + }, + { + "epoch": 0.049439830555710385, + "grad_norm": 0.4913032054901123, + "learning_rate": 9.972705569746274e-05, + "loss": 1.6591, + "step": 887 + }, + { + "epoch": 0.04949556880887353, + "grad_norm": 0.48732152581214905, + "learning_rate": 9.972612579240881e-05, + "loss": 1.7141, + "step": 888 + }, + { + "epoch": 0.04955130706203668, + "grad_norm": 0.5283141732215881, + "learning_rate": 9.972519431033206e-05, + "loss": 1.8636, + "step": 889 + }, + { + "epoch": 0.04960704531519982, + "grad_norm": 0.5298954844474792, + "learning_rate": 9.972426125126209e-05, + "loss": 1.7943, + "step": 890 + }, + { + "epoch": 0.04966278356836297, + "grad_norm": 0.5104478597640991, + "learning_rate": 9.972332661522845e-05, + "loss": 1.6949, + "step": 891 + }, + { + "epoch": 0.04971852182152611, + "grad_norm": 0.5439249873161316, + "learning_rate": 9.972239040226082e-05, + "loss": 1.9313, + "step": 892 + }, + { + "epoch": 0.04977426007468926, + "grad_norm": 0.4874706566333771, + "learning_rate": 9.972145261238884e-05, + "loss": 1.8589, + "step": 893 + }, + { + "epoch": 0.049829998327852405, + "grad_norm": 0.5243585705757141, + "learning_rate": 9.972051324564229e-05, + "loss": 1.9736, + "step": 894 + }, + { + "epoch": 0.04988573658101555, + "grad_norm": 0.5669842958450317, + "learning_rate": 9.971957230205096e-05, + "loss": 2.1093, + "step": 895 + }, + { + "epoch": 0.0499414748341787, + "grad_norm": 0.4888775050640106, + "learning_rate": 9.971862978164466e-05, + "loss": 1.6786, + "step": 896 + }, + { + "epoch": 0.04999721308734184, + "grad_norm": 0.5279240608215332, + "learning_rate": 9.971768568445332e-05, + "loss": 1.8162, + "step": 897 + }, + { + "epoch": 0.05005295134050499, + "grad_norm": 0.4473552405834198, + "learning_rate": 9.971674001050686e-05, + "loss": 1.3044, + "step": 898 + }, + { + "epoch": 0.05010868959366813, + "grad_norm": 0.4724571704864502, + "learning_rate": 9.971579275983527e-05, + "loss": 1.7169, + "step": 899 + }, + { + "epoch": 0.05016442784683128, + "grad_norm": 0.4805344343185425, + "learning_rate": 9.971484393246861e-05, + "loss": 1.4898, + "step": 900 + }, + { + "epoch": 0.050220166099994426, + "grad_norm": 0.4852250814437866, + "learning_rate": 9.971389352843695e-05, + "loss": 1.6325, + "step": 901 + }, + { + "epoch": 0.05027590435315757, + "grad_norm": 0.49681854248046875, + "learning_rate": 9.971294154777044e-05, + "loss": 1.5962, + "step": 902 + }, + { + "epoch": 0.05033164260632072, + "grad_norm": 0.5085350871086121, + "learning_rate": 9.971198799049928e-05, + "loss": 1.8215, + "step": 903 + }, + { + "epoch": 0.05038738085948386, + "grad_norm": 0.49748629331588745, + "learning_rate": 9.971103285665369e-05, + "loss": 1.9647, + "step": 904 + }, + { + "epoch": 0.05044311911264701, + "grad_norm": 0.4835662543773651, + "learning_rate": 9.971007614626397e-05, + "loss": 1.6109, + "step": 905 + }, + { + "epoch": 0.050498857365810154, + "grad_norm": 0.5058585405349731, + "learning_rate": 9.970911785936047e-05, + "loss": 1.6419, + "step": 906 + }, + { + "epoch": 0.050554595618973304, + "grad_norm": 0.5386664271354675, + "learning_rate": 9.970815799597358e-05, + "loss": 1.6144, + "step": 907 + }, + { + "epoch": 0.05061033387213645, + "grad_norm": 0.5337561964988708, + "learning_rate": 9.970719655613373e-05, + "loss": 1.7978, + "step": 908 + }, + { + "epoch": 0.05066607212529959, + "grad_norm": 0.532317578792572, + "learning_rate": 9.970623353987141e-05, + "loss": 1.8175, + "step": 909 + }, + { + "epoch": 0.05072181037846274, + "grad_norm": 0.5630917549133301, + "learning_rate": 9.97052689472172e-05, + "loss": 2.043, + "step": 910 + }, + { + "epoch": 0.05077754863162588, + "grad_norm": 0.554322361946106, + "learning_rate": 9.970430277820165e-05, + "loss": 1.9165, + "step": 911 + }, + { + "epoch": 0.05083328688478903, + "grad_norm": 0.49685636162757874, + "learning_rate": 9.970333503285539e-05, + "loss": 1.8203, + "step": 912 + }, + { + "epoch": 0.050889025137952175, + "grad_norm": 0.5380950570106506, + "learning_rate": 9.970236571120915e-05, + "loss": 1.9429, + "step": 913 + }, + { + "epoch": 0.050944763391115325, + "grad_norm": 0.5279613733291626, + "learning_rate": 9.970139481329364e-05, + "loss": 2.0989, + "step": 914 + }, + { + "epoch": 0.05100050164427847, + "grad_norm": 0.509904682636261, + "learning_rate": 9.970042233913968e-05, + "loss": 1.7213, + "step": 915 + }, + { + "epoch": 0.05105623989744162, + "grad_norm": 0.48252367973327637, + "learning_rate": 9.96994482887781e-05, + "loss": 1.6979, + "step": 916 + }, + { + "epoch": 0.05111197815060476, + "grad_norm": 0.5245582461357117, + "learning_rate": 9.969847266223979e-05, + "loss": 1.7629, + "step": 917 + }, + { + "epoch": 0.0511677164037679, + "grad_norm": 0.48625627160072327, + "learning_rate": 9.969749545955567e-05, + "loss": 1.7208, + "step": 918 + }, + { + "epoch": 0.05122345465693105, + "grad_norm": 0.5168225169181824, + "learning_rate": 9.969651668075678e-05, + "loss": 1.952, + "step": 919 + }, + { + "epoch": 0.051279192910094196, + "grad_norm": 0.47759923338890076, + "learning_rate": 9.969553632587409e-05, + "loss": 1.6574, + "step": 920 + }, + { + "epoch": 0.051334931163257345, + "grad_norm": 0.49498680233955383, + "learning_rate": 9.969455439493877e-05, + "loss": 1.6173, + "step": 921 + }, + { + "epoch": 0.05139066941642049, + "grad_norm": 0.48092684149742126, + "learning_rate": 9.96935708879819e-05, + "loss": 1.6471, + "step": 922 + }, + { + "epoch": 0.05144640766958364, + "grad_norm": 0.5342095494270325, + "learning_rate": 9.969258580503471e-05, + "loss": 2.0134, + "step": 923 + }, + { + "epoch": 0.05150214592274678, + "grad_norm": 0.58601975440979, + "learning_rate": 9.969159914612843e-05, + "loss": 2.1658, + "step": 924 + }, + { + "epoch": 0.051557884175909924, + "grad_norm": 0.4867340922355652, + "learning_rate": 9.969061091129433e-05, + "loss": 1.9766, + "step": 925 + }, + { + "epoch": 0.051613622429073074, + "grad_norm": 0.4857270121574402, + "learning_rate": 9.968962110056379e-05, + "loss": 1.678, + "step": 926 + }, + { + "epoch": 0.051669360682236216, + "grad_norm": 0.5170820355415344, + "learning_rate": 9.968862971396816e-05, + "loss": 1.8249, + "step": 927 + }, + { + "epoch": 0.051725098935399366, + "grad_norm": 0.4657866358757019, + "learning_rate": 9.96876367515389e-05, + "loss": 1.7606, + "step": 928 + }, + { + "epoch": 0.05178083718856251, + "grad_norm": 0.5119996666908264, + "learning_rate": 9.968664221330751e-05, + "loss": 1.8612, + "step": 929 + }, + { + "epoch": 0.05183657544172566, + "grad_norm": 0.5372640490531921, + "learning_rate": 9.968564609930553e-05, + "loss": 1.8672, + "step": 930 + }, + { + "epoch": 0.0518923136948888, + "grad_norm": 0.49778059124946594, + "learning_rate": 9.968464840956453e-05, + "loss": 1.766, + "step": 931 + }, + { + "epoch": 0.05194805194805195, + "grad_norm": 0.5260003805160522, + "learning_rate": 9.968364914411616e-05, + "loss": 1.8631, + "step": 932 + }, + { + "epoch": 0.052003790201215094, + "grad_norm": 0.5278846621513367, + "learning_rate": 9.968264830299213e-05, + "loss": 1.5441, + "step": 933 + }, + { + "epoch": 0.05205952845437824, + "grad_norm": 0.5427425503730774, + "learning_rate": 9.968164588622415e-05, + "loss": 1.7751, + "step": 934 + }, + { + "epoch": 0.05211526670754139, + "grad_norm": 0.4653323292732239, + "learning_rate": 9.968064189384403e-05, + "loss": 1.662, + "step": 935 + }, + { + "epoch": 0.05217100496070453, + "grad_norm": 0.5192728638648987, + "learning_rate": 9.967963632588362e-05, + "loss": 1.7384, + "step": 936 + }, + { + "epoch": 0.05222674321386768, + "grad_norm": 0.4995409846305847, + "learning_rate": 9.96786291823748e-05, + "loss": 1.8133, + "step": 937 + }, + { + "epoch": 0.05228248146703082, + "grad_norm": 0.5626217722892761, + "learning_rate": 9.96776204633495e-05, + "loss": 1.8851, + "step": 938 + }, + { + "epoch": 0.05233821972019397, + "grad_norm": 0.5185354351997375, + "learning_rate": 9.967661016883972e-05, + "loss": 1.6583, + "step": 939 + }, + { + "epoch": 0.052393957973357115, + "grad_norm": 0.5034851431846619, + "learning_rate": 9.967559829887749e-05, + "loss": 1.6385, + "step": 940 + }, + { + "epoch": 0.05244969622652026, + "grad_norm": 0.4795439541339874, + "learning_rate": 9.967458485349492e-05, + "loss": 1.6901, + "step": 941 + }, + { + "epoch": 0.05250543447968341, + "grad_norm": 0.6365668177604675, + "learning_rate": 9.967356983272414e-05, + "loss": 1.9757, + "step": 942 + }, + { + "epoch": 0.05256117273284655, + "grad_norm": 0.48566654324531555, + "learning_rate": 9.967255323659734e-05, + "loss": 1.6266, + "step": 943 + }, + { + "epoch": 0.0526169109860097, + "grad_norm": 0.4971524775028229, + "learning_rate": 9.967153506514677e-05, + "loss": 1.6938, + "step": 944 + }, + { + "epoch": 0.05267264923917284, + "grad_norm": 0.5263299345970154, + "learning_rate": 9.967051531840471e-05, + "loss": 1.8448, + "step": 945 + }, + { + "epoch": 0.05272838749233599, + "grad_norm": 0.4903882145881653, + "learning_rate": 9.96694939964035e-05, + "loss": 1.5313, + "step": 946 + }, + { + "epoch": 0.052784125745499136, + "grad_norm": 0.5515956878662109, + "learning_rate": 9.966847109917555e-05, + "loss": 1.9398, + "step": 947 + }, + { + "epoch": 0.05283986399866228, + "grad_norm": 0.47069814801216125, + "learning_rate": 9.966744662675326e-05, + "loss": 1.8052, + "step": 948 + }, + { + "epoch": 0.05289560225182543, + "grad_norm": 0.4904758036136627, + "learning_rate": 9.966642057916915e-05, + "loss": 1.7875, + "step": 949 + }, + { + "epoch": 0.05295134050498857, + "grad_norm": 0.5010367035865784, + "learning_rate": 9.966539295645576e-05, + "loss": 1.6786, + "step": 950 + }, + { + "epoch": 0.05300707875815172, + "grad_norm": 0.4812747538089752, + "learning_rate": 9.966436375864567e-05, + "loss": 1.473, + "step": 951 + }, + { + "epoch": 0.053062817011314864, + "grad_norm": 0.5010087490081787, + "learning_rate": 9.966333298577154e-05, + "loss": 1.7648, + "step": 952 + }, + { + "epoch": 0.053118555264478014, + "grad_norm": 0.5247920155525208, + "learning_rate": 9.966230063786602e-05, + "loss": 1.6435, + "step": 953 + }, + { + "epoch": 0.05317429351764116, + "grad_norm": 0.5183125734329224, + "learning_rate": 9.96612667149619e-05, + "loss": 1.762, + "step": 954 + }, + { + "epoch": 0.053230031770804306, + "grad_norm": 0.5197505950927734, + "learning_rate": 9.966023121709192e-05, + "loss": 1.8957, + "step": 955 + }, + { + "epoch": 0.05328577002396745, + "grad_norm": 0.4871842563152313, + "learning_rate": 9.965919414428896e-05, + "loss": 1.8783, + "step": 956 + }, + { + "epoch": 0.05334150827713059, + "grad_norm": 0.4965290427207947, + "learning_rate": 9.965815549658589e-05, + "loss": 1.8575, + "step": 957 + }, + { + "epoch": 0.05339724653029374, + "grad_norm": 0.5005083680152893, + "learning_rate": 9.965711527401567e-05, + "loss": 1.7704, + "step": 958 + }, + { + "epoch": 0.053452984783456885, + "grad_norm": 0.4561206102371216, + "learning_rate": 9.965607347661125e-05, + "loss": 1.6103, + "step": 959 + }, + { + "epoch": 0.053508723036620034, + "grad_norm": 0.5352826714515686, + "learning_rate": 9.965503010440571e-05, + "loss": 1.9864, + "step": 960 + }, + { + "epoch": 0.05356446128978318, + "grad_norm": 0.4568333327770233, + "learning_rate": 9.965398515743212e-05, + "loss": 1.7264, + "step": 961 + }, + { + "epoch": 0.05362019954294633, + "grad_norm": 0.5570031404495239, + "learning_rate": 9.965293863572363e-05, + "loss": 2.2176, + "step": 962 + }, + { + "epoch": 0.05367593779610947, + "grad_norm": 0.5380359888076782, + "learning_rate": 9.96518905393134e-05, + "loss": 2.0434, + "step": 963 + }, + { + "epoch": 0.05373167604927261, + "grad_norm": 0.46430766582489014, + "learning_rate": 9.965084086823472e-05, + "loss": 1.4151, + "step": 964 + }, + { + "epoch": 0.05378741430243576, + "grad_norm": 0.4653235077857971, + "learning_rate": 9.964978962252085e-05, + "loss": 1.7144, + "step": 965 + }, + { + "epoch": 0.053843152555598905, + "grad_norm": 0.49018028378486633, + "learning_rate": 9.964873680220512e-05, + "loss": 1.6531, + "step": 966 + }, + { + "epoch": 0.053898890808762055, + "grad_norm": 0.5718449354171753, + "learning_rate": 9.964768240732093e-05, + "loss": 1.9851, + "step": 967 + }, + { + "epoch": 0.0539546290619252, + "grad_norm": 0.5048679113388062, + "learning_rate": 9.964662643790173e-05, + "loss": 1.9137, + "step": 968 + }, + { + "epoch": 0.05401036731508835, + "grad_norm": 0.5291681885719299, + "learning_rate": 9.9645568893981e-05, + "loss": 1.8972, + "step": 969 + }, + { + "epoch": 0.05406610556825149, + "grad_norm": 0.5041894316673279, + "learning_rate": 9.964450977559226e-05, + "loss": 1.5612, + "step": 970 + }, + { + "epoch": 0.054121843821414634, + "grad_norm": 0.561788022518158, + "learning_rate": 9.964344908276914e-05, + "loss": 2.0708, + "step": 971 + }, + { + "epoch": 0.05417758207457778, + "grad_norm": 0.4838697016239166, + "learning_rate": 9.964238681554524e-05, + "loss": 1.6573, + "step": 972 + }, + { + "epoch": 0.054233320327740926, + "grad_norm": 0.5092923641204834, + "learning_rate": 9.964132297395428e-05, + "loss": 1.918, + "step": 973 + }, + { + "epoch": 0.054289058580904076, + "grad_norm": 0.5128215551376343, + "learning_rate": 9.964025755802997e-05, + "loss": 1.721, + "step": 974 + }, + { + "epoch": 0.05434479683406722, + "grad_norm": 0.597062885761261, + "learning_rate": 9.963919056780612e-05, + "loss": 1.9453, + "step": 975 + }, + { + "epoch": 0.05440053508723037, + "grad_norm": 0.5623565316200256, + "learning_rate": 9.963812200331656e-05, + "loss": 1.9271, + "step": 976 + }, + { + "epoch": 0.05445627334039351, + "grad_norm": 0.4568030834197998, + "learning_rate": 9.963705186459517e-05, + "loss": 1.5766, + "step": 977 + }, + { + "epoch": 0.05451201159355666, + "grad_norm": 0.4906899631023407, + "learning_rate": 9.963598015167592e-05, + "loss": 1.7721, + "step": 978 + }, + { + "epoch": 0.054567749846719804, + "grad_norm": 0.5041657090187073, + "learning_rate": 9.963490686459277e-05, + "loss": 1.6293, + "step": 979 + }, + { + "epoch": 0.05462348809988295, + "grad_norm": 0.533762514591217, + "learning_rate": 9.963383200337977e-05, + "loss": 1.8723, + "step": 980 + }, + { + "epoch": 0.0546792263530461, + "grad_norm": 0.4968359172344208, + "learning_rate": 9.963275556807098e-05, + "loss": 1.7368, + "step": 981 + }, + { + "epoch": 0.05473496460620924, + "grad_norm": 0.4822302758693695, + "learning_rate": 9.963167755870059e-05, + "loss": 1.4994, + "step": 982 + }, + { + "epoch": 0.05479070285937239, + "grad_norm": 0.5066803097724915, + "learning_rate": 9.963059797530274e-05, + "loss": 1.8058, + "step": 983 + }, + { + "epoch": 0.05484644111253553, + "grad_norm": 0.518132209777832, + "learning_rate": 9.96295168179117e-05, + "loss": 1.7393, + "step": 984 + }, + { + "epoch": 0.05490217936569868, + "grad_norm": 0.5607625842094421, + "learning_rate": 9.962843408656176e-05, + "loss": 2.149, + "step": 985 + }, + { + "epoch": 0.054957917618861825, + "grad_norm": 0.5685406923294067, + "learning_rate": 9.962734978128723e-05, + "loss": 2.1734, + "step": 986 + }, + { + "epoch": 0.05501365587202497, + "grad_norm": 0.5319599509239197, + "learning_rate": 9.962626390212251e-05, + "loss": 1.8782, + "step": 987 + }, + { + "epoch": 0.05506939412518812, + "grad_norm": 0.4679426848888397, + "learning_rate": 9.962517644910204e-05, + "loss": 1.7033, + "step": 988 + }, + { + "epoch": 0.05512513237835126, + "grad_norm": 0.5416939854621887, + "learning_rate": 9.962408742226032e-05, + "loss": 1.969, + "step": 989 + }, + { + "epoch": 0.05518087063151441, + "grad_norm": 0.49005210399627686, + "learning_rate": 9.962299682163186e-05, + "loss": 1.8229, + "step": 990 + }, + { + "epoch": 0.05523660888467755, + "grad_norm": 0.5170348286628723, + "learning_rate": 9.962190464725128e-05, + "loss": 1.8161, + "step": 991 + }, + { + "epoch": 0.0552923471378407, + "grad_norm": 0.5188906192779541, + "learning_rate": 9.962081089915319e-05, + "loss": 1.938, + "step": 992 + }, + { + "epoch": 0.055348085391003846, + "grad_norm": 0.4945777952671051, + "learning_rate": 9.961971557737227e-05, + "loss": 1.7414, + "step": 993 + }, + { + "epoch": 0.05540382364416699, + "grad_norm": 0.511976420879364, + "learning_rate": 9.96186186819433e-05, + "loss": 1.8595, + "step": 994 + }, + { + "epoch": 0.05545956189733014, + "grad_norm": 0.5381083488464355, + "learning_rate": 9.961752021290103e-05, + "loss": 1.8233, + "step": 995 + }, + { + "epoch": 0.05551530015049328, + "grad_norm": 0.4679305851459503, + "learning_rate": 9.961642017028033e-05, + "loss": 1.6666, + "step": 996 + }, + { + "epoch": 0.05557103840365643, + "grad_norm": 0.5513458847999573, + "learning_rate": 9.961531855411603e-05, + "loss": 2.0589, + "step": 997 + }, + { + "epoch": 0.055626776656819574, + "grad_norm": 0.5168341994285583, + "learning_rate": 9.961421536444313e-05, + "loss": 2.0774, + "step": 998 + }, + { + "epoch": 0.055682514909982724, + "grad_norm": 0.5111126899719238, + "learning_rate": 9.961311060129659e-05, + "loss": 1.5936, + "step": 999 + }, + { + "epoch": 0.055738253163145866, + "grad_norm": 0.5352098941802979, + "learning_rate": 9.961200426471142e-05, + "loss": 1.8414, + "step": 1000 + }, + { + "epoch": 0.055793991416309016, + "grad_norm": 0.47616758942604065, + "learning_rate": 9.961089635472276e-05, + "loss": 1.6496, + "step": 1001 + }, + { + "epoch": 0.05584972966947216, + "grad_norm": 0.4767918288707733, + "learning_rate": 9.96097868713657e-05, + "loss": 1.3193, + "step": 1002 + }, + { + "epoch": 0.0559054679226353, + "grad_norm": 0.46608811616897583, + "learning_rate": 9.960867581467546e-05, + "loss": 1.6453, + "step": 1003 + }, + { + "epoch": 0.05596120617579845, + "grad_norm": 0.5042111277580261, + "learning_rate": 9.960756318468726e-05, + "loss": 1.8798, + "step": 1004 + }, + { + "epoch": 0.056016944428961594, + "grad_norm": 0.5502855777740479, + "learning_rate": 9.960644898143639e-05, + "loss": 1.9322, + "step": 1005 + }, + { + "epoch": 0.056072682682124744, + "grad_norm": 0.4749864935874939, + "learning_rate": 9.960533320495818e-05, + "loss": 1.5659, + "step": 1006 + }, + { + "epoch": 0.05612842093528789, + "grad_norm": 0.4787498712539673, + "learning_rate": 9.960421585528802e-05, + "loss": 1.8482, + "step": 1007 + }, + { + "epoch": 0.05618415918845104, + "grad_norm": 0.578971266746521, + "learning_rate": 9.960309693246135e-05, + "loss": 1.9905, + "step": 1008 + }, + { + "epoch": 0.05623989744161418, + "grad_norm": 0.4983009099960327, + "learning_rate": 9.960197643651363e-05, + "loss": 1.722, + "step": 1009 + }, + { + "epoch": 0.05629563569477732, + "grad_norm": 0.5528213977813721, + "learning_rate": 9.960085436748044e-05, + "loss": 1.8293, + "step": 1010 + }, + { + "epoch": 0.05635137394794047, + "grad_norm": 0.49824774265289307, + "learning_rate": 9.959973072539734e-05, + "loss": 1.8081, + "step": 1011 + }, + { + "epoch": 0.056407112201103615, + "grad_norm": 0.49810606241226196, + "learning_rate": 9.959860551029996e-05, + "loss": 1.5834, + "step": 1012 + }, + { + "epoch": 0.056462850454266765, + "grad_norm": 0.515215277671814, + "learning_rate": 9.9597478722224e-05, + "loss": 1.8318, + "step": 1013 + }, + { + "epoch": 0.05651858870742991, + "grad_norm": 0.5139912962913513, + "learning_rate": 9.959635036120518e-05, + "loss": 1.7475, + "step": 1014 + }, + { + "epoch": 0.05657432696059306, + "grad_norm": 0.4912470579147339, + "learning_rate": 9.959522042727932e-05, + "loss": 1.6809, + "step": 1015 + }, + { + "epoch": 0.0566300652137562, + "grad_norm": 0.4990215003490448, + "learning_rate": 9.959408892048219e-05, + "loss": 1.7024, + "step": 1016 + }, + { + "epoch": 0.05668580346691934, + "grad_norm": 0.5626692771911621, + "learning_rate": 9.959295584084974e-05, + "loss": 1.9791, + "step": 1017 + }, + { + "epoch": 0.05674154172008249, + "grad_norm": 0.4737264811992645, + "learning_rate": 9.959182118841786e-05, + "loss": 1.5592, + "step": 1018 + }, + { + "epoch": 0.056797279973245636, + "grad_norm": 0.5367196798324585, + "learning_rate": 9.959068496322256e-05, + "loss": 2.012, + "step": 1019 + }, + { + "epoch": 0.056853018226408786, + "grad_norm": 0.5062724947929382, + "learning_rate": 9.958954716529987e-05, + "loss": 1.6301, + "step": 1020 + }, + { + "epoch": 0.05690875647957193, + "grad_norm": 0.5419873595237732, + "learning_rate": 9.958840779468586e-05, + "loss": 1.8351, + "step": 1021 + }, + { + "epoch": 0.05696449473273508, + "grad_norm": 0.5291727781295776, + "learning_rate": 9.958726685141668e-05, + "loss": 1.8221, + "step": 1022 + }, + { + "epoch": 0.05702023298589822, + "grad_norm": 0.5285983085632324, + "learning_rate": 9.958612433552852e-05, + "loss": 1.8575, + "step": 1023 + }, + { + "epoch": 0.05707597123906137, + "grad_norm": 0.49050652980804443, + "learning_rate": 9.95849802470576e-05, + "loss": 1.7646, + "step": 1024 + }, + { + "epoch": 0.057131709492224514, + "grad_norm": 0.49379006028175354, + "learning_rate": 9.95838345860402e-05, + "loss": 1.6789, + "step": 1025 + }, + { + "epoch": 0.05718744774538766, + "grad_norm": 0.4859938621520996, + "learning_rate": 9.958268735251266e-05, + "loss": 1.8542, + "step": 1026 + }, + { + "epoch": 0.057243185998550807, + "grad_norm": 0.5445101857185364, + "learning_rate": 9.958153854651136e-05, + "loss": 1.819, + "step": 1027 + }, + { + "epoch": 0.05729892425171395, + "grad_norm": 0.5075321197509766, + "learning_rate": 9.958038816807276e-05, + "loss": 1.7872, + "step": 1028 + }, + { + "epoch": 0.0573546625048771, + "grad_norm": 0.4982723593711853, + "learning_rate": 9.957923621723329e-05, + "loss": 1.8243, + "step": 1029 + }, + { + "epoch": 0.05741040075804024, + "grad_norm": 0.49452096223831177, + "learning_rate": 9.957808269402954e-05, + "loss": 1.7316, + "step": 1030 + }, + { + "epoch": 0.05746613901120339, + "grad_norm": 0.5450426936149597, + "learning_rate": 9.957692759849806e-05, + "loss": 2.0758, + "step": 1031 + }, + { + "epoch": 0.057521877264366535, + "grad_norm": 0.5058251023292542, + "learning_rate": 9.957577093067548e-05, + "loss": 1.6588, + "step": 1032 + }, + { + "epoch": 0.05757761551752968, + "grad_norm": 0.4902496039867401, + "learning_rate": 9.957461269059851e-05, + "loss": 1.8477, + "step": 1033 + }, + { + "epoch": 0.05763335377069283, + "grad_norm": 0.5185796618461609, + "learning_rate": 9.957345287830386e-05, + "loss": 1.7541, + "step": 1034 + }, + { + "epoch": 0.05768909202385597, + "grad_norm": 0.5609437227249146, + "learning_rate": 9.95722914938283e-05, + "loss": 1.8738, + "step": 1035 + }, + { + "epoch": 0.05774483027701912, + "grad_norm": 0.47249266505241394, + "learning_rate": 9.957112853720871e-05, + "loss": 1.6668, + "step": 1036 + }, + { + "epoch": 0.05780056853018226, + "grad_norm": 0.4762544333934784, + "learning_rate": 9.956996400848191e-05, + "loss": 1.5023, + "step": 1037 + }, + { + "epoch": 0.05785630678334541, + "grad_norm": 0.5092499852180481, + "learning_rate": 9.956879790768489e-05, + "loss": 1.7614, + "step": 1038 + }, + { + "epoch": 0.057912045036508555, + "grad_norm": 0.4864351451396942, + "learning_rate": 9.95676302348546e-05, + "loss": 1.7874, + "step": 1039 + }, + { + "epoch": 0.0579677832896717, + "grad_norm": 0.5312706828117371, + "learning_rate": 9.956646099002807e-05, + "loss": 1.7864, + "step": 1040 + }, + { + "epoch": 0.05802352154283485, + "grad_norm": 0.5099919438362122, + "learning_rate": 9.95652901732424e-05, + "loss": 1.9396, + "step": 1041 + }, + { + "epoch": 0.05807925979599799, + "grad_norm": 0.4992043375968933, + "learning_rate": 9.95641177845347e-05, + "loss": 1.8373, + "step": 1042 + }, + { + "epoch": 0.05813499804916114, + "grad_norm": 0.557106614112854, + "learning_rate": 9.956294382394218e-05, + "loss": 2.0565, + "step": 1043 + }, + { + "epoch": 0.058190736302324284, + "grad_norm": 0.5183643102645874, + "learning_rate": 9.956176829150204e-05, + "loss": 1.837, + "step": 1044 + }, + { + "epoch": 0.05824647455548743, + "grad_norm": 0.4911157488822937, + "learning_rate": 9.956059118725158e-05, + "loss": 1.736, + "step": 1045 + }, + { + "epoch": 0.058302212808650576, + "grad_norm": 0.524387538433075, + "learning_rate": 9.955941251122812e-05, + "loss": 1.9561, + "step": 1046 + }, + { + "epoch": 0.058357951061813726, + "grad_norm": 0.4891200065612793, + "learning_rate": 9.955823226346905e-05, + "loss": 1.723, + "step": 1047 + }, + { + "epoch": 0.05841368931497687, + "grad_norm": 0.5014610886573792, + "learning_rate": 9.95570504440118e-05, + "loss": 1.6632, + "step": 1048 + }, + { + "epoch": 0.05846942756814001, + "grad_norm": 0.46674925088882446, + "learning_rate": 9.955586705289386e-05, + "loss": 1.5877, + "step": 1049 + }, + { + "epoch": 0.05852516582130316, + "grad_norm": 0.5613251328468323, + "learning_rate": 9.955468209015273e-05, + "loss": 2.0043, + "step": 1050 + }, + { + "epoch": 0.058580904074466304, + "grad_norm": 0.49603840708732605, + "learning_rate": 9.9553495555826e-05, + "loss": 1.7604, + "step": 1051 + }, + { + "epoch": 0.058636642327629454, + "grad_norm": 0.5199983716011047, + "learning_rate": 9.955230744995132e-05, + "loss": 1.8945, + "step": 1052 + }, + { + "epoch": 0.0586923805807926, + "grad_norm": 0.5177999138832092, + "learning_rate": 9.955111777256635e-05, + "loss": 1.9154, + "step": 1053 + }, + { + "epoch": 0.05874811883395575, + "grad_norm": 0.49996909499168396, + "learning_rate": 9.954992652370885e-05, + "loss": 1.6888, + "step": 1054 + }, + { + "epoch": 0.05880385708711889, + "grad_norm": 0.5143979787826538, + "learning_rate": 9.954873370341656e-05, + "loss": 1.7544, + "step": 1055 + }, + { + "epoch": 0.05885959534028203, + "grad_norm": 0.498963862657547, + "learning_rate": 9.954753931172733e-05, + "loss": 1.9448, + "step": 1056 + }, + { + "epoch": 0.05891533359344518, + "grad_norm": 0.5648823976516724, + "learning_rate": 9.954634334867902e-05, + "loss": 2.0281, + "step": 1057 + }, + { + "epoch": 0.058971071846608325, + "grad_norm": 0.4741098880767822, + "learning_rate": 9.95451458143096e-05, + "loss": 1.7383, + "step": 1058 + }, + { + "epoch": 0.059026810099771475, + "grad_norm": 0.5303511023521423, + "learning_rate": 9.9543946708657e-05, + "loss": 1.9047, + "step": 1059 + }, + { + "epoch": 0.05908254835293462, + "grad_norm": 0.6070243716239929, + "learning_rate": 9.95427460317593e-05, + "loss": 2.1998, + "step": 1060 + }, + { + "epoch": 0.05913828660609777, + "grad_norm": 0.509857177734375, + "learning_rate": 9.954154378365453e-05, + "loss": 1.9788, + "step": 1061 + }, + { + "epoch": 0.05919402485926091, + "grad_norm": 0.4909118711948395, + "learning_rate": 9.954033996438084e-05, + "loss": 1.7906, + "step": 1062 + }, + { + "epoch": 0.05924976311242406, + "grad_norm": 0.5275348424911499, + "learning_rate": 9.95391345739764e-05, + "loss": 1.9644, + "step": 1063 + }, + { + "epoch": 0.0593055013655872, + "grad_norm": 0.5134482979774475, + "learning_rate": 9.953792761247946e-05, + "loss": 1.7528, + "step": 1064 + }, + { + "epoch": 0.059361239618750346, + "grad_norm": 0.4846155345439911, + "learning_rate": 9.953671907992827e-05, + "loss": 1.7198, + "step": 1065 + }, + { + "epoch": 0.059416977871913496, + "grad_norm": 0.508575975894928, + "learning_rate": 9.953550897636117e-05, + "loss": 1.8502, + "step": 1066 + }, + { + "epoch": 0.05947271612507664, + "grad_norm": 0.6168702244758606, + "learning_rate": 9.953429730181653e-05, + "loss": 1.8859, + "step": 1067 + }, + { + "epoch": 0.05952845437823979, + "grad_norm": 0.5224670767784119, + "learning_rate": 9.953308405633281e-05, + "loss": 1.9667, + "step": 1068 + }, + { + "epoch": 0.05958419263140293, + "grad_norm": 0.5521063208580017, + "learning_rate": 9.953186923994845e-05, + "loss": 1.9502, + "step": 1069 + }, + { + "epoch": 0.05963993088456608, + "grad_norm": 0.5243295431137085, + "learning_rate": 9.953065285270198e-05, + "loss": 1.7872, + "step": 1070 + }, + { + "epoch": 0.059695669137729224, + "grad_norm": 0.457383394241333, + "learning_rate": 9.952943489463199e-05, + "loss": 1.4861, + "step": 1071 + }, + { + "epoch": 0.059751407390892367, + "grad_norm": 0.5042887330055237, + "learning_rate": 9.95282153657771e-05, + "loss": 1.8046, + "step": 1072 + }, + { + "epoch": 0.059807145644055516, + "grad_norm": 0.5393437147140503, + "learning_rate": 9.9526994266176e-05, + "loss": 2.0209, + "step": 1073 + }, + { + "epoch": 0.05986288389721866, + "grad_norm": 0.5133099555969238, + "learning_rate": 9.952577159586739e-05, + "loss": 2.0277, + "step": 1074 + }, + { + "epoch": 0.05991862215038181, + "grad_norm": 0.538661539554596, + "learning_rate": 9.952454735489007e-05, + "loss": 1.9108, + "step": 1075 + }, + { + "epoch": 0.05997436040354495, + "grad_norm": 0.5276675224304199, + "learning_rate": 9.952332154328286e-05, + "loss": 2.0656, + "step": 1076 + }, + { + "epoch": 0.0600300986567081, + "grad_norm": 0.5048499703407288, + "learning_rate": 9.952209416108461e-05, + "loss": 1.757, + "step": 1077 + }, + { + "epoch": 0.060085836909871244, + "grad_norm": 0.5175162553787231, + "learning_rate": 9.952086520833428e-05, + "loss": 1.7967, + "step": 1078 + }, + { + "epoch": 0.06014157516303439, + "grad_norm": 0.5084596276283264, + "learning_rate": 9.951963468507084e-05, + "loss": 1.705, + "step": 1079 + }, + { + "epoch": 0.06019731341619754, + "grad_norm": 0.45831501483917236, + "learning_rate": 9.95184025913333e-05, + "loss": 1.6394, + "step": 1080 + }, + { + "epoch": 0.06025305166936068, + "grad_norm": 0.47496846318244934, + "learning_rate": 9.951716892716074e-05, + "loss": 1.5622, + "step": 1081 + }, + { + "epoch": 0.06030878992252383, + "grad_norm": 0.5142143964767456, + "learning_rate": 9.951593369259229e-05, + "loss": 1.943, + "step": 1082 + }, + { + "epoch": 0.06036452817568697, + "grad_norm": 0.4750124216079712, + "learning_rate": 9.951469688766712e-05, + "loss": 1.7855, + "step": 1083 + }, + { + "epoch": 0.06042026642885012, + "grad_norm": 0.5169959664344788, + "learning_rate": 9.951345851242445e-05, + "loss": 1.8589, + "step": 1084 + }, + { + "epoch": 0.060476004682013265, + "grad_norm": 0.4891696572303772, + "learning_rate": 9.951221856690355e-05, + "loss": 1.8431, + "step": 1085 + }, + { + "epoch": 0.060531742935176415, + "grad_norm": 0.49664726853370667, + "learning_rate": 9.951097705114378e-05, + "loss": 1.8495, + "step": 1086 + }, + { + "epoch": 0.06058748118833956, + "grad_norm": 0.4737338423728943, + "learning_rate": 9.950973396518449e-05, + "loss": 1.6244, + "step": 1087 + }, + { + "epoch": 0.0606432194415027, + "grad_norm": 0.4466894865036011, + "learning_rate": 9.950848930906506e-05, + "loss": 1.569, + "step": 1088 + }, + { + "epoch": 0.06069895769466585, + "grad_norm": 0.5531814694404602, + "learning_rate": 9.950724308282504e-05, + "loss": 1.8739, + "step": 1089 + }, + { + "epoch": 0.06075469594782899, + "grad_norm": 0.5358182191848755, + "learning_rate": 9.95059952865039e-05, + "loss": 1.5985, + "step": 1090 + }, + { + "epoch": 0.06081043420099214, + "grad_norm": 0.5551037788391113, + "learning_rate": 9.950474592014123e-05, + "loss": 1.9313, + "step": 1091 + }, + { + "epoch": 0.060866172454155286, + "grad_norm": 0.46842116117477417, + "learning_rate": 9.950349498377666e-05, + "loss": 1.5846, + "step": 1092 + }, + { + "epoch": 0.060921910707318436, + "grad_norm": 0.5490810871124268, + "learning_rate": 9.950224247744986e-05, + "loss": 1.7246, + "step": 1093 + }, + { + "epoch": 0.06097764896048158, + "grad_norm": 0.46604838967323303, + "learning_rate": 9.950098840120055e-05, + "loss": 1.3499, + "step": 1094 + }, + { + "epoch": 0.06103338721364472, + "grad_norm": 0.4957679808139801, + "learning_rate": 9.949973275506847e-05, + "loss": 1.7099, + "step": 1095 + }, + { + "epoch": 0.06108912546680787, + "grad_norm": 0.5058358907699585, + "learning_rate": 9.94984755390935e-05, + "loss": 2.0376, + "step": 1096 + }, + { + "epoch": 0.061144863719971014, + "grad_norm": 0.5344205498695374, + "learning_rate": 9.949721675331546e-05, + "loss": 1.8721, + "step": 1097 + }, + { + "epoch": 0.061200601973134164, + "grad_norm": 0.5005959272384644, + "learning_rate": 9.94959563977743e-05, + "loss": 1.8502, + "step": 1098 + }, + { + "epoch": 0.06125634022629731, + "grad_norm": 0.5033101439476013, + "learning_rate": 9.949469447250998e-05, + "loss": 1.762, + "step": 1099 + }, + { + "epoch": 0.061312078479460456, + "grad_norm": 0.489114373922348, + "learning_rate": 9.949343097756253e-05, + "loss": 1.779, + "step": 1100 + }, + { + "epoch": 0.0613678167326236, + "grad_norm": 0.49902451038360596, + "learning_rate": 9.949216591297203e-05, + "loss": 1.6705, + "step": 1101 + }, + { + "epoch": 0.06142355498578674, + "grad_norm": 0.5019201636314392, + "learning_rate": 9.949089927877858e-05, + "loss": 1.6734, + "step": 1102 + }, + { + "epoch": 0.06147929323894989, + "grad_norm": 0.5644415020942688, + "learning_rate": 9.948963107502235e-05, + "loss": 2.0193, + "step": 1103 + }, + { + "epoch": 0.061535031492113035, + "grad_norm": 0.55086749792099, + "learning_rate": 9.948836130174358e-05, + "loss": 1.9377, + "step": 1104 + }, + { + "epoch": 0.061590769745276185, + "grad_norm": 0.48262813687324524, + "learning_rate": 9.94870899589825e-05, + "loss": 1.6455, + "step": 1105 + }, + { + "epoch": 0.06164650799843933, + "grad_norm": 0.5041834115982056, + "learning_rate": 9.948581704677949e-05, + "loss": 1.9186, + "step": 1106 + }, + { + "epoch": 0.06170224625160248, + "grad_norm": 0.5112140774726868, + "learning_rate": 9.948454256517486e-05, + "loss": 1.9353, + "step": 1107 + }, + { + "epoch": 0.06175798450476562, + "grad_norm": 0.5558189749717712, + "learning_rate": 9.948326651420907e-05, + "loss": 1.6834, + "step": 1108 + }, + { + "epoch": 0.06181372275792877, + "grad_norm": 0.5652199983596802, + "learning_rate": 9.948198889392255e-05, + "loss": 1.8998, + "step": 1109 + }, + { + "epoch": 0.06186946101109191, + "grad_norm": 0.5617989301681519, + "learning_rate": 9.948070970435587e-05, + "loss": 2.1707, + "step": 1110 + }, + { + "epoch": 0.061925199264255056, + "grad_norm": 0.5738351941108704, + "learning_rate": 9.947942894554956e-05, + "loss": 1.9854, + "step": 1111 + }, + { + "epoch": 0.061980937517418205, + "grad_norm": 0.4870631694793701, + "learning_rate": 9.947814661754425e-05, + "loss": 1.6627, + "step": 1112 + }, + { + "epoch": 0.06203667577058135, + "grad_norm": 0.5056869387626648, + "learning_rate": 9.947686272038059e-05, + "loss": 2.0686, + "step": 1113 + }, + { + "epoch": 0.0620924140237445, + "grad_norm": 0.47897595167160034, + "learning_rate": 9.947557725409934e-05, + "loss": 1.7178, + "step": 1114 + }, + { + "epoch": 0.06214815227690764, + "grad_norm": 0.5754001140594482, + "learning_rate": 9.947429021874123e-05, + "loss": 1.9185, + "step": 1115 + }, + { + "epoch": 0.06220389053007079, + "grad_norm": 0.5134566426277161, + "learning_rate": 9.94730016143471e-05, + "loss": 1.7684, + "step": 1116 + }, + { + "epoch": 0.06225962878323393, + "grad_norm": 0.5307061076164246, + "learning_rate": 9.947171144095779e-05, + "loss": 1.8471, + "step": 1117 + }, + { + "epoch": 0.062315367036397076, + "grad_norm": 0.5750778913497925, + "learning_rate": 9.947041969861424e-05, + "loss": 2.0452, + "step": 1118 + }, + { + "epoch": 0.062371105289560226, + "grad_norm": 0.4882142245769501, + "learning_rate": 9.946912638735741e-05, + "loss": 1.6376, + "step": 1119 + }, + { + "epoch": 0.06242684354272337, + "grad_norm": 0.5403459668159485, + "learning_rate": 9.946783150722832e-05, + "loss": 1.7909, + "step": 1120 + }, + { + "epoch": 0.06248258179588652, + "grad_norm": 0.6261606812477112, + "learning_rate": 9.946653505826802e-05, + "loss": 2.3971, + "step": 1121 + }, + { + "epoch": 0.06253832004904966, + "grad_norm": 0.5000771880149841, + "learning_rate": 9.946523704051765e-05, + "loss": 1.6772, + "step": 1122 + }, + { + "epoch": 0.0625940583022128, + "grad_norm": 0.5789170265197754, + "learning_rate": 9.946393745401836e-05, + "loss": 1.5496, + "step": 1123 + }, + { + "epoch": 0.06264979655537596, + "grad_norm": 0.5486829280853271, + "learning_rate": 9.946263629881137e-05, + "loss": 1.926, + "step": 1124 + }, + { + "epoch": 0.0627055348085391, + "grad_norm": 0.4877256751060486, + "learning_rate": 9.946133357493794e-05, + "loss": 1.8916, + "step": 1125 + }, + { + "epoch": 0.06276127306170225, + "grad_norm": 0.505279541015625, + "learning_rate": 9.946002928243939e-05, + "loss": 1.7043, + "step": 1126 + }, + { + "epoch": 0.06281701131486539, + "grad_norm": 0.5650628805160522, + "learning_rate": 9.945872342135709e-05, + "loss": 2.0595, + "step": 1127 + }, + { + "epoch": 0.06287274956802853, + "grad_norm": 0.5424087047576904, + "learning_rate": 9.945741599173244e-05, + "loss": 1.7227, + "step": 1128 + }, + { + "epoch": 0.06292848782119169, + "grad_norm": 0.5090418457984924, + "learning_rate": 9.945610699360692e-05, + "loss": 1.7466, + "step": 1129 + }, + { + "epoch": 0.06298422607435483, + "grad_norm": 0.5532562732696533, + "learning_rate": 9.945479642702203e-05, + "loss": 1.9668, + "step": 1130 + }, + { + "epoch": 0.06303996432751797, + "grad_norm": 0.4829805791378021, + "learning_rate": 9.945348429201933e-05, + "loss": 1.664, + "step": 1131 + }, + { + "epoch": 0.06309570258068112, + "grad_norm": 0.5276423096656799, + "learning_rate": 9.945217058864045e-05, + "loss": 1.7043, + "step": 1132 + }, + { + "epoch": 0.06315144083384426, + "grad_norm": 0.49455907940864563, + "learning_rate": 9.945085531692704e-05, + "loss": 1.6095, + "step": 1133 + }, + { + "epoch": 0.06320717908700742, + "grad_norm": 0.49773842096328735, + "learning_rate": 9.944953847692082e-05, + "loss": 1.6696, + "step": 1134 + }, + { + "epoch": 0.06326291734017056, + "grad_norm": 0.5351307988166809, + "learning_rate": 9.944822006866356e-05, + "loss": 1.8795, + "step": 1135 + }, + { + "epoch": 0.0633186555933337, + "grad_norm": 0.5688774585723877, + "learning_rate": 9.944690009219705e-05, + "loss": 1.6658, + "step": 1136 + }, + { + "epoch": 0.06337439384649685, + "grad_norm": 0.5083485841751099, + "learning_rate": 9.944557854756316e-05, + "loss": 1.5768, + "step": 1137 + }, + { + "epoch": 0.06343013209966, + "grad_norm": 0.5670489072799683, + "learning_rate": 9.944425543480382e-05, + "loss": 1.9228, + "step": 1138 + }, + { + "epoch": 0.06348587035282315, + "grad_norm": 0.49227067828178406, + "learning_rate": 9.944293075396098e-05, + "loss": 1.5889, + "step": 1139 + }, + { + "epoch": 0.06354160860598629, + "grad_norm": 0.5258840918540955, + "learning_rate": 9.944160450507665e-05, + "loss": 1.7821, + "step": 1140 + }, + { + "epoch": 0.06359734685914943, + "grad_norm": 0.5238833427429199, + "learning_rate": 9.944027668819286e-05, + "loss": 1.6987, + "step": 1141 + }, + { + "epoch": 0.06365308511231257, + "grad_norm": 0.45374488830566406, + "learning_rate": 9.943894730335179e-05, + "loss": 1.4687, + "step": 1142 + }, + { + "epoch": 0.06370882336547573, + "grad_norm": 0.496855765581131, + "learning_rate": 9.943761635059554e-05, + "loss": 1.6539, + "step": 1143 + }, + { + "epoch": 0.06376456161863887, + "grad_norm": 0.5250856876373291, + "learning_rate": 9.943628382996634e-05, + "loss": 1.9439, + "step": 1144 + }, + { + "epoch": 0.06382029987180202, + "grad_norm": 0.49122875928878784, + "learning_rate": 9.943494974150644e-05, + "loss": 1.6248, + "step": 1145 + }, + { + "epoch": 0.06387603812496516, + "grad_norm": 0.5038126111030579, + "learning_rate": 9.943361408525818e-05, + "loss": 1.8027, + "step": 1146 + }, + { + "epoch": 0.06393177637812832, + "grad_norm": 0.5918904542922974, + "learning_rate": 9.94322768612639e-05, + "loss": 2.1447, + "step": 1147 + }, + { + "epoch": 0.06398751463129146, + "grad_norm": 0.46479690074920654, + "learning_rate": 9.943093806956601e-05, + "loss": 1.8147, + "step": 1148 + }, + { + "epoch": 0.0640432528844546, + "grad_norm": 0.5129300355911255, + "learning_rate": 9.942959771020694e-05, + "loss": 1.9251, + "step": 1149 + }, + { + "epoch": 0.06409899113761774, + "grad_norm": 0.5755007266998291, + "learning_rate": 9.942825578322926e-05, + "loss": 1.9842, + "step": 1150 + }, + { + "epoch": 0.06415472939078089, + "grad_norm": 0.4916748106479645, + "learning_rate": 9.942691228867546e-05, + "loss": 1.7163, + "step": 1151 + }, + { + "epoch": 0.06421046764394404, + "grad_norm": 0.5524545311927795, + "learning_rate": 9.94255672265882e-05, + "loss": 1.8273, + "step": 1152 + }, + { + "epoch": 0.06426620589710719, + "grad_norm": 0.5353971719741821, + "learning_rate": 9.942422059701012e-05, + "loss": 1.8914, + "step": 1153 + }, + { + "epoch": 0.06432194415027033, + "grad_norm": 0.48068755865097046, + "learning_rate": 9.942287239998392e-05, + "loss": 1.7668, + "step": 1154 + }, + { + "epoch": 0.06437768240343347, + "grad_norm": 0.48459264636039734, + "learning_rate": 9.942152263555237e-05, + "loss": 1.5809, + "step": 1155 + }, + { + "epoch": 0.06443342065659662, + "grad_norm": 0.5255505442619324, + "learning_rate": 9.942017130375825e-05, + "loss": 1.8543, + "step": 1156 + }, + { + "epoch": 0.06448915890975977, + "grad_norm": 0.5935083627700806, + "learning_rate": 9.941881840464447e-05, + "loss": 1.7744, + "step": 1157 + }, + { + "epoch": 0.06454489716292292, + "grad_norm": 0.5216168761253357, + "learning_rate": 9.941746393825386e-05, + "loss": 1.5802, + "step": 1158 + }, + { + "epoch": 0.06460063541608606, + "grad_norm": 0.5127310752868652, + "learning_rate": 9.941610790462946e-05, + "loss": 1.8704, + "step": 1159 + }, + { + "epoch": 0.0646563736692492, + "grad_norm": 0.5310918688774109, + "learning_rate": 9.94147503038142e-05, + "loss": 1.7503, + "step": 1160 + }, + { + "epoch": 0.06471211192241236, + "grad_norm": 0.5417837500572205, + "learning_rate": 9.941339113585117e-05, + "loss": 1.7069, + "step": 1161 + }, + { + "epoch": 0.0647678501755755, + "grad_norm": 0.46583306789398193, + "learning_rate": 9.94120304007835e-05, + "loss": 1.6529, + "step": 1162 + }, + { + "epoch": 0.06482358842873864, + "grad_norm": 0.5210421681404114, + "learning_rate": 9.941066809865429e-05, + "loss": 1.8965, + "step": 1163 + }, + { + "epoch": 0.06487932668190179, + "grad_norm": 0.4983007311820984, + "learning_rate": 9.940930422950679e-05, + "loss": 1.797, + "step": 1164 + }, + { + "epoch": 0.06493506493506493, + "grad_norm": 0.5835360884666443, + "learning_rate": 9.940793879338424e-05, + "loss": 1.9707, + "step": 1165 + }, + { + "epoch": 0.06499080318822809, + "grad_norm": 0.48875924944877625, + "learning_rate": 9.940657179032993e-05, + "loss": 1.8563, + "step": 1166 + }, + { + "epoch": 0.06504654144139123, + "grad_norm": 0.4999620020389557, + "learning_rate": 9.940520322038722e-05, + "loss": 1.6063, + "step": 1167 + }, + { + "epoch": 0.06510227969455437, + "grad_norm": 0.49378272891044617, + "learning_rate": 9.940383308359951e-05, + "loss": 1.8387, + "step": 1168 + }, + { + "epoch": 0.06515801794771751, + "grad_norm": 0.44992733001708984, + "learning_rate": 9.940246138001027e-05, + "loss": 1.4808, + "step": 1169 + }, + { + "epoch": 0.06521375620088067, + "grad_norm": 0.5133140683174133, + "learning_rate": 9.9401088109663e-05, + "loss": 1.9234, + "step": 1170 + }, + { + "epoch": 0.06526949445404381, + "grad_norm": 0.6143995523452759, + "learning_rate": 9.939971327260122e-05, + "loss": 2.1587, + "step": 1171 + }, + { + "epoch": 0.06532523270720696, + "grad_norm": 0.5144213438034058, + "learning_rate": 9.939833686886857e-05, + "loss": 1.8453, + "step": 1172 + }, + { + "epoch": 0.0653809709603701, + "grad_norm": 0.48773664236068726, + "learning_rate": 9.939695889850869e-05, + "loss": 1.7421, + "step": 1173 + }, + { + "epoch": 0.06543670921353324, + "grad_norm": 0.48457232117652893, + "learning_rate": 9.939557936156527e-05, + "loss": 1.7447, + "step": 1174 + }, + { + "epoch": 0.0654924474666964, + "grad_norm": 0.48477059602737427, + "learning_rate": 9.939419825808207e-05, + "loss": 1.5579, + "step": 1175 + }, + { + "epoch": 0.06554818571985954, + "grad_norm": 0.5835525393486023, + "learning_rate": 9.93928155881029e-05, + "loss": 2.1224, + "step": 1176 + }, + { + "epoch": 0.06560392397302268, + "grad_norm": 0.5277059078216553, + "learning_rate": 9.939143135167158e-05, + "loss": 1.8331, + "step": 1177 + }, + { + "epoch": 0.06565966222618583, + "grad_norm": 0.5046493411064148, + "learning_rate": 9.939004554883205e-05, + "loss": 1.7895, + "step": 1178 + }, + { + "epoch": 0.06571540047934897, + "grad_norm": 0.5206563472747803, + "learning_rate": 9.938865817962822e-05, + "loss": 1.7342, + "step": 1179 + }, + { + "epoch": 0.06577113873251213, + "grad_norm": 0.43598276376724243, + "learning_rate": 9.938726924410412e-05, + "loss": 1.5657, + "step": 1180 + }, + { + "epoch": 0.06582687698567527, + "grad_norm": 0.49584537744522095, + "learning_rate": 9.938587874230379e-05, + "loss": 1.7487, + "step": 1181 + }, + { + "epoch": 0.06588261523883841, + "grad_norm": 0.539125382900238, + "learning_rate": 9.938448667427131e-05, + "loss": 1.8534, + "step": 1182 + }, + { + "epoch": 0.06593835349200156, + "grad_norm": 0.4833453595638275, + "learning_rate": 9.938309304005086e-05, + "loss": 1.6074, + "step": 1183 + }, + { + "epoch": 0.06599409174516471, + "grad_norm": 0.5339459180831909, + "learning_rate": 9.938169783968663e-05, + "loss": 1.7358, + "step": 1184 + }, + { + "epoch": 0.06604982999832786, + "grad_norm": 0.5234376788139343, + "learning_rate": 9.938030107322283e-05, + "loss": 1.5923, + "step": 1185 + }, + { + "epoch": 0.066105568251491, + "grad_norm": 0.5175224542617798, + "learning_rate": 9.93789027407038e-05, + "loss": 1.8394, + "step": 1186 + }, + { + "epoch": 0.06616130650465414, + "grad_norm": 0.5155382752418518, + "learning_rate": 9.937750284217389e-05, + "loss": 1.6385, + "step": 1187 + }, + { + "epoch": 0.06621704475781728, + "grad_norm": 0.47023966908454895, + "learning_rate": 9.937610137767747e-05, + "loss": 1.6236, + "step": 1188 + }, + { + "epoch": 0.06627278301098044, + "grad_norm": 0.4659249484539032, + "learning_rate": 9.937469834725898e-05, + "loss": 1.6139, + "step": 1189 + }, + { + "epoch": 0.06632852126414358, + "grad_norm": 0.4964550733566284, + "learning_rate": 9.937329375096297e-05, + "loss": 1.62, + "step": 1190 + }, + { + "epoch": 0.06638425951730673, + "grad_norm": 0.5324812531471252, + "learning_rate": 9.937188758883393e-05, + "loss": 1.8803, + "step": 1191 + }, + { + "epoch": 0.06643999777046987, + "grad_norm": 0.5404229164123535, + "learning_rate": 9.937047986091646e-05, + "loss": 1.9219, + "step": 1192 + }, + { + "epoch": 0.06649573602363303, + "grad_norm": 0.49228188395500183, + "learning_rate": 9.936907056725524e-05, + "loss": 1.7777, + "step": 1193 + }, + { + "epoch": 0.06655147427679617, + "grad_norm": 0.5689822435379028, + "learning_rate": 9.936765970789492e-05, + "loss": 1.9888, + "step": 1194 + }, + { + "epoch": 0.06660721252995931, + "grad_norm": 0.5374904274940491, + "learning_rate": 9.936624728288029e-05, + "loss": 1.6308, + "step": 1195 + }, + { + "epoch": 0.06666295078312245, + "grad_norm": 0.48381903767585754, + "learning_rate": 9.93648332922561e-05, + "loss": 1.6621, + "step": 1196 + }, + { + "epoch": 0.0667186890362856, + "grad_norm": 0.5000702738761902, + "learning_rate": 9.936341773606723e-05, + "loss": 1.6883, + "step": 1197 + }, + { + "epoch": 0.06677442728944875, + "grad_norm": 0.4849522113800049, + "learning_rate": 9.936200061435857e-05, + "loss": 1.6099, + "step": 1198 + }, + { + "epoch": 0.0668301655426119, + "grad_norm": 0.5355091094970703, + "learning_rate": 9.936058192717502e-05, + "loss": 1.725, + "step": 1199 + }, + { + "epoch": 0.06688590379577504, + "grad_norm": 0.4482690095901489, + "learning_rate": 9.935916167456163e-05, + "loss": 1.5314, + "step": 1200 + }, + { + "epoch": 0.06694164204893818, + "grad_norm": 0.4166151285171509, + "learning_rate": 9.93577398565634e-05, + "loss": 1.094, + "step": 1201 + }, + { + "epoch": 0.06699738030210133, + "grad_norm": 0.569545328617096, + "learning_rate": 9.935631647322544e-05, + "loss": 1.9806, + "step": 1202 + }, + { + "epoch": 0.06705311855526448, + "grad_norm": 0.528708279132843, + "learning_rate": 9.93548915245929e-05, + "loss": 1.7586, + "step": 1203 + }, + { + "epoch": 0.06710885680842762, + "grad_norm": 0.48107293248176575, + "learning_rate": 9.935346501071095e-05, + "loss": 1.6344, + "step": 1204 + }, + { + "epoch": 0.06716459506159077, + "grad_norm": 0.5078762769699097, + "learning_rate": 9.935203693162483e-05, + "loss": 1.7792, + "step": 1205 + }, + { + "epoch": 0.06722033331475391, + "grad_norm": 0.4985436797142029, + "learning_rate": 9.935060728737986e-05, + "loss": 1.8226, + "step": 1206 + }, + { + "epoch": 0.06727607156791707, + "grad_norm": 0.5001996755599976, + "learning_rate": 9.934917607802135e-05, + "loss": 1.65, + "step": 1207 + }, + { + "epoch": 0.06733180982108021, + "grad_norm": 0.4552146792411804, + "learning_rate": 9.934774330359471e-05, + "loss": 1.5889, + "step": 1208 + }, + { + "epoch": 0.06738754807424335, + "grad_norm": 0.4674372673034668, + "learning_rate": 9.934630896414536e-05, + "loss": 1.6367, + "step": 1209 + }, + { + "epoch": 0.0674432863274065, + "grad_norm": 0.4658129811286926, + "learning_rate": 9.93448730597188e-05, + "loss": 1.6565, + "step": 1210 + }, + { + "epoch": 0.06749902458056964, + "grad_norm": 0.4953976273536682, + "learning_rate": 9.934343559036056e-05, + "loss": 1.7874, + "step": 1211 + }, + { + "epoch": 0.0675547628337328, + "grad_norm": 0.5296363830566406, + "learning_rate": 9.934199655611624e-05, + "loss": 1.4178, + "step": 1212 + }, + { + "epoch": 0.06761050108689594, + "grad_norm": 0.5114982724189758, + "learning_rate": 9.934055595703149e-05, + "loss": 1.8371, + "step": 1213 + }, + { + "epoch": 0.06766623934005908, + "grad_norm": 0.54044109582901, + "learning_rate": 9.933911379315198e-05, + "loss": 1.77, + "step": 1214 + }, + { + "epoch": 0.06772197759322222, + "grad_norm": 0.5306605100631714, + "learning_rate": 9.933767006452341e-05, + "loss": 1.7457, + "step": 1215 + }, + { + "epoch": 0.06777771584638538, + "grad_norm": 0.45446470379829407, + "learning_rate": 9.933622477119165e-05, + "loss": 1.4759, + "step": 1216 + }, + { + "epoch": 0.06783345409954852, + "grad_norm": 0.5077145099639893, + "learning_rate": 9.933477791320246e-05, + "loss": 1.5853, + "step": 1217 + }, + { + "epoch": 0.06788919235271167, + "grad_norm": 0.4767955541610718, + "learning_rate": 9.933332949060177e-05, + "loss": 1.624, + "step": 1218 + }, + { + "epoch": 0.06794493060587481, + "grad_norm": 0.5637747049331665, + "learning_rate": 9.93318795034355e-05, + "loss": 1.9126, + "step": 1219 + }, + { + "epoch": 0.06800066885903795, + "grad_norm": 0.5085890889167786, + "learning_rate": 9.933042795174963e-05, + "loss": 1.7807, + "step": 1220 + }, + { + "epoch": 0.06805640711220111, + "grad_norm": 0.539089024066925, + "learning_rate": 9.93289748355902e-05, + "loss": 1.8777, + "step": 1221 + }, + { + "epoch": 0.06811214536536425, + "grad_norm": 0.557056725025177, + "learning_rate": 9.93275201550033e-05, + "loss": 1.7479, + "step": 1222 + }, + { + "epoch": 0.0681678836185274, + "grad_norm": 0.5699108839035034, + "learning_rate": 9.932606391003508e-05, + "loss": 1.9158, + "step": 1223 + }, + { + "epoch": 0.06822362187169054, + "grad_norm": 0.5341405868530273, + "learning_rate": 9.932460610073167e-05, + "loss": 1.7554, + "step": 1224 + }, + { + "epoch": 0.06827936012485368, + "grad_norm": 0.6143330335617065, + "learning_rate": 9.932314672713936e-05, + "loss": 1.7927, + "step": 1225 + }, + { + "epoch": 0.06833509837801684, + "grad_norm": 0.500853419303894, + "learning_rate": 9.932168578930439e-05, + "loss": 1.7221, + "step": 1226 + }, + { + "epoch": 0.06839083663117998, + "grad_norm": 0.5622022151947021, + "learning_rate": 9.932022328727313e-05, + "loss": 2.0262, + "step": 1227 + }, + { + "epoch": 0.06844657488434312, + "grad_norm": 0.4860107898712158, + "learning_rate": 9.931875922109195e-05, + "loss": 1.7353, + "step": 1228 + }, + { + "epoch": 0.06850231313750627, + "grad_norm": 0.5524904131889343, + "learning_rate": 9.931729359080726e-05, + "loss": 1.8789, + "step": 1229 + }, + { + "epoch": 0.06855805139066942, + "grad_norm": 0.5192303657531738, + "learning_rate": 9.931582639646556e-05, + "loss": 1.9549, + "step": 1230 + }, + { + "epoch": 0.06861378964383257, + "grad_norm": 0.47247666120529175, + "learning_rate": 9.931435763811338e-05, + "loss": 1.7371, + "step": 1231 + }, + { + "epoch": 0.06866952789699571, + "grad_norm": 0.5242395401000977, + "learning_rate": 9.93128873157973e-05, + "loss": 1.8187, + "step": 1232 + }, + { + "epoch": 0.06872526615015885, + "grad_norm": 0.4895036816596985, + "learning_rate": 9.931141542956394e-05, + "loss": 1.6269, + "step": 1233 + }, + { + "epoch": 0.068781004403322, + "grad_norm": 0.5657653212547302, + "learning_rate": 9.930994197945999e-05, + "loss": 1.9831, + "step": 1234 + }, + { + "epoch": 0.06883674265648515, + "grad_norm": 0.5430802702903748, + "learning_rate": 9.930846696553219e-05, + "loss": 1.9577, + "step": 1235 + }, + { + "epoch": 0.0688924809096483, + "grad_norm": 0.6241572499275208, + "learning_rate": 9.930699038782729e-05, + "loss": 1.7921, + "step": 1236 + }, + { + "epoch": 0.06894821916281144, + "grad_norm": 0.5370758175849915, + "learning_rate": 9.930551224639215e-05, + "loss": 1.921, + "step": 1237 + }, + { + "epoch": 0.06900395741597458, + "grad_norm": 0.5141679048538208, + "learning_rate": 9.930403254127363e-05, + "loss": 1.8209, + "step": 1238 + }, + { + "epoch": 0.06905969566913774, + "grad_norm": 0.511951208114624, + "learning_rate": 9.930255127251866e-05, + "loss": 1.9209, + "step": 1239 + }, + { + "epoch": 0.06911543392230088, + "grad_norm": 0.5124894976615906, + "learning_rate": 9.93010684401742e-05, + "loss": 1.9073, + "step": 1240 + }, + { + "epoch": 0.06917117217546402, + "grad_norm": 0.49549224972724915, + "learning_rate": 9.929958404428732e-05, + "loss": 1.6648, + "step": 1241 + }, + { + "epoch": 0.06922691042862716, + "grad_norm": 0.4937445819377899, + "learning_rate": 9.929809808490505e-05, + "loss": 1.6878, + "step": 1242 + }, + { + "epoch": 0.06928264868179031, + "grad_norm": 0.5082506537437439, + "learning_rate": 9.929661056207455e-05, + "loss": 1.8051, + "step": 1243 + }, + { + "epoch": 0.06933838693495346, + "grad_norm": 0.5111956596374512, + "learning_rate": 9.929512147584297e-05, + "loss": 1.7016, + "step": 1244 + }, + { + "epoch": 0.0693941251881166, + "grad_norm": 0.46468988060951233, + "learning_rate": 9.929363082625755e-05, + "loss": 1.7512, + "step": 1245 + }, + { + "epoch": 0.06944986344127975, + "grad_norm": 0.5274616479873657, + "learning_rate": 9.929213861336557e-05, + "loss": 1.7578, + "step": 1246 + }, + { + "epoch": 0.06950560169444289, + "grad_norm": 0.5274865031242371, + "learning_rate": 9.929064483721435e-05, + "loss": 1.7655, + "step": 1247 + }, + { + "epoch": 0.06956133994760604, + "grad_norm": 0.5010793209075928, + "learning_rate": 9.928914949785124e-05, + "loss": 1.8085, + "step": 1248 + }, + { + "epoch": 0.06961707820076919, + "grad_norm": 0.5141963362693787, + "learning_rate": 9.928765259532371e-05, + "loss": 1.4068, + "step": 1249 + }, + { + "epoch": 0.06967281645393233, + "grad_norm": 0.5250492691993713, + "learning_rate": 9.928615412967919e-05, + "loss": 1.9137, + "step": 1250 + }, + { + "epoch": 0.06972855470709548, + "grad_norm": 0.5868452191352844, + "learning_rate": 9.928465410096521e-05, + "loss": 1.6562, + "step": 1251 + }, + { + "epoch": 0.06978429296025862, + "grad_norm": 0.553932785987854, + "learning_rate": 9.928315250922937e-05, + "loss": 1.7661, + "step": 1252 + }, + { + "epoch": 0.06984003121342178, + "grad_norm": 0.49618422985076904, + "learning_rate": 9.928164935451927e-05, + "loss": 1.9336, + "step": 1253 + }, + { + "epoch": 0.06989576946658492, + "grad_norm": 0.5094950199127197, + "learning_rate": 9.928014463688257e-05, + "loss": 1.8955, + "step": 1254 + }, + { + "epoch": 0.06995150771974806, + "grad_norm": 0.5146217942237854, + "learning_rate": 9.927863835636703e-05, + "loss": 1.7892, + "step": 1255 + }, + { + "epoch": 0.0700072459729112, + "grad_norm": 0.5579236745834351, + "learning_rate": 9.927713051302037e-05, + "loss": 1.8628, + "step": 1256 + }, + { + "epoch": 0.07006298422607435, + "grad_norm": 0.5719481706619263, + "learning_rate": 9.927562110689046e-05, + "loss": 1.9999, + "step": 1257 + }, + { + "epoch": 0.0701187224792375, + "grad_norm": 0.5164546966552734, + "learning_rate": 9.927411013802512e-05, + "loss": 1.6341, + "step": 1258 + }, + { + "epoch": 0.07017446073240065, + "grad_norm": 0.5111738443374634, + "learning_rate": 9.927259760647232e-05, + "loss": 1.8801, + "step": 1259 + }, + { + "epoch": 0.07023019898556379, + "grad_norm": 0.47879326343536377, + "learning_rate": 9.927108351227998e-05, + "loss": 1.6122, + "step": 1260 + }, + { + "epoch": 0.07028593723872693, + "grad_norm": 0.6105756759643555, + "learning_rate": 9.926956785549616e-05, + "loss": 2.0343, + "step": 1261 + }, + { + "epoch": 0.07034167549189009, + "grad_norm": 0.5080457329750061, + "learning_rate": 9.92680506361689e-05, + "loss": 1.9449, + "step": 1262 + }, + { + "epoch": 0.07039741374505323, + "grad_norm": 0.4686660170555115, + "learning_rate": 9.926653185434634e-05, + "loss": 1.7354, + "step": 1263 + }, + { + "epoch": 0.07045315199821638, + "grad_norm": 0.5146884322166443, + "learning_rate": 9.926501151007662e-05, + "loss": 1.8347, + "step": 1264 + }, + { + "epoch": 0.07050889025137952, + "grad_norm": 0.5533162355422974, + "learning_rate": 9.926348960340796e-05, + "loss": 1.887, + "step": 1265 + }, + { + "epoch": 0.07056462850454266, + "grad_norm": 0.5264948606491089, + "learning_rate": 9.926196613438865e-05, + "loss": 1.8267, + "step": 1266 + }, + { + "epoch": 0.07062036675770582, + "grad_norm": 0.5064124464988708, + "learning_rate": 9.926044110306698e-05, + "loss": 1.4021, + "step": 1267 + }, + { + "epoch": 0.07067610501086896, + "grad_norm": 0.5374730229377747, + "learning_rate": 9.925891450949135e-05, + "loss": 2.1346, + "step": 1268 + }, + { + "epoch": 0.0707318432640321, + "grad_norm": 0.5050212144851685, + "learning_rate": 9.925738635371011e-05, + "loss": 1.7458, + "step": 1269 + }, + { + "epoch": 0.07078758151719525, + "grad_norm": 0.5477495789527893, + "learning_rate": 9.925585663577181e-05, + "loss": 1.9184, + "step": 1270 + }, + { + "epoch": 0.0708433197703584, + "grad_norm": 0.4926922917366028, + "learning_rate": 9.92543253557249e-05, + "loss": 1.7406, + "step": 1271 + }, + { + "epoch": 0.07089905802352155, + "grad_norm": 0.5027531981468201, + "learning_rate": 9.925279251361795e-05, + "loss": 1.6771, + "step": 1272 + }, + { + "epoch": 0.07095479627668469, + "grad_norm": 0.44907525181770325, + "learning_rate": 9.92512581094996e-05, + "loss": 1.534, + "step": 1273 + }, + { + "epoch": 0.07101053452984783, + "grad_norm": 0.4935868978500366, + "learning_rate": 9.92497221434185e-05, + "loss": 1.6932, + "step": 1274 + }, + { + "epoch": 0.07106627278301098, + "grad_norm": 0.5403043031692505, + "learning_rate": 9.924818461542335e-05, + "loss": 1.7863, + "step": 1275 + }, + { + "epoch": 0.07112201103617413, + "grad_norm": 0.49991410970687866, + "learning_rate": 9.924664552556293e-05, + "loss": 1.5134, + "step": 1276 + }, + { + "epoch": 0.07117774928933727, + "grad_norm": 0.5363178849220276, + "learning_rate": 9.924510487388603e-05, + "loss": 1.7264, + "step": 1277 + }, + { + "epoch": 0.07123348754250042, + "grad_norm": 0.6076151728630066, + "learning_rate": 9.924356266044153e-05, + "loss": 2.0642, + "step": 1278 + }, + { + "epoch": 0.07128922579566356, + "grad_norm": 0.5013806223869324, + "learning_rate": 9.924201888527833e-05, + "loss": 1.5962, + "step": 1279 + }, + { + "epoch": 0.0713449640488267, + "grad_norm": 0.4695322513580322, + "learning_rate": 9.924047354844539e-05, + "loss": 1.657, + "step": 1280 + }, + { + "epoch": 0.07140070230198986, + "grad_norm": 0.5039030909538269, + "learning_rate": 9.923892664999173e-05, + "loss": 1.8447, + "step": 1281 + }, + { + "epoch": 0.071456440555153, + "grad_norm": 0.5190325379371643, + "learning_rate": 9.923737818996639e-05, + "loss": 1.7732, + "step": 1282 + }, + { + "epoch": 0.07151217880831615, + "grad_norm": 0.4986951947212219, + "learning_rate": 9.92358281684185e-05, + "loss": 1.5262, + "step": 1283 + }, + { + "epoch": 0.07156791706147929, + "grad_norm": 0.5534316897392273, + "learning_rate": 9.92342765853972e-05, + "loss": 2.0328, + "step": 1284 + }, + { + "epoch": 0.07162365531464245, + "grad_norm": 0.49968552589416504, + "learning_rate": 9.923272344095169e-05, + "loss": 1.7766, + "step": 1285 + }, + { + "epoch": 0.07167939356780559, + "grad_norm": 0.5316057205200195, + "learning_rate": 9.923116873513125e-05, + "loss": 1.9544, + "step": 1286 + }, + { + "epoch": 0.07173513182096873, + "grad_norm": 0.49467048048973083, + "learning_rate": 9.922961246798516e-05, + "loss": 1.6245, + "step": 1287 + }, + { + "epoch": 0.07179087007413187, + "grad_norm": 0.5283698439598083, + "learning_rate": 9.922805463956282e-05, + "loss": 1.8113, + "step": 1288 + }, + { + "epoch": 0.07184660832729502, + "grad_norm": 0.5117636322975159, + "learning_rate": 9.922649524991359e-05, + "loss": 1.5682, + "step": 1289 + }, + { + "epoch": 0.07190234658045817, + "grad_norm": 0.524705708026886, + "learning_rate": 9.922493429908695e-05, + "loss": 1.7724, + "step": 1290 + }, + { + "epoch": 0.07195808483362132, + "grad_norm": 0.5265300273895264, + "learning_rate": 9.922337178713238e-05, + "loss": 1.8775, + "step": 1291 + }, + { + "epoch": 0.07201382308678446, + "grad_norm": 0.4668891429901123, + "learning_rate": 9.922180771409945e-05, + "loss": 1.6585, + "step": 1292 + }, + { + "epoch": 0.0720695613399476, + "grad_norm": 0.5392476916313171, + "learning_rate": 9.922024208003777e-05, + "loss": 1.7811, + "step": 1293 + }, + { + "epoch": 0.07212529959311076, + "grad_norm": 0.45741191506385803, + "learning_rate": 9.921867488499699e-05, + "loss": 1.5123, + "step": 1294 + }, + { + "epoch": 0.0721810378462739, + "grad_norm": 0.5779647827148438, + "learning_rate": 9.92171061290268e-05, + "loss": 1.798, + "step": 1295 + }, + { + "epoch": 0.07223677609943704, + "grad_norm": 0.5434536337852478, + "learning_rate": 9.921553581217697e-05, + "loss": 1.8681, + "step": 1296 + }, + { + "epoch": 0.07229251435260019, + "grad_norm": 0.47686439752578735, + "learning_rate": 9.921396393449727e-05, + "loss": 1.5803, + "step": 1297 + }, + { + "epoch": 0.07234825260576333, + "grad_norm": 0.5182580947875977, + "learning_rate": 9.921239049603759e-05, + "loss": 1.8512, + "step": 1298 + }, + { + "epoch": 0.07240399085892649, + "grad_norm": 0.5331408977508545, + "learning_rate": 9.921081549684779e-05, + "loss": 1.9001, + "step": 1299 + }, + { + "epoch": 0.07245972911208963, + "grad_norm": 0.49691641330718994, + "learning_rate": 9.920923893697786e-05, + "loss": 1.718, + "step": 1300 + }, + { + "epoch": 0.07251546736525277, + "grad_norm": 0.526009202003479, + "learning_rate": 9.920766081647779e-05, + "loss": 1.6531, + "step": 1301 + }, + { + "epoch": 0.07257120561841592, + "grad_norm": 0.5836690664291382, + "learning_rate": 9.92060811353976e-05, + "loss": 1.6522, + "step": 1302 + }, + { + "epoch": 0.07262694387157906, + "grad_norm": 0.5216406583786011, + "learning_rate": 9.920449989378742e-05, + "loss": 1.5131, + "step": 1303 + }, + { + "epoch": 0.07268268212474222, + "grad_norm": 0.4874148964881897, + "learning_rate": 9.920291709169737e-05, + "loss": 1.5922, + "step": 1304 + }, + { + "epoch": 0.07273842037790536, + "grad_norm": 0.4904099404811859, + "learning_rate": 9.920133272917767e-05, + "loss": 1.83, + "step": 1305 + }, + { + "epoch": 0.0727941586310685, + "grad_norm": 0.5295507907867432, + "learning_rate": 9.919974680627856e-05, + "loss": 1.8742, + "step": 1306 + }, + { + "epoch": 0.07284989688423164, + "grad_norm": 0.5288472175598145, + "learning_rate": 9.919815932305034e-05, + "loss": 1.8706, + "step": 1307 + }, + { + "epoch": 0.0729056351373948, + "grad_norm": 0.48234906792640686, + "learning_rate": 9.919657027954335e-05, + "loss": 1.6827, + "step": 1308 + }, + { + "epoch": 0.07296137339055794, + "grad_norm": 0.5203633904457092, + "learning_rate": 9.919497967580798e-05, + "loss": 1.7064, + "step": 1309 + }, + { + "epoch": 0.07301711164372109, + "grad_norm": 0.51950603723526, + "learning_rate": 9.919338751189468e-05, + "loss": 1.7643, + "step": 1310 + }, + { + "epoch": 0.07307284989688423, + "grad_norm": 0.5219436883926392, + "learning_rate": 9.919179378785396e-05, + "loss": 1.928, + "step": 1311 + }, + { + "epoch": 0.07312858815004737, + "grad_norm": 0.5543720722198486, + "learning_rate": 9.919019850373635e-05, + "loss": 2.0754, + "step": 1312 + }, + { + "epoch": 0.07318432640321053, + "grad_norm": 0.4778376817703247, + "learning_rate": 9.918860165959243e-05, + "loss": 1.652, + "step": 1313 + }, + { + "epoch": 0.07324006465637367, + "grad_norm": 0.5367230772972107, + "learning_rate": 9.918700325547286e-05, + "loss": 1.9413, + "step": 1314 + }, + { + "epoch": 0.07329580290953681, + "grad_norm": 0.5712525248527527, + "learning_rate": 9.918540329142831e-05, + "loss": 1.7279, + "step": 1315 + }, + { + "epoch": 0.07335154116269996, + "grad_norm": 0.5032913088798523, + "learning_rate": 9.918380176750955e-05, + "loss": 1.7546, + "step": 1316 + }, + { + "epoch": 0.07340727941586311, + "grad_norm": 0.4760904908180237, + "learning_rate": 9.918219868376737e-05, + "loss": 1.657, + "step": 1317 + }, + { + "epoch": 0.07346301766902626, + "grad_norm": 0.5059273838996887, + "learning_rate": 9.91805940402526e-05, + "loss": 1.8728, + "step": 1318 + }, + { + "epoch": 0.0735187559221894, + "grad_norm": 0.5608049631118774, + "learning_rate": 9.917898783701612e-05, + "loss": 2.008, + "step": 1319 + }, + { + "epoch": 0.07357449417535254, + "grad_norm": 0.5329555869102478, + "learning_rate": 9.917738007410888e-05, + "loss": 1.6254, + "step": 1320 + }, + { + "epoch": 0.07363023242851569, + "grad_norm": 0.5802140831947327, + "learning_rate": 9.917577075158186e-05, + "loss": 2.0478, + "step": 1321 + }, + { + "epoch": 0.07368597068167884, + "grad_norm": 0.5300236940383911, + "learning_rate": 9.917415986948612e-05, + "loss": 1.8852, + "step": 1322 + }, + { + "epoch": 0.07374170893484198, + "grad_norm": 0.4858631491661072, + "learning_rate": 9.917254742787273e-05, + "loss": 1.5704, + "step": 1323 + }, + { + "epoch": 0.07379744718800513, + "grad_norm": 0.5059242248535156, + "learning_rate": 9.917093342679284e-05, + "loss": 1.6683, + "step": 1324 + }, + { + "epoch": 0.07385318544116827, + "grad_norm": 0.4971073567867279, + "learning_rate": 9.916931786629761e-05, + "loss": 1.6127, + "step": 1325 + }, + { + "epoch": 0.07390892369433141, + "grad_norm": 0.5727537274360657, + "learning_rate": 9.916770074643831e-05, + "loss": 1.8274, + "step": 1326 + }, + { + "epoch": 0.07396466194749457, + "grad_norm": 0.5242769718170166, + "learning_rate": 9.91660820672662e-05, + "loss": 1.7747, + "step": 1327 + }, + { + "epoch": 0.07402040020065771, + "grad_norm": 0.5268994569778442, + "learning_rate": 9.916446182883264e-05, + "loss": 1.8716, + "step": 1328 + }, + { + "epoch": 0.07407613845382086, + "grad_norm": 0.5069685578346252, + "learning_rate": 9.916284003118897e-05, + "loss": 1.572, + "step": 1329 + }, + { + "epoch": 0.074131876706984, + "grad_norm": 0.5535740852355957, + "learning_rate": 9.916121667438667e-05, + "loss": 1.852, + "step": 1330 + }, + { + "epoch": 0.07418761496014716, + "grad_norm": 0.5100526213645935, + "learning_rate": 9.915959175847723e-05, + "loss": 1.8053, + "step": 1331 + }, + { + "epoch": 0.0742433532133103, + "grad_norm": 0.5486835837364197, + "learning_rate": 9.915796528351212e-05, + "loss": 1.9061, + "step": 1332 + }, + { + "epoch": 0.07429909146647344, + "grad_norm": 0.546424150466919, + "learning_rate": 9.915633724954299e-05, + "loss": 1.8031, + "step": 1333 + }, + { + "epoch": 0.07435482971963658, + "grad_norm": 0.5596832036972046, + "learning_rate": 9.915470765662143e-05, + "loss": 1.7918, + "step": 1334 + }, + { + "epoch": 0.07441056797279973, + "grad_norm": 0.5737068057060242, + "learning_rate": 9.915307650479914e-05, + "loss": 1.7687, + "step": 1335 + }, + { + "epoch": 0.07446630622596288, + "grad_norm": 0.5227526426315308, + "learning_rate": 9.915144379412784e-05, + "loss": 1.6509, + "step": 1336 + }, + { + "epoch": 0.07452204447912603, + "grad_norm": 0.5172739028930664, + "learning_rate": 9.914980952465932e-05, + "loss": 1.7922, + "step": 1337 + }, + { + "epoch": 0.07457778273228917, + "grad_norm": 0.5068166851997375, + "learning_rate": 9.91481736964454e-05, + "loss": 1.6475, + "step": 1338 + }, + { + "epoch": 0.07463352098545231, + "grad_norm": 0.5804305076599121, + "learning_rate": 9.914653630953797e-05, + "loss": 1.9451, + "step": 1339 + }, + { + "epoch": 0.07468925923861547, + "grad_norm": 0.5118273496627808, + "learning_rate": 9.914489736398895e-05, + "loss": 1.6014, + "step": 1340 + }, + { + "epoch": 0.07474499749177861, + "grad_norm": 0.47122183442115784, + "learning_rate": 9.914325685985033e-05, + "loss": 1.7206, + "step": 1341 + }, + { + "epoch": 0.07480073574494175, + "grad_norm": 0.5404577851295471, + "learning_rate": 9.914161479717413e-05, + "loss": 1.984, + "step": 1342 + }, + { + "epoch": 0.0748564739981049, + "grad_norm": 0.5037184953689575, + "learning_rate": 9.91399711760124e-05, + "loss": 1.8535, + "step": 1343 + }, + { + "epoch": 0.07491221225126804, + "grad_norm": 0.5099769830703735, + "learning_rate": 9.91383259964173e-05, + "loss": 1.7632, + "step": 1344 + }, + { + "epoch": 0.0749679505044312, + "grad_norm": 0.5458886623382568, + "learning_rate": 9.9136679258441e-05, + "loss": 2.0607, + "step": 1345 + }, + { + "epoch": 0.07502368875759434, + "grad_norm": 0.4648517668247223, + "learning_rate": 9.913503096213572e-05, + "loss": 1.914, + "step": 1346 + }, + { + "epoch": 0.07507942701075748, + "grad_norm": 0.5120497941970825, + "learning_rate": 9.913338110755375e-05, + "loss": 1.8349, + "step": 1347 + }, + { + "epoch": 0.07513516526392063, + "grad_norm": 0.4551779329776764, + "learning_rate": 9.913172969474737e-05, + "loss": 1.5673, + "step": 1348 + }, + { + "epoch": 0.07519090351708377, + "grad_norm": 0.5728102326393127, + "learning_rate": 9.913007672376899e-05, + "loss": 2.1014, + "step": 1349 + }, + { + "epoch": 0.07524664177024692, + "grad_norm": 0.47414430975914, + "learning_rate": 9.912842219467105e-05, + "loss": 1.6999, + "step": 1350 + }, + { + "epoch": 0.07530238002341007, + "grad_norm": 0.5111278891563416, + "learning_rate": 9.912676610750598e-05, + "loss": 1.9367, + "step": 1351 + }, + { + "epoch": 0.07535811827657321, + "grad_norm": 0.5118902325630188, + "learning_rate": 9.91251084623263e-05, + "loss": 1.8136, + "step": 1352 + }, + { + "epoch": 0.07541385652973635, + "grad_norm": 0.5514450669288635, + "learning_rate": 9.912344925918462e-05, + "loss": 1.7309, + "step": 1353 + }, + { + "epoch": 0.07546959478289951, + "grad_norm": 0.4836481511592865, + "learning_rate": 9.912178849813353e-05, + "loss": 1.2918, + "step": 1354 + }, + { + "epoch": 0.07552533303606265, + "grad_norm": 0.5168613791465759, + "learning_rate": 9.91201261792257e-05, + "loss": 1.8673, + "step": 1355 + }, + { + "epoch": 0.0755810712892258, + "grad_norm": 0.48082637786865234, + "learning_rate": 9.911846230251388e-05, + "loss": 1.6275, + "step": 1356 + }, + { + "epoch": 0.07563680954238894, + "grad_norm": 0.504571259021759, + "learning_rate": 9.91167968680508e-05, + "loss": 1.7718, + "step": 1357 + }, + { + "epoch": 0.07569254779555208, + "grad_norm": 0.499100923538208, + "learning_rate": 9.911512987588932e-05, + "loss": 1.7842, + "step": 1358 + }, + { + "epoch": 0.07574828604871524, + "grad_norm": 0.4926021993160248, + "learning_rate": 9.911346132608225e-05, + "loss": 1.5556, + "step": 1359 + }, + { + "epoch": 0.07580402430187838, + "grad_norm": 0.5981921553611755, + "learning_rate": 9.911179121868255e-05, + "loss": 1.853, + "step": 1360 + }, + { + "epoch": 0.07585976255504152, + "grad_norm": 0.4938274621963501, + "learning_rate": 9.911011955374316e-05, + "loss": 1.646, + "step": 1361 + }, + { + "epoch": 0.07591550080820467, + "grad_norm": 0.4952639937400818, + "learning_rate": 9.910844633131713e-05, + "loss": 1.6188, + "step": 1362 + }, + { + "epoch": 0.07597123906136782, + "grad_norm": 0.5024005770683289, + "learning_rate": 9.91067715514575e-05, + "loss": 1.9164, + "step": 1363 + }, + { + "epoch": 0.07602697731453097, + "grad_norm": 0.5488448143005371, + "learning_rate": 9.910509521421738e-05, + "loss": 1.9139, + "step": 1364 + }, + { + "epoch": 0.07608271556769411, + "grad_norm": 0.5247362852096558, + "learning_rate": 9.910341731964996e-05, + "loss": 1.8488, + "step": 1365 + }, + { + "epoch": 0.07613845382085725, + "grad_norm": 0.5229883193969727, + "learning_rate": 9.910173786780842e-05, + "loss": 1.8503, + "step": 1366 + }, + { + "epoch": 0.0761941920740204, + "grad_norm": 0.49642667174339294, + "learning_rate": 9.910005685874603e-05, + "loss": 1.7051, + "step": 1367 + }, + { + "epoch": 0.07624993032718355, + "grad_norm": 0.48131421208381653, + "learning_rate": 9.909837429251614e-05, + "loss": 1.4925, + "step": 1368 + }, + { + "epoch": 0.0763056685803467, + "grad_norm": 0.4743631184101105, + "learning_rate": 9.909669016917204e-05, + "loss": 1.5833, + "step": 1369 + }, + { + "epoch": 0.07636140683350984, + "grad_norm": 0.5918928980827332, + "learning_rate": 9.909500448876721e-05, + "loss": 2.1295, + "step": 1370 + }, + { + "epoch": 0.07641714508667298, + "grad_norm": 0.5590381622314453, + "learning_rate": 9.909331725135509e-05, + "loss": 1.862, + "step": 1371 + }, + { + "epoch": 0.07647288333983612, + "grad_norm": 0.5015060305595398, + "learning_rate": 9.909162845698916e-05, + "loss": 1.7541, + "step": 1372 + }, + { + "epoch": 0.07652862159299928, + "grad_norm": 0.5213440656661987, + "learning_rate": 9.9089938105723e-05, + "loss": 1.7944, + "step": 1373 + }, + { + "epoch": 0.07658435984616242, + "grad_norm": 0.5424663424491882, + "learning_rate": 9.908824619761023e-05, + "loss": 1.8207, + "step": 1374 + }, + { + "epoch": 0.07664009809932557, + "grad_norm": 0.548622727394104, + "learning_rate": 9.908655273270449e-05, + "loss": 1.8224, + "step": 1375 + }, + { + "epoch": 0.07669583635248871, + "grad_norm": 0.5018399953842163, + "learning_rate": 9.908485771105949e-05, + "loss": 1.856, + "step": 1376 + }, + { + "epoch": 0.07675157460565186, + "grad_norm": 0.5578395128250122, + "learning_rate": 9.908316113272897e-05, + "loss": 1.7791, + "step": 1377 + }, + { + "epoch": 0.07680731285881501, + "grad_norm": 0.5207507610321045, + "learning_rate": 9.908146299776678e-05, + "loss": 1.7608, + "step": 1378 + }, + { + "epoch": 0.07686305111197815, + "grad_norm": 0.5391795039176941, + "learning_rate": 9.907976330622674e-05, + "loss": 1.772, + "step": 1379 + }, + { + "epoch": 0.0769187893651413, + "grad_norm": 0.47418221831321716, + "learning_rate": 9.907806205816277e-05, + "loss": 1.2319, + "step": 1380 + }, + { + "epoch": 0.07697452761830444, + "grad_norm": 0.49630096554756165, + "learning_rate": 9.90763592536288e-05, + "loss": 1.676, + "step": 1381 + }, + { + "epoch": 0.0770302658714676, + "grad_norm": 0.533801257610321, + "learning_rate": 9.907465489267886e-05, + "loss": 1.7612, + "step": 1382 + }, + { + "epoch": 0.07708600412463074, + "grad_norm": 0.5061699748039246, + "learning_rate": 9.907294897536699e-05, + "loss": 1.8883, + "step": 1383 + }, + { + "epoch": 0.07714174237779388, + "grad_norm": 0.5732898116111755, + "learning_rate": 9.90712415017473e-05, + "loss": 1.8195, + "step": 1384 + }, + { + "epoch": 0.07719748063095702, + "grad_norm": 0.5062339901924133, + "learning_rate": 9.906953247187392e-05, + "loss": 1.765, + "step": 1385 + }, + { + "epoch": 0.07725321888412018, + "grad_norm": 0.4672509729862213, + "learning_rate": 9.906782188580107e-05, + "loss": 1.5199, + "step": 1386 + }, + { + "epoch": 0.07730895713728332, + "grad_norm": 0.5902494788169861, + "learning_rate": 9.9066109743583e-05, + "loss": 2.1369, + "step": 1387 + }, + { + "epoch": 0.07736469539044646, + "grad_norm": 0.4874188005924225, + "learning_rate": 9.9064396045274e-05, + "loss": 1.6941, + "step": 1388 + }, + { + "epoch": 0.0774204336436096, + "grad_norm": 0.5620763301849365, + "learning_rate": 9.906268079092843e-05, + "loss": 1.7395, + "step": 1389 + }, + { + "epoch": 0.07747617189677275, + "grad_norm": 0.5454680919647217, + "learning_rate": 9.906096398060067e-05, + "loss": 1.7771, + "step": 1390 + }, + { + "epoch": 0.0775319101499359, + "grad_norm": 0.5270059704780579, + "learning_rate": 9.905924561434519e-05, + "loss": 1.8375, + "step": 1391 + }, + { + "epoch": 0.07758764840309905, + "grad_norm": 0.4714577794075012, + "learning_rate": 9.905752569221647e-05, + "loss": 1.4259, + "step": 1392 + }, + { + "epoch": 0.07764338665626219, + "grad_norm": 0.4905398190021515, + "learning_rate": 9.905580421426905e-05, + "loss": 1.7302, + "step": 1393 + }, + { + "epoch": 0.07769912490942534, + "grad_norm": 0.5166676640510559, + "learning_rate": 9.905408118055755e-05, + "loss": 1.665, + "step": 1394 + }, + { + "epoch": 0.07775486316258848, + "grad_norm": 0.5545955896377563, + "learning_rate": 9.905235659113658e-05, + "loss": 1.7589, + "step": 1395 + }, + { + "epoch": 0.07781060141575163, + "grad_norm": 0.5974867343902588, + "learning_rate": 9.905063044606088e-05, + "loss": 1.9677, + "step": 1396 + }, + { + "epoch": 0.07786633966891478, + "grad_norm": 0.538375198841095, + "learning_rate": 9.904890274538516e-05, + "loss": 1.6438, + "step": 1397 + }, + { + "epoch": 0.07792207792207792, + "grad_norm": 0.5226508378982544, + "learning_rate": 9.904717348916421e-05, + "loss": 1.8672, + "step": 1398 + }, + { + "epoch": 0.07797781617524106, + "grad_norm": 0.5076341032981873, + "learning_rate": 9.904544267745288e-05, + "loss": 1.6942, + "step": 1399 + }, + { + "epoch": 0.07803355442840422, + "grad_norm": 0.5587323307991028, + "learning_rate": 9.904371031030608e-05, + "loss": 2.0127, + "step": 1400 + }, + { + "epoch": 0.07808929268156736, + "grad_norm": 0.5744814276695251, + "learning_rate": 9.904197638777872e-05, + "loss": 1.6781, + "step": 1401 + }, + { + "epoch": 0.0781450309347305, + "grad_norm": 0.4966742992401123, + "learning_rate": 9.904024090992581e-05, + "loss": 1.7314, + "step": 1402 + }, + { + "epoch": 0.07820076918789365, + "grad_norm": 0.5050981640815735, + "learning_rate": 9.903850387680238e-05, + "loss": 1.8782, + "step": 1403 + }, + { + "epoch": 0.07825650744105679, + "grad_norm": 0.518583357334137, + "learning_rate": 9.903676528846352e-05, + "loss": 1.9028, + "step": 1404 + }, + { + "epoch": 0.07831224569421995, + "grad_norm": 0.5047330856323242, + "learning_rate": 9.903502514496436e-05, + "loss": 1.6501, + "step": 1405 + }, + { + "epoch": 0.07836798394738309, + "grad_norm": 0.5036478042602539, + "learning_rate": 9.903328344636012e-05, + "loss": 1.7873, + "step": 1406 + }, + { + "epoch": 0.07842372220054623, + "grad_norm": 0.49196913838386536, + "learning_rate": 9.903154019270599e-05, + "loss": 1.6404, + "step": 1407 + }, + { + "epoch": 0.07847946045370938, + "grad_norm": 0.5227888226509094, + "learning_rate": 9.90297953840573e-05, + "loss": 1.8049, + "step": 1408 + }, + { + "epoch": 0.07853519870687253, + "grad_norm": 0.5419712662696838, + "learning_rate": 9.902804902046935e-05, + "loss": 1.8979, + "step": 1409 + }, + { + "epoch": 0.07859093696003568, + "grad_norm": 0.5512637495994568, + "learning_rate": 9.902630110199753e-05, + "loss": 1.5322, + "step": 1410 + }, + { + "epoch": 0.07864667521319882, + "grad_norm": 0.5147241353988647, + "learning_rate": 9.90245516286973e-05, + "loss": 1.8126, + "step": 1411 + }, + { + "epoch": 0.07870241346636196, + "grad_norm": 0.5257126092910767, + "learning_rate": 9.902280060062413e-05, + "loss": 1.9197, + "step": 1412 + }, + { + "epoch": 0.0787581517195251, + "grad_norm": 0.5739386677742004, + "learning_rate": 9.902104801783352e-05, + "loss": 2.0767, + "step": 1413 + }, + { + "epoch": 0.07881388997268826, + "grad_norm": 0.47901228070259094, + "learning_rate": 9.90192938803811e-05, + "loss": 1.4594, + "step": 1414 + }, + { + "epoch": 0.0788696282258514, + "grad_norm": 0.4943484663963318, + "learning_rate": 9.901753818832248e-05, + "loss": 1.6394, + "step": 1415 + }, + { + "epoch": 0.07892536647901455, + "grad_norm": 0.5033669471740723, + "learning_rate": 9.901578094171333e-05, + "loss": 1.6963, + "step": 1416 + }, + { + "epoch": 0.07898110473217769, + "grad_norm": 0.5039759874343872, + "learning_rate": 9.90140221406094e-05, + "loss": 1.5721, + "step": 1417 + }, + { + "epoch": 0.07903684298534083, + "grad_norm": 0.49595627188682556, + "learning_rate": 9.901226178506646e-05, + "loss": 1.7414, + "step": 1418 + }, + { + "epoch": 0.07909258123850399, + "grad_norm": 0.5233118534088135, + "learning_rate": 9.901049987514033e-05, + "loss": 1.7728, + "step": 1419 + }, + { + "epoch": 0.07914831949166713, + "grad_norm": 0.5164638757705688, + "learning_rate": 9.90087364108869e-05, + "loss": 1.8569, + "step": 1420 + }, + { + "epoch": 0.07920405774483028, + "grad_norm": 0.5309315323829651, + "learning_rate": 9.900697139236209e-05, + "loss": 1.7734, + "step": 1421 + }, + { + "epoch": 0.07925979599799342, + "grad_norm": 0.4936157464981079, + "learning_rate": 9.900520481962188e-05, + "loss": 1.6859, + "step": 1422 + }, + { + "epoch": 0.07931553425115657, + "grad_norm": 0.4760551452636719, + "learning_rate": 9.90034366927223e-05, + "loss": 1.7148, + "step": 1423 + }, + { + "epoch": 0.07937127250431972, + "grad_norm": 0.5099088549613953, + "learning_rate": 9.90016670117194e-05, + "loss": 1.7605, + "step": 1424 + }, + { + "epoch": 0.07942701075748286, + "grad_norm": 0.512695848941803, + "learning_rate": 9.899989577666933e-05, + "loss": 1.7824, + "step": 1425 + }, + { + "epoch": 0.079482749010646, + "grad_norm": 0.5051438212394714, + "learning_rate": 9.899812298762826e-05, + "loss": 1.8003, + "step": 1426 + }, + { + "epoch": 0.07953848726380915, + "grad_norm": 0.5289508700370789, + "learning_rate": 9.899634864465241e-05, + "loss": 1.7588, + "step": 1427 + }, + { + "epoch": 0.0795942255169723, + "grad_norm": 0.4910021424293518, + "learning_rate": 9.899457274779804e-05, + "loss": 1.7284, + "step": 1428 + }, + { + "epoch": 0.07964996377013545, + "grad_norm": 0.6068856716156006, + "learning_rate": 9.899279529712148e-05, + "loss": 1.9947, + "step": 1429 + }, + { + "epoch": 0.07970570202329859, + "grad_norm": 0.5239669680595398, + "learning_rate": 9.899101629267911e-05, + "loss": 1.5956, + "step": 1430 + }, + { + "epoch": 0.07976144027646173, + "grad_norm": 0.5577272176742554, + "learning_rate": 9.898923573452734e-05, + "loss": 2.0396, + "step": 1431 + }, + { + "epoch": 0.07981717852962489, + "grad_norm": 0.4893241822719574, + "learning_rate": 9.898745362272264e-05, + "loss": 1.5054, + "step": 1432 + }, + { + "epoch": 0.07987291678278803, + "grad_norm": 0.48603859543800354, + "learning_rate": 9.898566995732153e-05, + "loss": 1.6304, + "step": 1433 + }, + { + "epoch": 0.07992865503595117, + "grad_norm": 0.5560683012008667, + "learning_rate": 9.898388473838056e-05, + "loss": 1.8177, + "step": 1434 + }, + { + "epoch": 0.07998439328911432, + "grad_norm": 0.5030083060264587, + "learning_rate": 9.898209796595636e-05, + "loss": 1.7325, + "step": 1435 + }, + { + "epoch": 0.08004013154227746, + "grad_norm": 0.48422524333000183, + "learning_rate": 9.898030964010562e-05, + "loss": 1.5905, + "step": 1436 + }, + { + "epoch": 0.08009586979544062, + "grad_norm": 0.5284083485603333, + "learning_rate": 9.897851976088501e-05, + "loss": 1.672, + "step": 1437 + }, + { + "epoch": 0.08015160804860376, + "grad_norm": 0.5937215685844421, + "learning_rate": 9.897672832835135e-05, + "loss": 1.9549, + "step": 1438 + }, + { + "epoch": 0.0802073463017669, + "grad_norm": 0.4896755516529083, + "learning_rate": 9.89749353425614e-05, + "loss": 1.7438, + "step": 1439 + }, + { + "epoch": 0.08026308455493004, + "grad_norm": 0.5281119346618652, + "learning_rate": 9.897314080357202e-05, + "loss": 1.6437, + "step": 1440 + }, + { + "epoch": 0.08031882280809319, + "grad_norm": 0.5150919556617737, + "learning_rate": 9.897134471144019e-05, + "loss": 1.742, + "step": 1441 + }, + { + "epoch": 0.08037456106125634, + "grad_norm": 0.5028387308120728, + "learning_rate": 9.896954706622281e-05, + "loss": 1.5031, + "step": 1442 + }, + { + "epoch": 0.08043029931441949, + "grad_norm": 0.5158771276473999, + "learning_rate": 9.896774786797691e-05, + "loss": 1.533, + "step": 1443 + }, + { + "epoch": 0.08048603756758263, + "grad_norm": 0.5377411842346191, + "learning_rate": 9.896594711675954e-05, + "loss": 2.0242, + "step": 1444 + }, + { + "epoch": 0.08054177582074577, + "grad_norm": 0.4912663698196411, + "learning_rate": 9.896414481262784e-05, + "loss": 1.815, + "step": 1445 + }, + { + "epoch": 0.08059751407390893, + "grad_norm": 0.47936177253723145, + "learning_rate": 9.896234095563893e-05, + "loss": 1.5458, + "step": 1446 + }, + { + "epoch": 0.08065325232707207, + "grad_norm": 0.5695403218269348, + "learning_rate": 9.896053554585006e-05, + "loss": 2.1062, + "step": 1447 + }, + { + "epoch": 0.08070899058023522, + "grad_norm": 0.5067823529243469, + "learning_rate": 9.895872858331843e-05, + "loss": 1.7228, + "step": 1448 + }, + { + "epoch": 0.08076472883339836, + "grad_norm": 0.5249797105789185, + "learning_rate": 9.89569200681014e-05, + "loss": 1.8915, + "step": 1449 + }, + { + "epoch": 0.0808204670865615, + "grad_norm": 0.5042678713798523, + "learning_rate": 9.895511000025629e-05, + "loss": 1.857, + "step": 1450 + }, + { + "epoch": 0.08087620533972466, + "grad_norm": 0.5119437575340271, + "learning_rate": 9.895329837984053e-05, + "loss": 1.7033, + "step": 1451 + }, + { + "epoch": 0.0809319435928878, + "grad_norm": 0.5357143878936768, + "learning_rate": 9.895148520691155e-05, + "loss": 1.9076, + "step": 1452 + }, + { + "epoch": 0.08098768184605094, + "grad_norm": 0.47728776931762695, + "learning_rate": 9.894967048152688e-05, + "loss": 1.4164, + "step": 1453 + }, + { + "epoch": 0.08104342009921409, + "grad_norm": 0.5269622206687927, + "learning_rate": 9.894785420374405e-05, + "loss": 1.9833, + "step": 1454 + }, + { + "epoch": 0.08109915835237724, + "grad_norm": 0.5312412977218628, + "learning_rate": 9.894603637362068e-05, + "loss": 1.8342, + "step": 1455 + }, + { + "epoch": 0.08115489660554039, + "grad_norm": 0.5786725282669067, + "learning_rate": 9.894421699121439e-05, + "loss": 2.1415, + "step": 1456 + }, + { + "epoch": 0.08121063485870353, + "grad_norm": 0.4990336000919342, + "learning_rate": 9.894239605658292e-05, + "loss": 1.8387, + "step": 1457 + }, + { + "epoch": 0.08126637311186667, + "grad_norm": 0.5438005924224854, + "learning_rate": 9.8940573569784e-05, + "loss": 1.9307, + "step": 1458 + }, + { + "epoch": 0.08132211136502981, + "grad_norm": 0.5444794297218323, + "learning_rate": 9.893874953087543e-05, + "loss": 1.7991, + "step": 1459 + }, + { + "epoch": 0.08137784961819297, + "grad_norm": 0.5221540331840515, + "learning_rate": 9.893692393991504e-05, + "loss": 1.7898, + "step": 1460 + }, + { + "epoch": 0.08143358787135611, + "grad_norm": 0.509023129940033, + "learning_rate": 9.893509679696077e-05, + "loss": 1.8955, + "step": 1461 + }, + { + "epoch": 0.08148932612451926, + "grad_norm": 0.5018633008003235, + "learning_rate": 9.893326810207053e-05, + "loss": 1.6774, + "step": 1462 + }, + { + "epoch": 0.0815450643776824, + "grad_norm": 0.5234403610229492, + "learning_rate": 9.893143785530233e-05, + "loss": 1.5989, + "step": 1463 + }, + { + "epoch": 0.08160080263084554, + "grad_norm": 0.5122543573379517, + "learning_rate": 9.892960605671421e-05, + "loss": 1.6129, + "step": 1464 + }, + { + "epoch": 0.0816565408840087, + "grad_norm": 0.5005357265472412, + "learning_rate": 9.892777270636426e-05, + "loss": 1.7568, + "step": 1465 + }, + { + "epoch": 0.08171227913717184, + "grad_norm": 0.4521070420742035, + "learning_rate": 9.892593780431063e-05, + "loss": 1.5785, + "step": 1466 + }, + { + "epoch": 0.08176801739033498, + "grad_norm": 0.5116862058639526, + "learning_rate": 9.892410135061151e-05, + "loss": 1.6021, + "step": 1467 + }, + { + "epoch": 0.08182375564349813, + "grad_norm": 0.5345929861068726, + "learning_rate": 9.892226334532515e-05, + "loss": 1.7185, + "step": 1468 + }, + { + "epoch": 0.08187949389666128, + "grad_norm": 0.5190909504890442, + "learning_rate": 9.892042378850983e-05, + "loss": 1.7729, + "step": 1469 + }, + { + "epoch": 0.08193523214982443, + "grad_norm": 0.5051796436309814, + "learning_rate": 9.89185826802239e-05, + "loss": 1.7497, + "step": 1470 + }, + { + "epoch": 0.08199097040298757, + "grad_norm": 0.49057456851005554, + "learning_rate": 9.891674002052572e-05, + "loss": 1.7032, + "step": 1471 + }, + { + "epoch": 0.08204670865615071, + "grad_norm": 0.48970887064933777, + "learning_rate": 9.891489580947377e-05, + "loss": 1.697, + "step": 1472 + }, + { + "epoch": 0.08210244690931386, + "grad_norm": 0.466226726770401, + "learning_rate": 9.891305004712652e-05, + "loss": 1.676, + "step": 1473 + }, + { + "epoch": 0.08215818516247701, + "grad_norm": 0.5120090246200562, + "learning_rate": 9.891120273354248e-05, + "loss": 1.7862, + "step": 1474 + }, + { + "epoch": 0.08221392341564016, + "grad_norm": 0.5071076154708862, + "learning_rate": 9.890935386878029e-05, + "loss": 1.7835, + "step": 1475 + }, + { + "epoch": 0.0822696616688033, + "grad_norm": 0.5432698726654053, + "learning_rate": 9.890750345289855e-05, + "loss": 1.9147, + "step": 1476 + }, + { + "epoch": 0.08232539992196644, + "grad_norm": 0.5131239295005798, + "learning_rate": 9.890565148595594e-05, + "loss": 1.9944, + "step": 1477 + }, + { + "epoch": 0.0823811381751296, + "grad_norm": 0.49580785632133484, + "learning_rate": 9.890379796801122e-05, + "loss": 1.7003, + "step": 1478 + }, + { + "epoch": 0.08243687642829274, + "grad_norm": 0.5251078605651855, + "learning_rate": 9.890194289912315e-05, + "loss": 1.5901, + "step": 1479 + }, + { + "epoch": 0.08249261468145588, + "grad_norm": 0.4522892236709595, + "learning_rate": 9.890008627935057e-05, + "loss": 1.4628, + "step": 1480 + }, + { + "epoch": 0.08254835293461903, + "grad_norm": 0.49866771697998047, + "learning_rate": 9.889822810875236e-05, + "loss": 1.797, + "step": 1481 + }, + { + "epoch": 0.08260409118778217, + "grad_norm": 0.5042446851730347, + "learning_rate": 9.889636838738745e-05, + "loss": 1.7715, + "step": 1482 + }, + { + "epoch": 0.08265982944094533, + "grad_norm": 0.5398827791213989, + "learning_rate": 9.889450711531482e-05, + "loss": 1.7935, + "step": 1483 + }, + { + "epoch": 0.08271556769410847, + "grad_norm": 0.5085358023643494, + "learning_rate": 9.889264429259351e-05, + "loss": 1.7009, + "step": 1484 + }, + { + "epoch": 0.08277130594727161, + "grad_norm": 0.5344458222389221, + "learning_rate": 9.889077991928257e-05, + "loss": 1.9159, + "step": 1485 + }, + { + "epoch": 0.08282704420043475, + "grad_norm": 0.5375879406929016, + "learning_rate": 9.888891399544116e-05, + "loss": 1.8089, + "step": 1486 + }, + { + "epoch": 0.0828827824535979, + "grad_norm": 0.5068013668060303, + "learning_rate": 9.888704652112841e-05, + "loss": 1.81, + "step": 1487 + }, + { + "epoch": 0.08293852070676105, + "grad_norm": 0.5293126106262207, + "learning_rate": 9.88851774964036e-05, + "loss": 1.8359, + "step": 1488 + }, + { + "epoch": 0.0829942589599242, + "grad_norm": 0.538372814655304, + "learning_rate": 9.8883306921326e-05, + "loss": 1.7542, + "step": 1489 + }, + { + "epoch": 0.08304999721308734, + "grad_norm": 0.5009732246398926, + "learning_rate": 9.888143479595487e-05, + "loss": 1.761, + "step": 1490 + }, + { + "epoch": 0.08310573546625048, + "grad_norm": 0.5073357820510864, + "learning_rate": 9.887956112034965e-05, + "loss": 1.961, + "step": 1491 + }, + { + "epoch": 0.08316147371941364, + "grad_norm": 0.5246378779411316, + "learning_rate": 9.887768589456973e-05, + "loss": 1.6075, + "step": 1492 + }, + { + "epoch": 0.08321721197257678, + "grad_norm": 0.5965234637260437, + "learning_rate": 9.88758091186746e-05, + "loss": 1.7721, + "step": 1493 + }, + { + "epoch": 0.08327295022573993, + "grad_norm": 0.580460250377655, + "learning_rate": 9.887393079272378e-05, + "loss": 2.0317, + "step": 1494 + }, + { + "epoch": 0.08332868847890307, + "grad_norm": 0.47487667202949524, + "learning_rate": 9.88720509167768e-05, + "loss": 1.614, + "step": 1495 + }, + { + "epoch": 0.08338442673206621, + "grad_norm": 0.511886715888977, + "learning_rate": 9.887016949089333e-05, + "loss": 1.7988, + "step": 1496 + }, + { + "epoch": 0.08344016498522937, + "grad_norm": 0.5386150479316711, + "learning_rate": 9.886828651513302e-05, + "loss": 1.6694, + "step": 1497 + }, + { + "epoch": 0.08349590323839251, + "grad_norm": 0.5117900967597961, + "learning_rate": 9.886640198955557e-05, + "loss": 1.9023, + "step": 1498 + }, + { + "epoch": 0.08355164149155565, + "grad_norm": 0.5726772546768188, + "learning_rate": 9.886451591422076e-05, + "loss": 1.8974, + "step": 1499 + }, + { + "epoch": 0.0836073797447188, + "grad_norm": 0.5696210861206055, + "learning_rate": 9.886262828918842e-05, + "loss": 2.011, + "step": 1500 + }, + { + "epoch": 0.08366311799788195, + "grad_norm": 0.5422051548957825, + "learning_rate": 9.886073911451838e-05, + "loss": 1.853, + "step": 1501 + }, + { + "epoch": 0.0837188562510451, + "grad_norm": 0.5856989622116089, + "learning_rate": 9.88588483902706e-05, + "loss": 2.0279, + "step": 1502 + }, + { + "epoch": 0.08377459450420824, + "grad_norm": 0.49369946122169495, + "learning_rate": 9.8856956116505e-05, + "loss": 1.9006, + "step": 1503 + }, + { + "epoch": 0.08383033275737138, + "grad_norm": 0.5601094961166382, + "learning_rate": 9.88550622932816e-05, + "loss": 1.8549, + "step": 1504 + }, + { + "epoch": 0.08388607101053452, + "grad_norm": 0.5482882857322693, + "learning_rate": 9.885316692066048e-05, + "loss": 1.6991, + "step": 1505 + }, + { + "epoch": 0.08394180926369768, + "grad_norm": 0.5111584663391113, + "learning_rate": 9.885126999870173e-05, + "loss": 1.7942, + "step": 1506 + }, + { + "epoch": 0.08399754751686082, + "grad_norm": 0.5061234831809998, + "learning_rate": 9.884937152746553e-05, + "loss": 1.7333, + "step": 1507 + }, + { + "epoch": 0.08405328577002397, + "grad_norm": 0.5409541726112366, + "learning_rate": 9.884747150701207e-05, + "loss": 1.8288, + "step": 1508 + }, + { + "epoch": 0.08410902402318711, + "grad_norm": 0.5025638341903687, + "learning_rate": 9.884556993740161e-05, + "loss": 1.7986, + "step": 1509 + }, + { + "epoch": 0.08416476227635025, + "grad_norm": 0.544328510761261, + "learning_rate": 9.884366681869447e-05, + "loss": 1.9335, + "step": 1510 + }, + { + "epoch": 0.08422050052951341, + "grad_norm": 0.5425384640693665, + "learning_rate": 9.8841762150951e-05, + "loss": 1.952, + "step": 1511 + }, + { + "epoch": 0.08427623878267655, + "grad_norm": 0.546819269657135, + "learning_rate": 9.883985593423158e-05, + "loss": 1.6983, + "step": 1512 + }, + { + "epoch": 0.0843319770358397, + "grad_norm": 0.5102137327194214, + "learning_rate": 9.88379481685967e-05, + "loss": 1.9128, + "step": 1513 + }, + { + "epoch": 0.08438771528900284, + "grad_norm": 0.5642107725143433, + "learning_rate": 9.883603885410686e-05, + "loss": 1.8798, + "step": 1514 + }, + { + "epoch": 0.084443453542166, + "grad_norm": 0.5285095572471619, + "learning_rate": 9.88341279908226e-05, + "loss": 1.987, + "step": 1515 + }, + { + "epoch": 0.08449919179532914, + "grad_norm": 0.5712692737579346, + "learning_rate": 9.88322155788045e-05, + "loss": 1.9272, + "step": 1516 + }, + { + "epoch": 0.08455493004849228, + "grad_norm": 0.5068216919898987, + "learning_rate": 9.883030161811324e-05, + "loss": 1.747, + "step": 1517 + }, + { + "epoch": 0.08461066830165542, + "grad_norm": 0.5292205810546875, + "learning_rate": 9.882838610880954e-05, + "loss": 1.7361, + "step": 1518 + }, + { + "epoch": 0.08466640655481857, + "grad_norm": 0.5131486654281616, + "learning_rate": 9.88264690509541e-05, + "loss": 1.7197, + "step": 1519 + }, + { + "epoch": 0.08472214480798172, + "grad_norm": 0.5345507860183716, + "learning_rate": 9.882455044460773e-05, + "loss": 1.6553, + "step": 1520 + }, + { + "epoch": 0.08477788306114487, + "grad_norm": 0.5729446411132812, + "learning_rate": 9.88226302898313e-05, + "loss": 1.9354, + "step": 1521 + }, + { + "epoch": 0.08483362131430801, + "grad_norm": 0.5425586700439453, + "learning_rate": 9.882070858668568e-05, + "loss": 1.7173, + "step": 1522 + }, + { + "epoch": 0.08488935956747115, + "grad_norm": 0.5828628540039062, + "learning_rate": 9.881878533523185e-05, + "loss": 1.5161, + "step": 1523 + }, + { + "epoch": 0.08494509782063431, + "grad_norm": 0.4496408998966217, + "learning_rate": 9.881686053553077e-05, + "loss": 1.4486, + "step": 1524 + }, + { + "epoch": 0.08500083607379745, + "grad_norm": 0.5365184545516968, + "learning_rate": 9.88149341876435e-05, + "loss": 1.7378, + "step": 1525 + }, + { + "epoch": 0.0850565743269606, + "grad_norm": 0.5183097720146179, + "learning_rate": 9.881300629163113e-05, + "loss": 1.7466, + "step": 1526 + }, + { + "epoch": 0.08511231258012374, + "grad_norm": 0.5500345826148987, + "learning_rate": 9.88110768475548e-05, + "loss": 2.05, + "step": 1527 + }, + { + "epoch": 0.08516805083328688, + "grad_norm": 0.5311182141304016, + "learning_rate": 9.88091458554757e-05, + "loss": 1.9213, + "step": 1528 + }, + { + "epoch": 0.08522378908645004, + "grad_norm": 0.5297403335571289, + "learning_rate": 9.880721331545507e-05, + "loss": 1.7725, + "step": 1529 + }, + { + "epoch": 0.08527952733961318, + "grad_norm": 0.4777231514453888, + "learning_rate": 9.880527922755418e-05, + "loss": 1.7671, + "step": 1530 + }, + { + "epoch": 0.08533526559277632, + "grad_norm": 0.5027580261230469, + "learning_rate": 9.880334359183441e-05, + "loss": 1.5094, + "step": 1531 + }, + { + "epoch": 0.08539100384593946, + "grad_norm": 0.5496742725372314, + "learning_rate": 9.880140640835711e-05, + "loss": 1.8291, + "step": 1532 + }, + { + "epoch": 0.08544674209910261, + "grad_norm": 0.5041139721870422, + "learning_rate": 9.879946767718374e-05, + "loss": 1.6669, + "step": 1533 + }, + { + "epoch": 0.08550248035226576, + "grad_norm": 0.5976061820983887, + "learning_rate": 9.879752739837578e-05, + "loss": 2.1902, + "step": 1534 + }, + { + "epoch": 0.0855582186054289, + "grad_norm": 0.5422946810722351, + "learning_rate": 9.879558557199475e-05, + "loss": 1.5727, + "step": 1535 + }, + { + "epoch": 0.08561395685859205, + "grad_norm": 0.4999959170818329, + "learning_rate": 9.879364219810226e-05, + "loss": 1.6102, + "step": 1536 + }, + { + "epoch": 0.08566969511175519, + "grad_norm": 0.5026562213897705, + "learning_rate": 9.879169727675991e-05, + "loss": 1.7124, + "step": 1537 + }, + { + "epoch": 0.08572543336491835, + "grad_norm": 0.5175659656524658, + "learning_rate": 9.87897508080294e-05, + "loss": 1.7585, + "step": 1538 + }, + { + "epoch": 0.08578117161808149, + "grad_norm": 0.5337525010108948, + "learning_rate": 9.878780279197247e-05, + "loss": 1.7857, + "step": 1539 + }, + { + "epoch": 0.08583690987124463, + "grad_norm": 0.5325166583061218, + "learning_rate": 9.878585322865087e-05, + "loss": 1.865, + "step": 1540 + }, + { + "epoch": 0.08589264812440778, + "grad_norm": 0.46590784192085266, + "learning_rate": 9.878390211812646e-05, + "loss": 1.627, + "step": 1541 + }, + { + "epoch": 0.08594838637757092, + "grad_norm": 0.4856724441051483, + "learning_rate": 9.87819494604611e-05, + "loss": 1.7221, + "step": 1542 + }, + { + "epoch": 0.08600412463073408, + "grad_norm": 0.5396975874900818, + "learning_rate": 9.877999525571673e-05, + "loss": 1.7696, + "step": 1543 + }, + { + "epoch": 0.08605986288389722, + "grad_norm": 0.49516481161117554, + "learning_rate": 9.87780395039553e-05, + "loss": 1.6928, + "step": 1544 + }, + { + "epoch": 0.08611560113706036, + "grad_norm": 0.5212313532829285, + "learning_rate": 9.877608220523886e-05, + "loss": 1.8461, + "step": 1545 + }, + { + "epoch": 0.0861713393902235, + "grad_norm": 0.5174347162246704, + "learning_rate": 9.877412335962948e-05, + "loss": 1.6598, + "step": 1546 + }, + { + "epoch": 0.08622707764338666, + "grad_norm": 0.5417358875274658, + "learning_rate": 9.877216296718929e-05, + "loss": 1.8449, + "step": 1547 + }, + { + "epoch": 0.0862828158965498, + "grad_norm": 0.6204573512077332, + "learning_rate": 9.877020102798044e-05, + "loss": 2.0521, + "step": 1548 + }, + { + "epoch": 0.08633855414971295, + "grad_norm": 0.548689067363739, + "learning_rate": 9.876823754206517e-05, + "loss": 1.8019, + "step": 1549 + }, + { + "epoch": 0.08639429240287609, + "grad_norm": 0.5634471774101257, + "learning_rate": 9.876627250950573e-05, + "loss": 1.9138, + "step": 1550 + }, + { + "epoch": 0.08645003065603923, + "grad_norm": 0.517440915107727, + "learning_rate": 9.876430593036445e-05, + "loss": 1.6576, + "step": 1551 + }, + { + "epoch": 0.08650576890920239, + "grad_norm": 0.5255969762802124, + "learning_rate": 9.876233780470373e-05, + "loss": 1.9165, + "step": 1552 + }, + { + "epoch": 0.08656150716236553, + "grad_norm": 0.5497751235961914, + "learning_rate": 9.876036813258593e-05, + "loss": 1.7924, + "step": 1553 + }, + { + "epoch": 0.08661724541552868, + "grad_norm": 0.49066075682640076, + "learning_rate": 9.875839691407355e-05, + "loss": 1.7025, + "step": 1554 + }, + { + "epoch": 0.08667298366869182, + "grad_norm": 0.5411027669906616, + "learning_rate": 9.875642414922913e-05, + "loss": 1.7742, + "step": 1555 + }, + { + "epoch": 0.08672872192185498, + "grad_norm": 0.5388767123222351, + "learning_rate": 9.875444983811517e-05, + "loss": 1.7676, + "step": 1556 + }, + { + "epoch": 0.08678446017501812, + "grad_norm": 0.540668249130249, + "learning_rate": 9.875247398079434e-05, + "loss": 1.7824, + "step": 1557 + }, + { + "epoch": 0.08684019842818126, + "grad_norm": 0.4785401523113251, + "learning_rate": 9.875049657732928e-05, + "loss": 1.5643, + "step": 1558 + }, + { + "epoch": 0.0868959366813444, + "grad_norm": 0.4758340120315552, + "learning_rate": 9.87485176277827e-05, + "loss": 1.7751, + "step": 1559 + }, + { + "epoch": 0.08695167493450755, + "grad_norm": 0.5260589122772217, + "learning_rate": 9.874653713221736e-05, + "loss": 1.6758, + "step": 1560 + }, + { + "epoch": 0.0870074131876707, + "grad_norm": 0.5716840624809265, + "learning_rate": 9.874455509069608e-05, + "loss": 1.9237, + "step": 1561 + }, + { + "epoch": 0.08706315144083385, + "grad_norm": 0.5434233546257019, + "learning_rate": 9.874257150328171e-05, + "loss": 1.8882, + "step": 1562 + }, + { + "epoch": 0.08711888969399699, + "grad_norm": 0.562435507774353, + "learning_rate": 9.874058637003715e-05, + "loss": 2.0451, + "step": 1563 + }, + { + "epoch": 0.08717462794716013, + "grad_norm": 0.5642979741096497, + "learning_rate": 9.87385996910254e-05, + "loss": 1.924, + "step": 1564 + }, + { + "epoch": 0.08723036620032328, + "grad_norm": 0.5052669048309326, + "learning_rate": 9.87366114663094e-05, + "loss": 1.58, + "step": 1565 + }, + { + "epoch": 0.08728610445348643, + "grad_norm": 0.5220628380775452, + "learning_rate": 9.873462169595225e-05, + "loss": 1.7895, + "step": 1566 + }, + { + "epoch": 0.08734184270664958, + "grad_norm": 0.517431378364563, + "learning_rate": 9.873263038001706e-05, + "loss": 1.6593, + "step": 1567 + }, + { + "epoch": 0.08739758095981272, + "grad_norm": 0.5140258073806763, + "learning_rate": 9.873063751856693e-05, + "loss": 1.8271, + "step": 1568 + }, + { + "epoch": 0.08745331921297586, + "grad_norm": 0.4922142028808594, + "learning_rate": 9.872864311166513e-05, + "loss": 1.6083, + "step": 1569 + }, + { + "epoch": 0.08750905746613902, + "grad_norm": 0.5390502214431763, + "learning_rate": 9.872664715937485e-05, + "loss": 1.4434, + "step": 1570 + }, + { + "epoch": 0.08756479571930216, + "grad_norm": 0.5033831596374512, + "learning_rate": 9.872464966175943e-05, + "loss": 1.7666, + "step": 1571 + }, + { + "epoch": 0.0876205339724653, + "grad_norm": 0.5968888401985168, + "learning_rate": 9.872265061888222e-05, + "loss": 2.129, + "step": 1572 + }, + { + "epoch": 0.08767627222562845, + "grad_norm": 0.4963712990283966, + "learning_rate": 9.87206500308066e-05, + "loss": 1.757, + "step": 1573 + }, + { + "epoch": 0.08773201047879159, + "grad_norm": 0.561555802822113, + "learning_rate": 9.871864789759602e-05, + "loss": 1.8953, + "step": 1574 + }, + { + "epoch": 0.08778774873195475, + "grad_norm": 0.5095016956329346, + "learning_rate": 9.871664421931397e-05, + "loss": 1.5125, + "step": 1575 + }, + { + "epoch": 0.08784348698511789, + "grad_norm": 0.5717408061027527, + "learning_rate": 9.8714638996024e-05, + "loss": 1.9326, + "step": 1576 + }, + { + "epoch": 0.08789922523828103, + "grad_norm": 0.5086256861686707, + "learning_rate": 9.871263222778972e-05, + "loss": 1.4956, + "step": 1577 + }, + { + "epoch": 0.08795496349144417, + "grad_norm": 0.5559898614883423, + "learning_rate": 9.871062391467476e-05, + "loss": 2.0481, + "step": 1578 + }, + { + "epoch": 0.08801070174460733, + "grad_norm": 0.511561930179596, + "learning_rate": 9.870861405674281e-05, + "loss": 1.6748, + "step": 1579 + }, + { + "epoch": 0.08806643999777047, + "grad_norm": 0.46475693583488464, + "learning_rate": 9.87066026540576e-05, + "loss": 1.5146, + "step": 1580 + }, + { + "epoch": 0.08812217825093362, + "grad_norm": 0.619973361492157, + "learning_rate": 9.870458970668295e-05, + "loss": 1.9752, + "step": 1581 + }, + { + "epoch": 0.08817791650409676, + "grad_norm": 0.5257066488265991, + "learning_rate": 9.870257521468267e-05, + "loss": 1.8943, + "step": 1582 + }, + { + "epoch": 0.0882336547572599, + "grad_norm": 0.48758870363235474, + "learning_rate": 9.870055917812066e-05, + "loss": 1.7243, + "step": 1583 + }, + { + "epoch": 0.08828939301042306, + "grad_norm": 0.500957190990448, + "learning_rate": 9.869854159706087e-05, + "loss": 1.608, + "step": 1584 + }, + { + "epoch": 0.0883451312635862, + "grad_norm": 0.5307281613349915, + "learning_rate": 9.869652247156726e-05, + "loss": 1.8326, + "step": 1585 + }, + { + "epoch": 0.08840086951674934, + "grad_norm": 0.5321508049964905, + "learning_rate": 9.869450180170388e-05, + "loss": 1.5715, + "step": 1586 + }, + { + "epoch": 0.08845660776991249, + "grad_norm": 0.512824296951294, + "learning_rate": 9.869247958753483e-05, + "loss": 1.9452, + "step": 1587 + }, + { + "epoch": 0.08851234602307563, + "grad_norm": 0.5297205448150635, + "learning_rate": 9.86904558291242e-05, + "loss": 1.7894, + "step": 1588 + }, + { + "epoch": 0.08856808427623879, + "grad_norm": 0.5388361215591431, + "learning_rate": 9.86884305265362e-05, + "loss": 1.8428, + "step": 1589 + }, + { + "epoch": 0.08862382252940193, + "grad_norm": 0.5642775297164917, + "learning_rate": 9.868640367983507e-05, + "loss": 1.9602, + "step": 1590 + }, + { + "epoch": 0.08867956078256507, + "grad_norm": 0.5613628029823303, + "learning_rate": 9.868437528908507e-05, + "loss": 1.8967, + "step": 1591 + }, + { + "epoch": 0.08873529903572822, + "grad_norm": 0.4843713641166687, + "learning_rate": 9.868234535435052e-05, + "loss": 1.5939, + "step": 1592 + }, + { + "epoch": 0.08879103728889137, + "grad_norm": 0.5549110770225525, + "learning_rate": 9.868031387569583e-05, + "loss": 1.7461, + "step": 1593 + }, + { + "epoch": 0.08884677554205452, + "grad_norm": 0.5344760417938232, + "learning_rate": 9.867828085318541e-05, + "loss": 1.7843, + "step": 1594 + }, + { + "epoch": 0.08890251379521766, + "grad_norm": 0.49532350897789, + "learning_rate": 9.867624628688374e-05, + "loss": 1.981, + "step": 1595 + }, + { + "epoch": 0.0889582520483808, + "grad_norm": 0.48208191990852356, + "learning_rate": 9.867421017685531e-05, + "loss": 1.3437, + "step": 1596 + }, + { + "epoch": 0.08901399030154394, + "grad_norm": 0.489444762468338, + "learning_rate": 9.867217252316476e-05, + "loss": 1.6426, + "step": 1597 + }, + { + "epoch": 0.0890697285547071, + "grad_norm": 0.5148588418960571, + "learning_rate": 9.867013332587667e-05, + "loss": 1.5808, + "step": 1598 + }, + { + "epoch": 0.08912546680787024, + "grad_norm": 0.5365609526634216, + "learning_rate": 9.86680925850557e-05, + "loss": 1.8197, + "step": 1599 + }, + { + "epoch": 0.08918120506103339, + "grad_norm": 0.48567450046539307, + "learning_rate": 9.86660503007666e-05, + "loss": 1.6238, + "step": 1600 + }, + { + "epoch": 0.08923694331419653, + "grad_norm": 0.515129029750824, + "learning_rate": 9.866400647307413e-05, + "loss": 1.8063, + "step": 1601 + }, + { + "epoch": 0.08929268156735969, + "grad_norm": 0.5591225028038025, + "learning_rate": 9.86619611020431e-05, + "loss": 1.8849, + "step": 1602 + }, + { + "epoch": 0.08934841982052283, + "grad_norm": 0.4950789213180542, + "learning_rate": 9.865991418773837e-05, + "loss": 1.5961, + "step": 1603 + }, + { + "epoch": 0.08940415807368597, + "grad_norm": 0.5623775124549866, + "learning_rate": 9.865786573022488e-05, + "loss": 1.782, + "step": 1604 + }, + { + "epoch": 0.08945989632684911, + "grad_norm": 0.5508179664611816, + "learning_rate": 9.865581572956759e-05, + "loss": 1.9102, + "step": 1605 + }, + { + "epoch": 0.08951563458001226, + "grad_norm": 0.5296784043312073, + "learning_rate": 9.86537641858315e-05, + "loss": 1.8494, + "step": 1606 + }, + { + "epoch": 0.08957137283317541, + "grad_norm": 0.5068146586418152, + "learning_rate": 9.865171109908169e-05, + "loss": 1.7515, + "step": 1607 + }, + { + "epoch": 0.08962711108633856, + "grad_norm": 0.5015462636947632, + "learning_rate": 9.864965646938326e-05, + "loss": 1.6874, + "step": 1608 + }, + { + "epoch": 0.0896828493395017, + "grad_norm": 0.5293746590614319, + "learning_rate": 9.864760029680137e-05, + "loss": 1.7417, + "step": 1609 + }, + { + "epoch": 0.08973858759266484, + "grad_norm": 0.5211681127548218, + "learning_rate": 9.864554258140124e-05, + "loss": 1.7553, + "step": 1610 + }, + { + "epoch": 0.08979432584582799, + "grad_norm": 0.7411361336708069, + "learning_rate": 9.864348332324811e-05, + "loss": 1.7663, + "step": 1611 + }, + { + "epoch": 0.08985006409899114, + "grad_norm": 0.4988972842693329, + "learning_rate": 9.864142252240731e-05, + "loss": 1.6, + "step": 1612 + }, + { + "epoch": 0.08990580235215428, + "grad_norm": 0.5340063571929932, + "learning_rate": 9.863936017894418e-05, + "loss": 1.8076, + "step": 1613 + }, + { + "epoch": 0.08996154060531743, + "grad_norm": 0.5994722247123718, + "learning_rate": 9.863729629292414e-05, + "loss": 1.7864, + "step": 1614 + }, + { + "epoch": 0.09001727885848057, + "grad_norm": 0.541131854057312, + "learning_rate": 9.863523086441264e-05, + "loss": 1.931, + "step": 1615 + }, + { + "epoch": 0.09007301711164373, + "grad_norm": 0.5259929299354553, + "learning_rate": 9.863316389347517e-05, + "loss": 1.7562, + "step": 1616 + }, + { + "epoch": 0.09012875536480687, + "grad_norm": 0.5242890119552612, + "learning_rate": 9.863109538017729e-05, + "loss": 1.6973, + "step": 1617 + }, + { + "epoch": 0.09018449361797001, + "grad_norm": 0.5834923386573792, + "learning_rate": 9.862902532458461e-05, + "loss": 2.0494, + "step": 1618 + }, + { + "epoch": 0.09024023187113316, + "grad_norm": 0.4912288188934326, + "learning_rate": 9.862695372676278e-05, + "loss": 1.6505, + "step": 1619 + }, + { + "epoch": 0.0902959701242963, + "grad_norm": 0.5288010239601135, + "learning_rate": 9.862488058677748e-05, + "loss": 1.734, + "step": 1620 + }, + { + "epoch": 0.09035170837745946, + "grad_norm": 0.5029554963111877, + "learning_rate": 9.862280590469448e-05, + "loss": 1.8098, + "step": 1621 + }, + { + "epoch": 0.0904074466306226, + "grad_norm": 0.531711995601654, + "learning_rate": 9.862072968057956e-05, + "loss": 1.8394, + "step": 1622 + }, + { + "epoch": 0.09046318488378574, + "grad_norm": 0.4818442165851593, + "learning_rate": 9.861865191449858e-05, + "loss": 1.6742, + "step": 1623 + }, + { + "epoch": 0.09051892313694888, + "grad_norm": 0.4834239184856415, + "learning_rate": 9.861657260651742e-05, + "loss": 1.6425, + "step": 1624 + }, + { + "epoch": 0.09057466139011204, + "grad_norm": 0.4923589825630188, + "learning_rate": 9.861449175670204e-05, + "loss": 1.5693, + "step": 1625 + }, + { + "epoch": 0.09063039964327518, + "grad_norm": 0.48194825649261475, + "learning_rate": 9.861240936511842e-05, + "loss": 1.6782, + "step": 1626 + }, + { + "epoch": 0.09068613789643833, + "grad_norm": 0.5542406439781189, + "learning_rate": 9.86103254318326e-05, + "loss": 1.9775, + "step": 1627 + }, + { + "epoch": 0.09074187614960147, + "grad_norm": 0.6013079881668091, + "learning_rate": 9.860823995691068e-05, + "loss": 1.9425, + "step": 1628 + }, + { + "epoch": 0.09079761440276461, + "grad_norm": 0.5376304984092712, + "learning_rate": 9.860615294041879e-05, + "loss": 1.6473, + "step": 1629 + }, + { + "epoch": 0.09085335265592777, + "grad_norm": 0.5485152006149292, + "learning_rate": 9.860406438242313e-05, + "loss": 1.6367, + "step": 1630 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 0.5142073035240173, + "learning_rate": 9.860197428298991e-05, + "loss": 1.7602, + "step": 1631 + }, + { + "epoch": 0.09096482916225405, + "grad_norm": 0.49521228671073914, + "learning_rate": 9.859988264218546e-05, + "loss": 1.546, + "step": 1632 + }, + { + "epoch": 0.0910205674154172, + "grad_norm": 0.5011737942695618, + "learning_rate": 9.859778946007608e-05, + "loss": 1.5578, + "step": 1633 + }, + { + "epoch": 0.09107630566858034, + "grad_norm": 0.4523265063762665, + "learning_rate": 9.859569473672816e-05, + "loss": 1.3888, + "step": 1634 + }, + { + "epoch": 0.0911320439217435, + "grad_norm": 0.48054036498069763, + "learning_rate": 9.859359847220815e-05, + "loss": 1.7516, + "step": 1635 + }, + { + "epoch": 0.09118778217490664, + "grad_norm": 0.5349341034889221, + "learning_rate": 9.85915006665825e-05, + "loss": 1.7055, + "step": 1636 + }, + { + "epoch": 0.09124352042806978, + "grad_norm": 0.5274312496185303, + "learning_rate": 9.858940131991777e-05, + "loss": 1.8203, + "step": 1637 + }, + { + "epoch": 0.09129925868123293, + "grad_norm": 0.4654419720172882, + "learning_rate": 9.85873004322805e-05, + "loss": 1.5783, + "step": 1638 + }, + { + "epoch": 0.09135499693439608, + "grad_norm": 0.5258073806762695, + "learning_rate": 9.858519800373738e-05, + "loss": 1.7707, + "step": 1639 + }, + { + "epoch": 0.09141073518755923, + "grad_norm": 0.4929850995540619, + "learning_rate": 9.858309403435501e-05, + "loss": 1.6027, + "step": 1640 + }, + { + "epoch": 0.09146647344072237, + "grad_norm": 0.5121711492538452, + "learning_rate": 9.85809885242002e-05, + "loss": 1.7874, + "step": 1641 + }, + { + "epoch": 0.09152221169388551, + "grad_norm": 0.4955439567565918, + "learning_rate": 9.857888147333965e-05, + "loss": 1.7223, + "step": 1642 + }, + { + "epoch": 0.09157794994704865, + "grad_norm": 0.519477903842926, + "learning_rate": 9.857677288184022e-05, + "loss": 1.8618, + "step": 1643 + }, + { + "epoch": 0.09163368820021181, + "grad_norm": 0.5247395038604736, + "learning_rate": 9.857466274976878e-05, + "loss": 1.761, + "step": 1644 + }, + { + "epoch": 0.09168942645337495, + "grad_norm": 0.4881756901741028, + "learning_rate": 9.857255107719225e-05, + "loss": 1.7272, + "step": 1645 + }, + { + "epoch": 0.0917451647065381, + "grad_norm": 0.5688063502311707, + "learning_rate": 9.857043786417759e-05, + "loss": 1.7532, + "step": 1646 + }, + { + "epoch": 0.09180090295970124, + "grad_norm": 0.531910240650177, + "learning_rate": 9.856832311079183e-05, + "loss": 1.9235, + "step": 1647 + }, + { + "epoch": 0.0918566412128644, + "grad_norm": 0.5271464586257935, + "learning_rate": 9.856620681710205e-05, + "loss": 1.8481, + "step": 1648 + }, + { + "epoch": 0.09191237946602754, + "grad_norm": 0.5019913911819458, + "learning_rate": 9.856408898317533e-05, + "loss": 1.7273, + "step": 1649 + }, + { + "epoch": 0.09196811771919068, + "grad_norm": 0.5375306010246277, + "learning_rate": 9.856196960907887e-05, + "loss": 1.8292, + "step": 1650 + }, + { + "epoch": 0.09202385597235382, + "grad_norm": 0.551287829875946, + "learning_rate": 9.855984869487985e-05, + "loss": 1.7672, + "step": 1651 + }, + { + "epoch": 0.09207959422551697, + "grad_norm": 0.5110806226730347, + "learning_rate": 9.855772624064557e-05, + "loss": 1.7338, + "step": 1652 + }, + { + "epoch": 0.09213533247868012, + "grad_norm": 0.5807773470878601, + "learning_rate": 9.855560224644332e-05, + "loss": 1.8558, + "step": 1653 + }, + { + "epoch": 0.09219107073184327, + "grad_norm": 0.5399064421653748, + "learning_rate": 9.855347671234045e-05, + "loss": 1.7338, + "step": 1654 + }, + { + "epoch": 0.09224680898500641, + "grad_norm": 0.5670611262321472, + "learning_rate": 9.855134963840441e-05, + "loss": 1.9314, + "step": 1655 + }, + { + "epoch": 0.09230254723816955, + "grad_norm": 0.49795302748680115, + "learning_rate": 9.854922102470262e-05, + "loss": 1.7196, + "step": 1656 + }, + { + "epoch": 0.0923582854913327, + "grad_norm": 0.5752295255661011, + "learning_rate": 9.85470908713026e-05, + "loss": 1.7249, + "step": 1657 + }, + { + "epoch": 0.09241402374449585, + "grad_norm": 0.4967830181121826, + "learning_rate": 9.854495917827191e-05, + "loss": 1.7368, + "step": 1658 + }, + { + "epoch": 0.092469761997659, + "grad_norm": 0.4957406520843506, + "learning_rate": 9.854282594567816e-05, + "loss": 1.8287, + "step": 1659 + }, + { + "epoch": 0.09252550025082214, + "grad_norm": 0.49035385251045227, + "learning_rate": 9.854069117358899e-05, + "loss": 1.743, + "step": 1660 + }, + { + "epoch": 0.09258123850398528, + "grad_norm": 0.5366220474243164, + "learning_rate": 9.853855486207211e-05, + "loss": 1.7903, + "step": 1661 + }, + { + "epoch": 0.09263697675714844, + "grad_norm": 0.5238292217254639, + "learning_rate": 9.853641701119525e-05, + "loss": 1.6038, + "step": 1662 + }, + { + "epoch": 0.09269271501031158, + "grad_norm": 0.507854700088501, + "learning_rate": 9.853427762102625e-05, + "loss": 1.7459, + "step": 1663 + }, + { + "epoch": 0.09274845326347472, + "grad_norm": 0.5182837247848511, + "learning_rate": 9.853213669163293e-05, + "loss": 1.7409, + "step": 1664 + }, + { + "epoch": 0.09280419151663787, + "grad_norm": 0.5023046135902405, + "learning_rate": 9.852999422308319e-05, + "loss": 1.8207, + "step": 1665 + }, + { + "epoch": 0.09285992976980101, + "grad_norm": 0.6185427308082581, + "learning_rate": 9.852785021544499e-05, + "loss": 1.9794, + "step": 1666 + }, + { + "epoch": 0.09291566802296417, + "grad_norm": 0.5567124485969543, + "learning_rate": 9.852570466878632e-05, + "loss": 1.8052, + "step": 1667 + }, + { + "epoch": 0.09297140627612731, + "grad_norm": 0.5299728512763977, + "learning_rate": 9.852355758317523e-05, + "loss": 1.6414, + "step": 1668 + }, + { + "epoch": 0.09302714452929045, + "grad_norm": 0.47446316480636597, + "learning_rate": 9.85214089586798e-05, + "loss": 1.561, + "step": 1669 + }, + { + "epoch": 0.0930828827824536, + "grad_norm": 0.5260158181190491, + "learning_rate": 9.851925879536817e-05, + "loss": 1.7192, + "step": 1670 + }, + { + "epoch": 0.09313862103561675, + "grad_norm": 0.5200673341751099, + "learning_rate": 9.851710709330855e-05, + "loss": 1.6869, + "step": 1671 + }, + { + "epoch": 0.0931943592887799, + "grad_norm": 0.5707138180732727, + "learning_rate": 9.851495385256915e-05, + "loss": 1.7307, + "step": 1672 + }, + { + "epoch": 0.09325009754194304, + "grad_norm": 0.6008026003837585, + "learning_rate": 9.851279907321829e-05, + "loss": 1.8593, + "step": 1673 + }, + { + "epoch": 0.09330583579510618, + "grad_norm": 0.4921055734157562, + "learning_rate": 9.851064275532428e-05, + "loss": 1.7155, + "step": 1674 + }, + { + "epoch": 0.09336157404826932, + "grad_norm": 0.48389917612075806, + "learning_rate": 9.850848489895553e-05, + "loss": 1.7011, + "step": 1675 + }, + { + "epoch": 0.09341731230143248, + "grad_norm": 0.6712982058525085, + "learning_rate": 9.850632550418046e-05, + "loss": 1.8851, + "step": 1676 + }, + { + "epoch": 0.09347305055459562, + "grad_norm": 0.49884751439094543, + "learning_rate": 9.850416457106755e-05, + "loss": 1.7392, + "step": 1677 + }, + { + "epoch": 0.09352878880775876, + "grad_norm": 0.5436164736747742, + "learning_rate": 9.850200209968535e-05, + "loss": 1.8583, + "step": 1678 + }, + { + "epoch": 0.09358452706092191, + "grad_norm": 0.543387234210968, + "learning_rate": 9.849983809010242e-05, + "loss": 1.9008, + "step": 1679 + }, + { + "epoch": 0.09364026531408505, + "grad_norm": 0.5220986604690552, + "learning_rate": 9.849767254238741e-05, + "loss": 1.8536, + "step": 1680 + }, + { + "epoch": 0.0936960035672482, + "grad_norm": 0.5086224675178528, + "learning_rate": 9.849550545660898e-05, + "loss": 1.6492, + "step": 1681 + }, + { + "epoch": 0.09375174182041135, + "grad_norm": 0.5263844728469849, + "learning_rate": 9.849333683283587e-05, + "loss": 1.8646, + "step": 1682 + }, + { + "epoch": 0.09380748007357449, + "grad_norm": 0.48118674755096436, + "learning_rate": 9.849116667113684e-05, + "loss": 1.6978, + "step": 1683 + }, + { + "epoch": 0.09386321832673764, + "grad_norm": 0.5442405939102173, + "learning_rate": 9.848899497158075e-05, + "loss": 1.7446, + "step": 1684 + }, + { + "epoch": 0.09391895657990079, + "grad_norm": 0.5518308877944946, + "learning_rate": 9.848682173423642e-05, + "loss": 1.9409, + "step": 1685 + }, + { + "epoch": 0.09397469483306393, + "grad_norm": 0.5064495205879211, + "learning_rate": 9.848464695917283e-05, + "loss": 1.9023, + "step": 1686 + }, + { + "epoch": 0.09403043308622708, + "grad_norm": 0.5437746644020081, + "learning_rate": 9.84824706464589e-05, + "loss": 1.8456, + "step": 1687 + }, + { + "epoch": 0.09408617133939022, + "grad_norm": 0.4933926463127136, + "learning_rate": 9.848029279616369e-05, + "loss": 1.6156, + "step": 1688 + }, + { + "epoch": 0.09414190959255336, + "grad_norm": 0.5288189649581909, + "learning_rate": 9.847811340835625e-05, + "loss": 1.8053, + "step": 1689 + }, + { + "epoch": 0.09419764784571652, + "grad_norm": 0.5238629579544067, + "learning_rate": 9.847593248310569e-05, + "loss": 1.8396, + "step": 1690 + }, + { + "epoch": 0.09425338609887966, + "grad_norm": 0.5135747790336609, + "learning_rate": 9.847375002048119e-05, + "loss": 1.702, + "step": 1691 + }, + { + "epoch": 0.0943091243520428, + "grad_norm": 0.48049938678741455, + "learning_rate": 9.847156602055196e-05, + "loss": 1.7258, + "step": 1692 + }, + { + "epoch": 0.09436486260520595, + "grad_norm": 0.5790214538574219, + "learning_rate": 9.846938048338728e-05, + "loss": 1.9521, + "step": 1693 + }, + { + "epoch": 0.0944206008583691, + "grad_norm": 0.49259278178215027, + "learning_rate": 9.846719340905643e-05, + "loss": 1.7358, + "step": 1694 + }, + { + "epoch": 0.09447633911153225, + "grad_norm": 0.5396574139595032, + "learning_rate": 9.846500479762879e-05, + "loss": 1.9847, + "step": 1695 + }, + { + "epoch": 0.09453207736469539, + "grad_norm": 0.5003666877746582, + "learning_rate": 9.846281464917377e-05, + "loss": 1.777, + "step": 1696 + }, + { + "epoch": 0.09458781561785853, + "grad_norm": 0.5158617496490479, + "learning_rate": 9.846062296376083e-05, + "loss": 1.6861, + "step": 1697 + }, + { + "epoch": 0.09464355387102168, + "grad_norm": 0.5154086351394653, + "learning_rate": 9.845842974145947e-05, + "loss": 1.8176, + "step": 1698 + }, + { + "epoch": 0.09469929212418483, + "grad_norm": 0.5052759051322937, + "learning_rate": 9.845623498233926e-05, + "loss": 1.6658, + "step": 1699 + }, + { + "epoch": 0.09475503037734798, + "grad_norm": 0.6677058339118958, + "learning_rate": 9.845403868646979e-05, + "loss": 1.7287, + "step": 1700 + }, + { + "epoch": 0.09481076863051112, + "grad_norm": 0.5167236924171448, + "learning_rate": 9.845184085392072e-05, + "loss": 1.6861, + "step": 1701 + }, + { + "epoch": 0.09486650688367426, + "grad_norm": 0.57721346616745, + "learning_rate": 9.844964148476175e-05, + "loss": 1.9309, + "step": 1702 + }, + { + "epoch": 0.0949222451368374, + "grad_norm": 0.4876415729522705, + "learning_rate": 9.844744057906263e-05, + "loss": 1.738, + "step": 1703 + }, + { + "epoch": 0.09497798339000056, + "grad_norm": 0.5089074373245239, + "learning_rate": 9.844523813689316e-05, + "loss": 1.8729, + "step": 1704 + }, + { + "epoch": 0.0950337216431637, + "grad_norm": 0.5102959871292114, + "learning_rate": 9.844303415832322e-05, + "loss": 1.901, + "step": 1705 + }, + { + "epoch": 0.09508945989632685, + "grad_norm": 0.5445943474769592, + "learning_rate": 9.844082864342265e-05, + "loss": 1.7838, + "step": 1706 + }, + { + "epoch": 0.09514519814948999, + "grad_norm": 0.5227236151695251, + "learning_rate": 9.843862159226142e-05, + "loss": 1.7044, + "step": 1707 + }, + { + "epoch": 0.09520093640265315, + "grad_norm": 0.5036524534225464, + "learning_rate": 9.843641300490956e-05, + "loss": 1.6637, + "step": 1708 + }, + { + "epoch": 0.09525667465581629, + "grad_norm": 0.5071728825569153, + "learning_rate": 9.843420288143706e-05, + "loss": 1.5714, + "step": 1709 + }, + { + "epoch": 0.09531241290897943, + "grad_norm": 0.563736081123352, + "learning_rate": 9.843199122191404e-05, + "loss": 2.0123, + "step": 1710 + }, + { + "epoch": 0.09536815116214258, + "grad_norm": 0.5531306266784668, + "learning_rate": 9.842977802641065e-05, + "loss": 1.74, + "step": 1711 + }, + { + "epoch": 0.09542388941530572, + "grad_norm": 0.5610520243644714, + "learning_rate": 9.842756329499704e-05, + "loss": 1.8003, + "step": 1712 + }, + { + "epoch": 0.09547962766846887, + "grad_norm": 0.498121440410614, + "learning_rate": 9.842534702774349e-05, + "loss": 1.6448, + "step": 1713 + }, + { + "epoch": 0.09553536592163202, + "grad_norm": 0.5231457948684692, + "learning_rate": 9.842312922472028e-05, + "loss": 1.8862, + "step": 1714 + }, + { + "epoch": 0.09559110417479516, + "grad_norm": 0.520879864692688, + "learning_rate": 9.842090988599772e-05, + "loss": 1.7858, + "step": 1715 + }, + { + "epoch": 0.0956468424279583, + "grad_norm": 0.5959715247154236, + "learning_rate": 9.841868901164622e-05, + "loss": 1.8487, + "step": 1716 + }, + { + "epoch": 0.09570258068112146, + "grad_norm": 0.5337534546852112, + "learning_rate": 9.84164666017362e-05, + "loss": 1.5147, + "step": 1717 + }, + { + "epoch": 0.0957583189342846, + "grad_norm": 0.5244635939598083, + "learning_rate": 9.841424265633816e-05, + "loss": 1.9583, + "step": 1718 + }, + { + "epoch": 0.09581405718744775, + "grad_norm": 0.5573442578315735, + "learning_rate": 9.84120171755226e-05, + "loss": 1.7111, + "step": 1719 + }, + { + "epoch": 0.09586979544061089, + "grad_norm": 0.5416032671928406, + "learning_rate": 9.840979015936014e-05, + "loss": 1.9152, + "step": 1720 + }, + { + "epoch": 0.09592553369377403, + "grad_norm": 0.5546048283576965, + "learning_rate": 9.840756160792138e-05, + "loss": 1.7902, + "step": 1721 + }, + { + "epoch": 0.09598127194693719, + "grad_norm": 0.5208713412284851, + "learning_rate": 9.840533152127697e-05, + "loss": 1.864, + "step": 1722 + }, + { + "epoch": 0.09603701020010033, + "grad_norm": 0.5275363326072693, + "learning_rate": 9.840309989949769e-05, + "loss": 1.7866, + "step": 1723 + }, + { + "epoch": 0.09609274845326347, + "grad_norm": 0.5389683246612549, + "learning_rate": 9.84008667426543e-05, + "loss": 1.8186, + "step": 1724 + }, + { + "epoch": 0.09614848670642662, + "grad_norm": 0.5352590680122375, + "learning_rate": 9.839863205081761e-05, + "loss": 1.8207, + "step": 1725 + }, + { + "epoch": 0.09620422495958976, + "grad_norm": 0.5303811430931091, + "learning_rate": 9.839639582405849e-05, + "loss": 1.8912, + "step": 1726 + }, + { + "epoch": 0.09625996321275292, + "grad_norm": 0.4606251120567322, + "learning_rate": 9.839415806244785e-05, + "loss": 1.6001, + "step": 1727 + }, + { + "epoch": 0.09631570146591606, + "grad_norm": 0.48041149973869324, + "learning_rate": 9.839191876605668e-05, + "loss": 1.6385, + "step": 1728 + }, + { + "epoch": 0.0963714397190792, + "grad_norm": 0.5307428240776062, + "learning_rate": 9.838967793495601e-05, + "loss": 1.8683, + "step": 1729 + }, + { + "epoch": 0.09642717797224234, + "grad_norm": 0.48561206459999084, + "learning_rate": 9.838743556921688e-05, + "loss": 1.7169, + "step": 1730 + }, + { + "epoch": 0.0964829162254055, + "grad_norm": 0.5501610040664673, + "learning_rate": 9.83851916689104e-05, + "loss": 1.7714, + "step": 1731 + }, + { + "epoch": 0.09653865447856864, + "grad_norm": 0.5766540765762329, + "learning_rate": 9.838294623410776e-05, + "loss": 1.961, + "step": 1732 + }, + { + "epoch": 0.09659439273173179, + "grad_norm": 0.5572078824043274, + "learning_rate": 9.838069926488016e-05, + "loss": 1.9466, + "step": 1733 + }, + { + "epoch": 0.09665013098489493, + "grad_norm": 0.5235105156898499, + "learning_rate": 9.837845076129885e-05, + "loss": 1.6369, + "step": 1734 + }, + { + "epoch": 0.09670586923805807, + "grad_norm": 0.49561917781829834, + "learning_rate": 9.837620072343514e-05, + "loss": 1.6879, + "step": 1735 + }, + { + "epoch": 0.09676160749122123, + "grad_norm": 0.577617883682251, + "learning_rate": 9.83739491513604e-05, + "loss": 2.0888, + "step": 1736 + }, + { + "epoch": 0.09681734574438437, + "grad_norm": 0.559758722782135, + "learning_rate": 9.837169604514605e-05, + "loss": 2.0155, + "step": 1737 + }, + { + "epoch": 0.09687308399754752, + "grad_norm": 0.4803854525089264, + "learning_rate": 9.83694414048635e-05, + "loss": 1.7143, + "step": 1738 + }, + { + "epoch": 0.09692882225071066, + "grad_norm": 0.5286114811897278, + "learning_rate": 9.83671852305843e-05, + "loss": 1.7708, + "step": 1739 + }, + { + "epoch": 0.09698456050387382, + "grad_norm": 0.5186529159545898, + "learning_rate": 9.836492752237998e-05, + "loss": 1.8367, + "step": 1740 + }, + { + "epoch": 0.09704029875703696, + "grad_norm": 0.5168614983558655, + "learning_rate": 9.836266828032214e-05, + "loss": 1.6913, + "step": 1741 + }, + { + "epoch": 0.0970960370102001, + "grad_norm": 0.5508823990821838, + "learning_rate": 9.836040750448246e-05, + "loss": 1.8108, + "step": 1742 + }, + { + "epoch": 0.09715177526336324, + "grad_norm": 0.5152462720870972, + "learning_rate": 9.835814519493258e-05, + "loss": 1.7643, + "step": 1743 + }, + { + "epoch": 0.09720751351652639, + "grad_norm": 0.5197470188140869, + "learning_rate": 9.835588135174432e-05, + "loss": 1.753, + "step": 1744 + }, + { + "epoch": 0.09726325176968954, + "grad_norm": 0.5595375895500183, + "learning_rate": 9.83536159749894e-05, + "loss": 1.9646, + "step": 1745 + }, + { + "epoch": 0.09731899002285269, + "grad_norm": 0.5276100635528564, + "learning_rate": 9.835134906473973e-05, + "loss": 1.8053, + "step": 1746 + }, + { + "epoch": 0.09737472827601583, + "grad_norm": 0.543694257736206, + "learning_rate": 9.834908062106716e-05, + "loss": 1.9073, + "step": 1747 + }, + { + "epoch": 0.09743046652917897, + "grad_norm": 0.5280660390853882, + "learning_rate": 9.834681064404366e-05, + "loss": 1.8642, + "step": 1748 + }, + { + "epoch": 0.09748620478234211, + "grad_norm": 0.5228556394577026, + "learning_rate": 9.83445391337412e-05, + "loss": 1.7084, + "step": 1749 + }, + { + "epoch": 0.09754194303550527, + "grad_norm": 0.5147905349731445, + "learning_rate": 9.834226609023183e-05, + "loss": 1.7273, + "step": 1750 + }, + { + "epoch": 0.09759768128866841, + "grad_norm": 0.6363779306411743, + "learning_rate": 9.833999151358763e-05, + "loss": 2.3455, + "step": 1751 + }, + { + "epoch": 0.09765341954183156, + "grad_norm": 0.4779658317565918, + "learning_rate": 9.833771540388074e-05, + "loss": 1.5965, + "step": 1752 + }, + { + "epoch": 0.0977091577949947, + "grad_norm": 0.5493218302726746, + "learning_rate": 9.833543776118334e-05, + "loss": 1.7655, + "step": 1753 + }, + { + "epoch": 0.09776489604815786, + "grad_norm": 0.5027639865875244, + "learning_rate": 9.833315858556769e-05, + "loss": 1.6425, + "step": 1754 + }, + { + "epoch": 0.097820634301321, + "grad_norm": 0.5259470343589783, + "learning_rate": 9.833087787710604e-05, + "loss": 1.8848, + "step": 1755 + }, + { + "epoch": 0.09787637255448414, + "grad_norm": 0.5296250581741333, + "learning_rate": 9.832859563587073e-05, + "loss": 1.6713, + "step": 1756 + }, + { + "epoch": 0.09793211080764729, + "grad_norm": 0.5273899435997009, + "learning_rate": 9.832631186193414e-05, + "loss": 1.7833, + "step": 1757 + }, + { + "epoch": 0.09798784906081043, + "grad_norm": 0.5987624526023865, + "learning_rate": 9.832402655536869e-05, + "loss": 2.0934, + "step": 1758 + }, + { + "epoch": 0.09804358731397358, + "grad_norm": 0.5442295074462891, + "learning_rate": 9.83217397162469e-05, + "loss": 1.6506, + "step": 1759 + }, + { + "epoch": 0.09809932556713673, + "grad_norm": 0.6511545181274414, + "learning_rate": 9.831945134464123e-05, + "loss": 2.1311, + "step": 1760 + }, + { + "epoch": 0.09815506382029987, + "grad_norm": 0.5505144596099854, + "learning_rate": 9.831716144062431e-05, + "loss": 1.7606, + "step": 1761 + }, + { + "epoch": 0.09821080207346301, + "grad_norm": 0.5241886973381042, + "learning_rate": 9.831487000426871e-05, + "loss": 1.7404, + "step": 1762 + }, + { + "epoch": 0.09826654032662617, + "grad_norm": 0.5306397080421448, + "learning_rate": 9.831257703564715e-05, + "loss": 1.7232, + "step": 1763 + }, + { + "epoch": 0.09832227857978931, + "grad_norm": 0.5829235315322876, + "learning_rate": 9.831028253483232e-05, + "loss": 1.8867, + "step": 1764 + }, + { + "epoch": 0.09837801683295246, + "grad_norm": 0.5258575677871704, + "learning_rate": 9.8307986501897e-05, + "loss": 1.6442, + "step": 1765 + }, + { + "epoch": 0.0984337550861156, + "grad_norm": 0.5493606328964233, + "learning_rate": 9.8305688936914e-05, + "loss": 2.025, + "step": 1766 + }, + { + "epoch": 0.09848949333927874, + "grad_norm": 0.5285725593566895, + "learning_rate": 9.83033898399562e-05, + "loss": 1.683, + "step": 1767 + }, + { + "epoch": 0.0985452315924419, + "grad_norm": 0.590203046798706, + "learning_rate": 9.830108921109648e-05, + "loss": 2.0356, + "step": 1768 + }, + { + "epoch": 0.09860096984560504, + "grad_norm": 0.47736695408821106, + "learning_rate": 9.829878705040784e-05, + "loss": 1.2685, + "step": 1769 + }, + { + "epoch": 0.09865670809876818, + "grad_norm": 0.5433778762817383, + "learning_rate": 9.829648335796327e-05, + "loss": 1.5734, + "step": 1770 + }, + { + "epoch": 0.09871244635193133, + "grad_norm": 0.533301591873169, + "learning_rate": 9.829417813383584e-05, + "loss": 1.6253, + "step": 1771 + }, + { + "epoch": 0.09876818460509447, + "grad_norm": 0.5619016289710999, + "learning_rate": 9.829187137809865e-05, + "loss": 1.9336, + "step": 1772 + }, + { + "epoch": 0.09882392285825763, + "grad_norm": 0.5166584849357605, + "learning_rate": 9.828956309082487e-05, + "loss": 1.6934, + "step": 1773 + }, + { + "epoch": 0.09887966111142077, + "grad_norm": 0.550294041633606, + "learning_rate": 9.828725327208769e-05, + "loss": 1.7357, + "step": 1774 + }, + { + "epoch": 0.09893539936458391, + "grad_norm": 0.5708268880844116, + "learning_rate": 9.828494192196037e-05, + "loss": 1.75, + "step": 1775 + }, + { + "epoch": 0.09899113761774705, + "grad_norm": 0.5142853856086731, + "learning_rate": 9.828262904051621e-05, + "loss": 1.8905, + "step": 1776 + }, + { + "epoch": 0.09904687587091021, + "grad_norm": 0.5133590698242188, + "learning_rate": 9.828031462782858e-05, + "loss": 1.7111, + "step": 1777 + }, + { + "epoch": 0.09910261412407335, + "grad_norm": 0.491804838180542, + "learning_rate": 9.827799868397086e-05, + "loss": 1.7898, + "step": 1778 + }, + { + "epoch": 0.0991583523772365, + "grad_norm": 0.5558345913887024, + "learning_rate": 9.827568120901649e-05, + "loss": 1.8621, + "step": 1779 + }, + { + "epoch": 0.09921409063039964, + "grad_norm": 0.5390424132347107, + "learning_rate": 9.827336220303898e-05, + "loss": 1.5574, + "step": 1780 + }, + { + "epoch": 0.09926982888356278, + "grad_norm": 0.5201495885848999, + "learning_rate": 9.827104166611188e-05, + "loss": 1.7218, + "step": 1781 + }, + { + "epoch": 0.09932556713672594, + "grad_norm": 0.49533358216285706, + "learning_rate": 9.826871959830877e-05, + "loss": 1.6587, + "step": 1782 + }, + { + "epoch": 0.09938130538988908, + "grad_norm": 0.5522517561912537, + "learning_rate": 9.826639599970331e-05, + "loss": 1.9942, + "step": 1783 + }, + { + "epoch": 0.09943704364305223, + "grad_norm": 0.5211175680160522, + "learning_rate": 9.826407087036918e-05, + "loss": 1.7953, + "step": 1784 + }, + { + "epoch": 0.09949278189621537, + "grad_norm": 0.5591548681259155, + "learning_rate": 9.82617442103801e-05, + "loss": 1.7257, + "step": 1785 + }, + { + "epoch": 0.09954852014937852, + "grad_norm": 0.5057593584060669, + "learning_rate": 9.82594160198099e-05, + "loss": 1.6209, + "step": 1786 + }, + { + "epoch": 0.09960425840254167, + "grad_norm": 0.4974839389324188, + "learning_rate": 9.82570862987324e-05, + "loss": 1.7242, + "step": 1787 + }, + { + "epoch": 0.09965999665570481, + "grad_norm": 0.580697238445282, + "learning_rate": 9.825475504722147e-05, + "loss": 1.8402, + "step": 1788 + }, + { + "epoch": 0.09971573490886795, + "grad_norm": 0.5298492908477783, + "learning_rate": 9.825242226535106e-05, + "loss": 1.5434, + "step": 1789 + }, + { + "epoch": 0.0997714731620311, + "grad_norm": 0.5714828372001648, + "learning_rate": 9.825008795319514e-05, + "loss": 1.8505, + "step": 1790 + }, + { + "epoch": 0.09982721141519425, + "grad_norm": 0.5840202569961548, + "learning_rate": 9.824775211082776e-05, + "loss": 1.9345, + "step": 1791 + }, + { + "epoch": 0.0998829496683574, + "grad_norm": 0.495969295501709, + "learning_rate": 9.824541473832298e-05, + "loss": 1.6482, + "step": 1792 + }, + { + "epoch": 0.09993868792152054, + "grad_norm": 0.537111759185791, + "learning_rate": 9.824307583575494e-05, + "loss": 1.6791, + "step": 1793 + }, + { + "epoch": 0.09999442617468368, + "grad_norm": 0.5053449869155884, + "learning_rate": 9.82407354031978e-05, + "loss": 1.6764, + "step": 1794 + }, + { + "epoch": 0.10005016442784682, + "grad_norm": 0.5327693223953247, + "learning_rate": 9.82383934407258e-05, + "loss": 1.7993, + "step": 1795 + }, + { + "epoch": 0.10010590268100998, + "grad_norm": 0.49914291501045227, + "learning_rate": 9.823604994841322e-05, + "loss": 1.9674, + "step": 1796 + }, + { + "epoch": 0.10016164093417312, + "grad_norm": 0.5144324898719788, + "learning_rate": 9.823370492633435e-05, + "loss": 1.7585, + "step": 1797 + }, + { + "epoch": 0.10021737918733627, + "grad_norm": 0.5108045935630798, + "learning_rate": 9.823135837456362e-05, + "loss": 1.7215, + "step": 1798 + }, + { + "epoch": 0.10027311744049941, + "grad_norm": 0.5693103671073914, + "learning_rate": 9.822901029317537e-05, + "loss": 1.7812, + "step": 1799 + }, + { + "epoch": 0.10032885569366257, + "grad_norm": 0.49847400188446045, + "learning_rate": 9.822666068224412e-05, + "loss": 1.6675, + "step": 1800 + }, + { + "epoch": 0.10038459394682571, + "grad_norm": 0.5565662384033203, + "learning_rate": 9.822430954184439e-05, + "loss": 1.8071, + "step": 1801 + }, + { + "epoch": 0.10044033219998885, + "grad_norm": 0.5412677526473999, + "learning_rate": 9.82219568720507e-05, + "loss": 1.7311, + "step": 1802 + }, + { + "epoch": 0.100496070453152, + "grad_norm": 0.5256420373916626, + "learning_rate": 9.821960267293771e-05, + "loss": 1.8179, + "step": 1803 + }, + { + "epoch": 0.10055180870631514, + "grad_norm": 0.486968457698822, + "learning_rate": 9.821724694458006e-05, + "loss": 1.7443, + "step": 1804 + }, + { + "epoch": 0.1006075469594783, + "grad_norm": 0.5230684280395508, + "learning_rate": 9.821488968705246e-05, + "loss": 1.8426, + "step": 1805 + }, + { + "epoch": 0.10066328521264144, + "grad_norm": 0.5057176351547241, + "learning_rate": 9.821253090042967e-05, + "loss": 1.6857, + "step": 1806 + }, + { + "epoch": 0.10071902346580458, + "grad_norm": 0.5477109551429749, + "learning_rate": 9.821017058478653e-05, + "loss": 1.904, + "step": 1807 + }, + { + "epoch": 0.10077476171896772, + "grad_norm": 0.5054430961608887, + "learning_rate": 9.820780874019782e-05, + "loss": 1.8538, + "step": 1808 + }, + { + "epoch": 0.10083049997213088, + "grad_norm": 0.5614181160926819, + "learning_rate": 9.82054453667385e-05, + "loss": 1.9318, + "step": 1809 + }, + { + "epoch": 0.10088623822529402, + "grad_norm": 0.49829983711242676, + "learning_rate": 9.820308046448353e-05, + "loss": 1.6044, + "step": 1810 + }, + { + "epoch": 0.10094197647845717, + "grad_norm": 0.53876793384552, + "learning_rate": 9.820071403350787e-05, + "loss": 1.7234, + "step": 1811 + }, + { + "epoch": 0.10099771473162031, + "grad_norm": 0.5352075695991516, + "learning_rate": 9.81983460738866e-05, + "loss": 1.7911, + "step": 1812 + }, + { + "epoch": 0.10105345298478345, + "grad_norm": 0.5328055024147034, + "learning_rate": 9.819597658569479e-05, + "loss": 1.8147, + "step": 1813 + }, + { + "epoch": 0.10110919123794661, + "grad_norm": 0.5261515378952026, + "learning_rate": 9.819360556900763e-05, + "loss": 1.8057, + "step": 1814 + }, + { + "epoch": 0.10116492949110975, + "grad_norm": 0.5476046204566956, + "learning_rate": 9.819123302390027e-05, + "loss": 1.7813, + "step": 1815 + }, + { + "epoch": 0.1012206677442729, + "grad_norm": 0.5293675661087036, + "learning_rate": 9.818885895044799e-05, + "loss": 1.7398, + "step": 1816 + }, + { + "epoch": 0.10127640599743604, + "grad_norm": 0.6075041890144348, + "learning_rate": 9.818648334872607e-05, + "loss": 1.985, + "step": 1817 + }, + { + "epoch": 0.10133214425059918, + "grad_norm": 0.5815473794937134, + "learning_rate": 9.818410621880982e-05, + "loss": 1.7932, + "step": 1818 + }, + { + "epoch": 0.10138788250376234, + "grad_norm": 0.546378493309021, + "learning_rate": 9.818172756077466e-05, + "loss": 1.8672, + "step": 1819 + }, + { + "epoch": 0.10144362075692548, + "grad_norm": 0.5089141130447388, + "learning_rate": 9.817934737469603e-05, + "loss": 1.4847, + "step": 1820 + }, + { + "epoch": 0.10149935901008862, + "grad_norm": 0.5070534348487854, + "learning_rate": 9.81769656606494e-05, + "loss": 1.6301, + "step": 1821 + }, + { + "epoch": 0.10155509726325176, + "grad_norm": 0.5128391981124878, + "learning_rate": 9.817458241871032e-05, + "loss": 1.8199, + "step": 1822 + }, + { + "epoch": 0.10161083551641492, + "grad_norm": 0.5569765567779541, + "learning_rate": 9.817219764895435e-05, + "loss": 1.7238, + "step": 1823 + }, + { + "epoch": 0.10166657376957806, + "grad_norm": 0.5038780570030212, + "learning_rate": 9.816981135145714e-05, + "loss": 1.7099, + "step": 1824 + }, + { + "epoch": 0.10172231202274121, + "grad_norm": 0.5122333765029907, + "learning_rate": 9.816742352629437e-05, + "loss": 1.7679, + "step": 1825 + }, + { + "epoch": 0.10177805027590435, + "grad_norm": 0.5544700026512146, + "learning_rate": 9.816503417354174e-05, + "loss": 2.0049, + "step": 1826 + }, + { + "epoch": 0.10183378852906749, + "grad_norm": 0.5663131475448608, + "learning_rate": 9.816264329327507e-05, + "loss": 1.7042, + "step": 1827 + }, + { + "epoch": 0.10188952678223065, + "grad_norm": 0.5186511278152466, + "learning_rate": 9.816025088557015e-05, + "loss": 1.7472, + "step": 1828 + }, + { + "epoch": 0.10194526503539379, + "grad_norm": 0.5595180988311768, + "learning_rate": 9.815785695050288e-05, + "loss": 1.6525, + "step": 1829 + }, + { + "epoch": 0.10200100328855694, + "grad_norm": 0.49748462438583374, + "learning_rate": 9.815546148814915e-05, + "loss": 1.6744, + "step": 1830 + }, + { + "epoch": 0.10205674154172008, + "grad_norm": 0.47154897451400757, + "learning_rate": 9.815306449858497e-05, + "loss": 1.6183, + "step": 1831 + }, + { + "epoch": 0.10211247979488323, + "grad_norm": 0.5415584444999695, + "learning_rate": 9.815066598188631e-05, + "loss": 1.842, + "step": 1832 + }, + { + "epoch": 0.10216821804804638, + "grad_norm": 0.5106571912765503, + "learning_rate": 9.814826593812928e-05, + "loss": 1.6504, + "step": 1833 + }, + { + "epoch": 0.10222395630120952, + "grad_norm": 0.5451028347015381, + "learning_rate": 9.814586436738998e-05, + "loss": 1.8817, + "step": 1834 + }, + { + "epoch": 0.10227969455437266, + "grad_norm": 0.5032516121864319, + "learning_rate": 9.814346126974455e-05, + "loss": 1.8143, + "step": 1835 + }, + { + "epoch": 0.1023354328075358, + "grad_norm": 0.4844000041484833, + "learning_rate": 9.814105664526925e-05, + "loss": 1.8255, + "step": 1836 + }, + { + "epoch": 0.10239117106069896, + "grad_norm": 0.8231089115142822, + "learning_rate": 9.81386504940403e-05, + "loss": 1.5754, + "step": 1837 + }, + { + "epoch": 0.1024469093138621, + "grad_norm": 0.5142394304275513, + "learning_rate": 9.813624281613403e-05, + "loss": 1.7516, + "step": 1838 + }, + { + "epoch": 0.10250264756702525, + "grad_norm": 0.5010998249053955, + "learning_rate": 9.813383361162678e-05, + "loss": 1.7164, + "step": 1839 + }, + { + "epoch": 0.10255838582018839, + "grad_norm": 0.5169504284858704, + "learning_rate": 9.813142288059497e-05, + "loss": 1.4974, + "step": 1840 + }, + { + "epoch": 0.10261412407335155, + "grad_norm": 0.5264306664466858, + "learning_rate": 9.812901062311507e-05, + "loss": 1.6087, + "step": 1841 + }, + { + "epoch": 0.10266986232651469, + "grad_norm": 0.5117889642715454, + "learning_rate": 9.812659683926355e-05, + "loss": 1.734, + "step": 1842 + }, + { + "epoch": 0.10272560057967783, + "grad_norm": 0.5216721296310425, + "learning_rate": 9.812418152911697e-05, + "loss": 1.7643, + "step": 1843 + }, + { + "epoch": 0.10278133883284098, + "grad_norm": 0.5514086484909058, + "learning_rate": 9.812176469275196e-05, + "loss": 1.7052, + "step": 1844 + }, + { + "epoch": 0.10283707708600412, + "grad_norm": 0.5310468077659607, + "learning_rate": 9.811934633024514e-05, + "loss": 1.8478, + "step": 1845 + }, + { + "epoch": 0.10289281533916728, + "grad_norm": 0.5535829067230225, + "learning_rate": 9.811692644167318e-05, + "loss": 1.7884, + "step": 1846 + }, + { + "epoch": 0.10294855359233042, + "grad_norm": 0.5332193374633789, + "learning_rate": 9.811450502711288e-05, + "loss": 1.7511, + "step": 1847 + }, + { + "epoch": 0.10300429184549356, + "grad_norm": 0.5547590851783752, + "learning_rate": 9.8112082086641e-05, + "loss": 1.7348, + "step": 1848 + }, + { + "epoch": 0.1030600300986567, + "grad_norm": 0.5098549127578735, + "learning_rate": 9.810965762033439e-05, + "loss": 1.8117, + "step": 1849 + }, + { + "epoch": 0.10311576835181985, + "grad_norm": 0.4965379238128662, + "learning_rate": 9.810723162826994e-05, + "loss": 1.6535, + "step": 1850 + }, + { + "epoch": 0.103171506604983, + "grad_norm": 0.5498190522193909, + "learning_rate": 9.810480411052458e-05, + "loss": 1.8094, + "step": 1851 + }, + { + "epoch": 0.10322724485814615, + "grad_norm": 0.5419559478759766, + "learning_rate": 9.81023750671753e-05, + "loss": 1.8347, + "step": 1852 + }, + { + "epoch": 0.10328298311130929, + "grad_norm": 0.5136609077453613, + "learning_rate": 9.809994449829916e-05, + "loss": 1.8038, + "step": 1853 + }, + { + "epoch": 0.10333872136447243, + "grad_norm": 0.4600328207015991, + "learning_rate": 9.809751240397321e-05, + "loss": 1.5616, + "step": 1854 + }, + { + "epoch": 0.10339445961763559, + "grad_norm": 0.5725501775741577, + "learning_rate": 9.80950787842746e-05, + "loss": 2.0217, + "step": 1855 + }, + { + "epoch": 0.10345019787079873, + "grad_norm": 0.4968816936016083, + "learning_rate": 9.809264363928049e-05, + "loss": 1.6151, + "step": 1856 + }, + { + "epoch": 0.10350593612396188, + "grad_norm": 0.5521273016929626, + "learning_rate": 9.809020696906815e-05, + "loss": 1.5242, + "step": 1857 + }, + { + "epoch": 0.10356167437712502, + "grad_norm": 0.526759684085846, + "learning_rate": 9.80877687737148e-05, + "loss": 1.6917, + "step": 1858 + }, + { + "epoch": 0.10361741263028816, + "grad_norm": 0.5235029458999634, + "learning_rate": 9.808532905329781e-05, + "loss": 1.785, + "step": 1859 + }, + { + "epoch": 0.10367315088345132, + "grad_norm": 0.5284624099731445, + "learning_rate": 9.808288780789454e-05, + "loss": 1.8857, + "step": 1860 + }, + { + "epoch": 0.10372888913661446, + "grad_norm": 0.5086808800697327, + "learning_rate": 9.80804450375824e-05, + "loss": 1.5768, + "step": 1861 + }, + { + "epoch": 0.1037846273897776, + "grad_norm": 0.6029835343360901, + "learning_rate": 9.807800074243888e-05, + "loss": 2.1482, + "step": 1862 + }, + { + "epoch": 0.10384036564294075, + "grad_norm": 0.5451070666313171, + "learning_rate": 9.80755549225415e-05, + "loss": 1.6884, + "step": 1863 + }, + { + "epoch": 0.1038961038961039, + "grad_norm": 0.5617519021034241, + "learning_rate": 9.807310757796781e-05, + "loss": 1.9665, + "step": 1864 + }, + { + "epoch": 0.10395184214926705, + "grad_norm": 0.6114406585693359, + "learning_rate": 9.807065870879544e-05, + "loss": 1.9696, + "step": 1865 + }, + { + "epoch": 0.10400758040243019, + "grad_norm": 0.5124810338020325, + "learning_rate": 9.806820831510204e-05, + "loss": 1.6848, + "step": 1866 + }, + { + "epoch": 0.10406331865559333, + "grad_norm": 0.5385152697563171, + "learning_rate": 9.806575639696533e-05, + "loss": 1.6808, + "step": 1867 + }, + { + "epoch": 0.10411905690875647, + "grad_norm": 0.49392756819725037, + "learning_rate": 9.806330295446307e-05, + "loss": 1.8179, + "step": 1868 + }, + { + "epoch": 0.10417479516191963, + "grad_norm": 0.49383312463760376, + "learning_rate": 9.806084798767307e-05, + "loss": 1.5517, + "step": 1869 + }, + { + "epoch": 0.10423053341508277, + "grad_norm": 0.5276709198951721, + "learning_rate": 9.805839149667319e-05, + "loss": 1.7125, + "step": 1870 + }, + { + "epoch": 0.10428627166824592, + "grad_norm": 0.5694584250450134, + "learning_rate": 9.805593348154131e-05, + "loss": 1.9891, + "step": 1871 + }, + { + "epoch": 0.10434200992140906, + "grad_norm": 0.5705782771110535, + "learning_rate": 9.805347394235543e-05, + "loss": 1.779, + "step": 1872 + }, + { + "epoch": 0.1043977481745722, + "grad_norm": 0.543282151222229, + "learning_rate": 9.805101287919352e-05, + "loss": 1.898, + "step": 1873 + }, + { + "epoch": 0.10445348642773536, + "grad_norm": 0.5607357025146484, + "learning_rate": 9.804855029213365e-05, + "loss": 1.9422, + "step": 1874 + }, + { + "epoch": 0.1045092246808985, + "grad_norm": 0.548055112361908, + "learning_rate": 9.804608618125388e-05, + "loss": 1.776, + "step": 1875 + }, + { + "epoch": 0.10456496293406164, + "grad_norm": 0.528634250164032, + "learning_rate": 9.804362054663241e-05, + "loss": 1.7196, + "step": 1876 + }, + { + "epoch": 0.10462070118722479, + "grad_norm": 0.5074811577796936, + "learning_rate": 9.80411533883474e-05, + "loss": 1.6667, + "step": 1877 + }, + { + "epoch": 0.10467643944038794, + "grad_norm": 0.5272465944290161, + "learning_rate": 9.80386847064771e-05, + "loss": 1.8897, + "step": 1878 + }, + { + "epoch": 0.10473217769355109, + "grad_norm": 0.5819423198699951, + "learning_rate": 9.80362145010998e-05, + "loss": 1.868, + "step": 1879 + }, + { + "epoch": 0.10478791594671423, + "grad_norm": 0.4952581226825714, + "learning_rate": 9.803374277229387e-05, + "loss": 1.7449, + "step": 1880 + }, + { + "epoch": 0.10484365419987737, + "grad_norm": 0.5459893345832825, + "learning_rate": 9.803126952013766e-05, + "loss": 1.7454, + "step": 1881 + }, + { + "epoch": 0.10489939245304052, + "grad_norm": 0.4974026381969452, + "learning_rate": 9.802879474470964e-05, + "loss": 1.5892, + "step": 1882 + }, + { + "epoch": 0.10495513070620367, + "grad_norm": 0.503982424736023, + "learning_rate": 9.802631844608825e-05, + "loss": 1.608, + "step": 1883 + }, + { + "epoch": 0.10501086895936682, + "grad_norm": 0.5444994568824768, + "learning_rate": 9.802384062435206e-05, + "loss": 1.8286, + "step": 1884 + }, + { + "epoch": 0.10506660721252996, + "grad_norm": 0.5099791288375854, + "learning_rate": 9.802136127957965e-05, + "loss": 1.7811, + "step": 1885 + }, + { + "epoch": 0.1051223454656931, + "grad_norm": 0.5670564770698547, + "learning_rate": 9.801888041184963e-05, + "loss": 2.0036, + "step": 1886 + }, + { + "epoch": 0.10517808371885626, + "grad_norm": 0.5026718378067017, + "learning_rate": 9.801639802124071e-05, + "loss": 1.6716, + "step": 1887 + }, + { + "epoch": 0.1052338219720194, + "grad_norm": 0.519005298614502, + "learning_rate": 9.801391410783161e-05, + "loss": 1.6815, + "step": 1888 + }, + { + "epoch": 0.10528956022518254, + "grad_norm": 0.46930474042892456, + "learning_rate": 9.801142867170106e-05, + "loss": 1.7429, + "step": 1889 + }, + { + "epoch": 0.10534529847834569, + "grad_norm": 0.5434656143188477, + "learning_rate": 9.800894171292793e-05, + "loss": 1.8671, + "step": 1890 + }, + { + "epoch": 0.10540103673150883, + "grad_norm": 0.5062917470932007, + "learning_rate": 9.80064532315911e-05, + "loss": 1.6347, + "step": 1891 + }, + { + "epoch": 0.10545677498467199, + "grad_norm": 0.5208712220191956, + "learning_rate": 9.800396322776945e-05, + "loss": 1.601, + "step": 1892 + }, + { + "epoch": 0.10551251323783513, + "grad_norm": 0.49505361914634705, + "learning_rate": 9.800147170154199e-05, + "loss": 1.7157, + "step": 1893 + }, + { + "epoch": 0.10556825149099827, + "grad_norm": 0.5282744765281677, + "learning_rate": 9.79989786529877e-05, + "loss": 1.7322, + "step": 1894 + }, + { + "epoch": 0.10562398974416141, + "grad_norm": 0.5821601748466492, + "learning_rate": 9.799648408218567e-05, + "loss": 2.0407, + "step": 1895 + }, + { + "epoch": 0.10567972799732456, + "grad_norm": 0.5044925212860107, + "learning_rate": 9.7993987989215e-05, + "loss": 1.6443, + "step": 1896 + }, + { + "epoch": 0.10573546625048771, + "grad_norm": 0.5207780599594116, + "learning_rate": 9.799149037415485e-05, + "loss": 1.6341, + "step": 1897 + }, + { + "epoch": 0.10579120450365086, + "grad_norm": 0.5176671743392944, + "learning_rate": 9.798899123708444e-05, + "loss": 1.7532, + "step": 1898 + }, + { + "epoch": 0.105846942756814, + "grad_norm": 0.585341215133667, + "learning_rate": 9.798649057808302e-05, + "loss": 1.7511, + "step": 1899 + }, + { + "epoch": 0.10590268100997714, + "grad_norm": 0.5633143782615662, + "learning_rate": 9.798398839722991e-05, + "loss": 1.8548, + "step": 1900 + }, + { + "epoch": 0.1059584192631403, + "grad_norm": 0.5425167083740234, + "learning_rate": 9.798148469460444e-05, + "loss": 1.7457, + "step": 1901 + }, + { + "epoch": 0.10601415751630344, + "grad_norm": 0.5065333247184753, + "learning_rate": 9.797897947028602e-05, + "loss": 1.6342, + "step": 1902 + }, + { + "epoch": 0.10606989576946659, + "grad_norm": 0.4805918037891388, + "learning_rate": 9.797647272435413e-05, + "loss": 1.6272, + "step": 1903 + }, + { + "epoch": 0.10612563402262973, + "grad_norm": 0.49736079573631287, + "learning_rate": 9.797396445688825e-05, + "loss": 1.6666, + "step": 1904 + }, + { + "epoch": 0.10618137227579287, + "grad_norm": 0.5496745705604553, + "learning_rate": 9.797145466796791e-05, + "loss": 1.7214, + "step": 1905 + }, + { + "epoch": 0.10623711052895603, + "grad_norm": 0.5134656429290771, + "learning_rate": 9.796894335767272e-05, + "loss": 1.7156, + "step": 1906 + }, + { + "epoch": 0.10629284878211917, + "grad_norm": 0.5449696183204651, + "learning_rate": 9.796643052608232e-05, + "loss": 1.7284, + "step": 1907 + }, + { + "epoch": 0.10634858703528231, + "grad_norm": 0.5344961881637573, + "learning_rate": 9.796391617327643e-05, + "loss": 1.514, + "step": 1908 + }, + { + "epoch": 0.10640432528844546, + "grad_norm": 0.5717931389808655, + "learning_rate": 9.796140029933474e-05, + "loss": 1.9562, + "step": 1909 + }, + { + "epoch": 0.10646006354160861, + "grad_norm": 0.5507314205169678, + "learning_rate": 9.795888290433708e-05, + "loss": 1.8475, + "step": 1910 + }, + { + "epoch": 0.10651580179477176, + "grad_norm": 0.4807168245315552, + "learning_rate": 9.795636398836328e-05, + "loss": 1.4198, + "step": 1911 + }, + { + "epoch": 0.1065715400479349, + "grad_norm": 0.5163860321044922, + "learning_rate": 9.795384355149321e-05, + "loss": 1.7098, + "step": 1912 + }, + { + "epoch": 0.10662727830109804, + "grad_norm": 0.5876139998435974, + "learning_rate": 9.795132159380683e-05, + "loss": 1.8379, + "step": 1913 + }, + { + "epoch": 0.10668301655426118, + "grad_norm": 0.5147418975830078, + "learning_rate": 9.794879811538409e-05, + "loss": 1.8069, + "step": 1914 + }, + { + "epoch": 0.10673875480742434, + "grad_norm": 0.5539793372154236, + "learning_rate": 9.794627311630503e-05, + "loss": 1.9336, + "step": 1915 + }, + { + "epoch": 0.10679449306058748, + "grad_norm": 0.5565729737281799, + "learning_rate": 9.794374659664975e-05, + "loss": 1.8024, + "step": 1916 + }, + { + "epoch": 0.10685023131375063, + "grad_norm": 0.509848952293396, + "learning_rate": 9.794121855649834e-05, + "loss": 1.6553, + "step": 1917 + }, + { + "epoch": 0.10690596956691377, + "grad_norm": 0.5031093955039978, + "learning_rate": 9.793868899593101e-05, + "loss": 1.6452, + "step": 1918 + }, + { + "epoch": 0.10696170782007691, + "grad_norm": 0.5101149082183838, + "learning_rate": 9.793615791502794e-05, + "loss": 1.5787, + "step": 1919 + }, + { + "epoch": 0.10701744607324007, + "grad_norm": 0.5462785363197327, + "learning_rate": 9.793362531386946e-05, + "loss": 1.7273, + "step": 1920 + }, + { + "epoch": 0.10707318432640321, + "grad_norm": 0.5313560366630554, + "learning_rate": 9.793109119253584e-05, + "loss": 1.7061, + "step": 1921 + }, + { + "epoch": 0.10712892257956635, + "grad_norm": 0.49144747853279114, + "learning_rate": 9.792855555110747e-05, + "loss": 1.6418, + "step": 1922 + }, + { + "epoch": 0.1071846608327295, + "grad_norm": 0.5435053110122681, + "learning_rate": 9.792601838966477e-05, + "loss": 1.8774, + "step": 1923 + }, + { + "epoch": 0.10724039908589265, + "grad_norm": 0.5598286390304565, + "learning_rate": 9.792347970828819e-05, + "loss": 1.8705, + "step": 1924 + }, + { + "epoch": 0.1072961373390558, + "grad_norm": 0.5478824377059937, + "learning_rate": 9.792093950705824e-05, + "loss": 1.6882, + "step": 1925 + }, + { + "epoch": 0.10735187559221894, + "grad_norm": 0.5779083967208862, + "learning_rate": 9.79183977860555e-05, + "loss": 1.993, + "step": 1926 + }, + { + "epoch": 0.10740761384538208, + "grad_norm": 0.5614520907402039, + "learning_rate": 9.791585454536054e-05, + "loss": 1.7984, + "step": 1927 + }, + { + "epoch": 0.10746335209854523, + "grad_norm": 0.5752551555633545, + "learning_rate": 9.791330978505406e-05, + "loss": 1.781, + "step": 1928 + }, + { + "epoch": 0.10751909035170838, + "grad_norm": 0.5250864624977112, + "learning_rate": 9.791076350521675e-05, + "loss": 1.8367, + "step": 1929 + }, + { + "epoch": 0.10757482860487153, + "grad_norm": 0.5408803224563599, + "learning_rate": 9.790821570592937e-05, + "loss": 1.9812, + "step": 1930 + }, + { + "epoch": 0.10763056685803467, + "grad_norm": 0.5511845350265503, + "learning_rate": 9.790566638727268e-05, + "loss": 1.9631, + "step": 1931 + }, + { + "epoch": 0.10768630511119781, + "grad_norm": 0.5966324806213379, + "learning_rate": 9.790311554932758e-05, + "loss": 1.6961, + "step": 1932 + }, + { + "epoch": 0.10774204336436097, + "grad_norm": 0.5062892436981201, + "learning_rate": 9.790056319217495e-05, + "loss": 1.4829, + "step": 1933 + }, + { + "epoch": 0.10779778161752411, + "grad_norm": 0.5916358232498169, + "learning_rate": 9.789800931589574e-05, + "loss": 1.7646, + "step": 1934 + }, + { + "epoch": 0.10785351987068725, + "grad_norm": 0.5008646845817566, + "learning_rate": 9.789545392057093e-05, + "loss": 1.6985, + "step": 1935 + }, + { + "epoch": 0.1079092581238504, + "grad_norm": 0.557442843914032, + "learning_rate": 9.789289700628158e-05, + "loss": 1.6734, + "step": 1936 + }, + { + "epoch": 0.10796499637701354, + "grad_norm": 0.5303389430046082, + "learning_rate": 9.789033857310876e-05, + "loss": 1.8051, + "step": 1937 + }, + { + "epoch": 0.1080207346301767, + "grad_norm": 0.5422589182853699, + "learning_rate": 9.788777862113363e-05, + "loss": 1.7073, + "step": 1938 + }, + { + "epoch": 0.10807647288333984, + "grad_norm": 0.49321499466896057, + "learning_rate": 9.788521715043736e-05, + "loss": 1.6106, + "step": 1939 + }, + { + "epoch": 0.10813221113650298, + "grad_norm": 0.5515221953392029, + "learning_rate": 9.78826541611012e-05, + "loss": 1.9005, + "step": 1940 + }, + { + "epoch": 0.10818794938966612, + "grad_norm": 0.5055232048034668, + "learning_rate": 9.788008965320643e-05, + "loss": 1.6169, + "step": 1941 + }, + { + "epoch": 0.10824368764282927, + "grad_norm": 0.5074330568313599, + "learning_rate": 9.787752362683438e-05, + "loss": 1.6712, + "step": 1942 + }, + { + "epoch": 0.10829942589599242, + "grad_norm": 0.5290434956550598, + "learning_rate": 9.78749560820664e-05, + "loss": 1.6697, + "step": 1943 + }, + { + "epoch": 0.10835516414915557, + "grad_norm": 0.5382573008537292, + "learning_rate": 9.787238701898397e-05, + "loss": 1.6955, + "step": 1944 + }, + { + "epoch": 0.10841090240231871, + "grad_norm": 0.5350417494773865, + "learning_rate": 9.786981643766852e-05, + "loss": 1.695, + "step": 1945 + }, + { + "epoch": 0.10846664065548185, + "grad_norm": 0.5305573344230652, + "learning_rate": 9.78672443382016e-05, + "loss": 1.8205, + "step": 1946 + }, + { + "epoch": 0.10852237890864501, + "grad_norm": 0.5057222247123718, + "learning_rate": 9.786467072066478e-05, + "loss": 1.7815, + "step": 1947 + }, + { + "epoch": 0.10857811716180815, + "grad_norm": 0.5606647729873657, + "learning_rate": 9.786209558513968e-05, + "loss": 2.0612, + "step": 1948 + }, + { + "epoch": 0.1086338554149713, + "grad_norm": 0.5300911068916321, + "learning_rate": 9.785951893170795e-05, + "loss": 1.8648, + "step": 1949 + }, + { + "epoch": 0.10868959366813444, + "grad_norm": 0.5408658385276794, + "learning_rate": 9.785694076045133e-05, + "loss": 1.7291, + "step": 1950 + }, + { + "epoch": 0.10874533192129758, + "grad_norm": 0.5921101570129395, + "learning_rate": 9.785436107145156e-05, + "loss": 1.9079, + "step": 1951 + }, + { + "epoch": 0.10880107017446074, + "grad_norm": 0.5365302562713623, + "learning_rate": 9.785177986479048e-05, + "loss": 1.888, + "step": 1952 + }, + { + "epoch": 0.10885680842762388, + "grad_norm": 0.5375866293907166, + "learning_rate": 9.784919714054993e-05, + "loss": 1.7309, + "step": 1953 + }, + { + "epoch": 0.10891254668078702, + "grad_norm": 0.5292702317237854, + "learning_rate": 9.784661289881183e-05, + "loss": 1.7366, + "step": 1954 + }, + { + "epoch": 0.10896828493395017, + "grad_norm": 0.5953987240791321, + "learning_rate": 9.784402713965815e-05, + "loss": 1.6749, + "step": 1955 + }, + { + "epoch": 0.10902402318711332, + "grad_norm": 0.5666269659996033, + "learning_rate": 9.784143986317084e-05, + "loss": 1.8123, + "step": 1956 + }, + { + "epoch": 0.10907976144027647, + "grad_norm": 0.4942094683647156, + "learning_rate": 9.783885106943203e-05, + "loss": 1.5919, + "step": 1957 + }, + { + "epoch": 0.10913549969343961, + "grad_norm": 0.5365981459617615, + "learning_rate": 9.783626075852377e-05, + "loss": 1.8938, + "step": 1958 + }, + { + "epoch": 0.10919123794660275, + "grad_norm": 0.4730222523212433, + "learning_rate": 9.783366893052822e-05, + "loss": 1.6972, + "step": 1959 + }, + { + "epoch": 0.1092469761997659, + "grad_norm": 0.5012983679771423, + "learning_rate": 9.783107558552759e-05, + "loss": 1.5967, + "step": 1960 + }, + { + "epoch": 0.10930271445292905, + "grad_norm": 0.47032400965690613, + "learning_rate": 9.782848072360411e-05, + "loss": 1.4359, + "step": 1961 + }, + { + "epoch": 0.1093584527060922, + "grad_norm": 0.6051558256149292, + "learning_rate": 9.782588434484008e-05, + "loss": 1.8727, + "step": 1962 + }, + { + "epoch": 0.10941419095925534, + "grad_norm": 0.5087974667549133, + "learning_rate": 9.782328644931784e-05, + "loss": 1.6863, + "step": 1963 + }, + { + "epoch": 0.10946992921241848, + "grad_norm": 0.5419572591781616, + "learning_rate": 9.782068703711979e-05, + "loss": 1.8686, + "step": 1964 + }, + { + "epoch": 0.10952566746558162, + "grad_norm": 0.5740787386894226, + "learning_rate": 9.781808610832837e-05, + "loss": 1.8671, + "step": 1965 + }, + { + "epoch": 0.10958140571874478, + "grad_norm": 0.5375397801399231, + "learning_rate": 9.781548366302604e-05, + "loss": 1.855, + "step": 1966 + }, + { + "epoch": 0.10963714397190792, + "grad_norm": 0.5186393857002258, + "learning_rate": 9.781287970129536e-05, + "loss": 1.8296, + "step": 1967 + }, + { + "epoch": 0.10969288222507106, + "grad_norm": 0.5058977007865906, + "learning_rate": 9.781027422321891e-05, + "loss": 1.6181, + "step": 1968 + }, + { + "epoch": 0.10974862047823421, + "grad_norm": 0.5131574273109436, + "learning_rate": 9.78076672288793e-05, + "loss": 1.8194, + "step": 1969 + }, + { + "epoch": 0.10980435873139736, + "grad_norm": 0.5668989419937134, + "learning_rate": 9.780505871835924e-05, + "loss": 1.857, + "step": 1970 + }, + { + "epoch": 0.1098600969845605, + "grad_norm": 0.5090118646621704, + "learning_rate": 9.780244869174142e-05, + "loss": 1.5722, + "step": 1971 + }, + { + "epoch": 0.10991583523772365, + "grad_norm": 0.5472584962844849, + "learning_rate": 9.779983714910865e-05, + "loss": 1.7926, + "step": 1972 + }, + { + "epoch": 0.10997157349088679, + "grad_norm": 0.5904543399810791, + "learning_rate": 9.779722409054374e-05, + "loss": 1.9054, + "step": 1973 + }, + { + "epoch": 0.11002731174404994, + "grad_norm": 0.4884478747844696, + "learning_rate": 9.779460951612955e-05, + "loss": 1.5573, + "step": 1974 + }, + { + "epoch": 0.11008304999721309, + "grad_norm": 0.6380166411399841, + "learning_rate": 9.779199342594902e-05, + "loss": 2.0516, + "step": 1975 + }, + { + "epoch": 0.11013878825037623, + "grad_norm": 0.5148760080337524, + "learning_rate": 9.778937582008509e-05, + "loss": 1.7119, + "step": 1976 + }, + { + "epoch": 0.11019452650353938, + "grad_norm": 0.5153675079345703, + "learning_rate": 9.77867566986208e-05, + "loss": 1.6784, + "step": 1977 + }, + { + "epoch": 0.11025026475670252, + "grad_norm": 0.5181575417518616, + "learning_rate": 9.77841360616392e-05, + "loss": 1.4993, + "step": 1978 + }, + { + "epoch": 0.11030600300986568, + "grad_norm": 0.557270348072052, + "learning_rate": 9.778151390922341e-05, + "loss": 1.8278, + "step": 1979 + }, + { + "epoch": 0.11036174126302882, + "grad_norm": 0.570976972579956, + "learning_rate": 9.777889024145657e-05, + "loss": 1.9032, + "step": 1980 + }, + { + "epoch": 0.11041747951619196, + "grad_norm": 0.5794844031333923, + "learning_rate": 9.777626505842193e-05, + "loss": 1.8758, + "step": 1981 + }, + { + "epoch": 0.1104732177693551, + "grad_norm": 0.5161063075065613, + "learning_rate": 9.777363836020268e-05, + "loss": 1.8698, + "step": 1982 + }, + { + "epoch": 0.11052895602251825, + "grad_norm": 0.5546018481254578, + "learning_rate": 9.777101014688219e-05, + "loss": 1.87, + "step": 1983 + }, + { + "epoch": 0.1105846942756814, + "grad_norm": 0.5865330696105957, + "learning_rate": 9.776838041854377e-05, + "loss": 1.9022, + "step": 1984 + }, + { + "epoch": 0.11064043252884455, + "grad_norm": 0.5667337775230408, + "learning_rate": 9.776574917527083e-05, + "loss": 2.0603, + "step": 1985 + }, + { + "epoch": 0.11069617078200769, + "grad_norm": 0.5092570185661316, + "learning_rate": 9.776311641714683e-05, + "loss": 1.7887, + "step": 1986 + }, + { + "epoch": 0.11075190903517083, + "grad_norm": 0.5329071879386902, + "learning_rate": 9.776048214425525e-05, + "loss": 1.7294, + "step": 1987 + }, + { + "epoch": 0.11080764728833398, + "grad_norm": 0.5048893690109253, + "learning_rate": 9.775784635667964e-05, + "loss": 1.7357, + "step": 1988 + }, + { + "epoch": 0.11086338554149713, + "grad_norm": 0.4852405786514282, + "learning_rate": 9.77552090545036e-05, + "loss": 1.7027, + "step": 1989 + }, + { + "epoch": 0.11091912379466028, + "grad_norm": 0.5363536477088928, + "learning_rate": 9.775257023781074e-05, + "loss": 1.9082, + "step": 1990 + }, + { + "epoch": 0.11097486204782342, + "grad_norm": 0.5514358878135681, + "learning_rate": 9.774992990668479e-05, + "loss": 1.8572, + "step": 1991 + }, + { + "epoch": 0.11103060030098656, + "grad_norm": 0.5773457884788513, + "learning_rate": 9.774728806120945e-05, + "loss": 1.9287, + "step": 1992 + }, + { + "epoch": 0.11108633855414972, + "grad_norm": 0.5018163323402405, + "learning_rate": 9.774464470146851e-05, + "loss": 1.6721, + "step": 1993 + }, + { + "epoch": 0.11114207680731286, + "grad_norm": 0.5004386305809021, + "learning_rate": 9.774199982754584e-05, + "loss": 1.6999, + "step": 1994 + }, + { + "epoch": 0.111197815060476, + "grad_norm": 0.5078005194664001, + "learning_rate": 9.773935343952527e-05, + "loss": 1.6968, + "step": 1995 + }, + { + "epoch": 0.11125355331363915, + "grad_norm": 0.5355806946754456, + "learning_rate": 9.773670553749075e-05, + "loss": 1.8122, + "step": 1996 + }, + { + "epoch": 0.11130929156680229, + "grad_norm": 0.5051989555358887, + "learning_rate": 9.773405612152626e-05, + "loss": 1.6712, + "step": 1997 + }, + { + "epoch": 0.11136502981996545, + "grad_norm": 0.5549625754356384, + "learning_rate": 9.773140519171582e-05, + "loss": 1.8872, + "step": 1998 + }, + { + "epoch": 0.11142076807312859, + "grad_norm": 0.5879496335983276, + "learning_rate": 9.77287527481435e-05, + "loss": 1.7659, + "step": 1999 + }, + { + "epoch": 0.11147650632629173, + "grad_norm": 0.6350980401039124, + "learning_rate": 9.772609879089341e-05, + "loss": 1.9805, + "step": 2000 + }, + { + "epoch": 0.11153224457945488, + "grad_norm": 0.5255335569381714, + "learning_rate": 9.772344332004975e-05, + "loss": 1.7215, + "step": 2001 + }, + { + "epoch": 0.11158798283261803, + "grad_norm": 0.5538710355758667, + "learning_rate": 9.77207863356967e-05, + "loss": 1.8071, + "step": 2002 + }, + { + "epoch": 0.11164372108578118, + "grad_norm": 0.5447118878364563, + "learning_rate": 9.771812783791854e-05, + "loss": 1.6401, + "step": 2003 + }, + { + "epoch": 0.11169945933894432, + "grad_norm": 0.5420034527778625, + "learning_rate": 9.771546782679959e-05, + "loss": 1.7649, + "step": 2004 + }, + { + "epoch": 0.11175519759210746, + "grad_norm": 0.5717622637748718, + "learning_rate": 9.771280630242419e-05, + "loss": 1.9245, + "step": 2005 + }, + { + "epoch": 0.1118109358452706, + "grad_norm": 0.533752977848053, + "learning_rate": 9.771014326487675e-05, + "loss": 1.6562, + "step": 2006 + }, + { + "epoch": 0.11186667409843376, + "grad_norm": 0.5668651461601257, + "learning_rate": 9.770747871424175e-05, + "loss": 1.8504, + "step": 2007 + }, + { + "epoch": 0.1119224123515969, + "grad_norm": 0.5300382375717163, + "learning_rate": 9.770481265060368e-05, + "loss": 1.5858, + "step": 2008 + }, + { + "epoch": 0.11197815060476005, + "grad_norm": 0.5205538272857666, + "learning_rate": 9.770214507404709e-05, + "loss": 1.8421, + "step": 2009 + }, + { + "epoch": 0.11203388885792319, + "grad_norm": 0.5641254186630249, + "learning_rate": 9.769947598465657e-05, + "loss": 1.7521, + "step": 2010 + }, + { + "epoch": 0.11208962711108633, + "grad_norm": 0.5881509184837341, + "learning_rate": 9.76968053825168e-05, + "loss": 1.8359, + "step": 2011 + }, + { + "epoch": 0.11214536536424949, + "grad_norm": 0.5264688730239868, + "learning_rate": 9.769413326771243e-05, + "loss": 1.7792, + "step": 2012 + }, + { + "epoch": 0.11220110361741263, + "grad_norm": 0.5596029758453369, + "learning_rate": 9.769145964032824e-05, + "loss": 1.8502, + "step": 2013 + }, + { + "epoch": 0.11225684187057577, + "grad_norm": 0.5555474758148193, + "learning_rate": 9.768878450044902e-05, + "loss": 1.9158, + "step": 2014 + }, + { + "epoch": 0.11231258012373892, + "grad_norm": 0.5508490800857544, + "learning_rate": 9.768610784815959e-05, + "loss": 1.5545, + "step": 2015 + }, + { + "epoch": 0.11236831837690207, + "grad_norm": 0.5072826743125916, + "learning_rate": 9.768342968354484e-05, + "loss": 1.6679, + "step": 2016 + }, + { + "epoch": 0.11242405663006522, + "grad_norm": 0.4995681941509247, + "learning_rate": 9.768075000668974e-05, + "loss": 1.7114, + "step": 2017 + }, + { + "epoch": 0.11247979488322836, + "grad_norm": 0.5590416789054871, + "learning_rate": 9.767806881767923e-05, + "loss": 1.8553, + "step": 2018 + }, + { + "epoch": 0.1125355331363915, + "grad_norm": 0.542676568031311, + "learning_rate": 9.767538611659837e-05, + "loss": 1.9799, + "step": 2019 + }, + { + "epoch": 0.11259127138955465, + "grad_norm": 0.6015095710754395, + "learning_rate": 9.767270190353221e-05, + "loss": 2.0631, + "step": 2020 + }, + { + "epoch": 0.1126470096427178, + "grad_norm": 0.5182809829711914, + "learning_rate": 9.767001617856591e-05, + "loss": 1.8081, + "step": 2021 + }, + { + "epoch": 0.11270274789588094, + "grad_norm": 0.539851725101471, + "learning_rate": 9.766732894178463e-05, + "loss": 1.6224, + "step": 2022 + }, + { + "epoch": 0.11275848614904409, + "grad_norm": 0.5738646388053894, + "learning_rate": 9.766464019327359e-05, + "loss": 1.8425, + "step": 2023 + }, + { + "epoch": 0.11281422440220723, + "grad_norm": 0.5035516619682312, + "learning_rate": 9.766194993311809e-05, + "loss": 1.8101, + "step": 2024 + }, + { + "epoch": 0.11286996265537039, + "grad_norm": 0.4765785038471222, + "learning_rate": 9.76592581614034e-05, + "loss": 1.7461, + "step": 2025 + }, + { + "epoch": 0.11292570090853353, + "grad_norm": 0.5692024230957031, + "learning_rate": 9.765656487821492e-05, + "loss": 1.9905, + "step": 2026 + }, + { + "epoch": 0.11298143916169667, + "grad_norm": 0.5034509301185608, + "learning_rate": 9.765387008363807e-05, + "loss": 1.7689, + "step": 2027 + }, + { + "epoch": 0.11303717741485982, + "grad_norm": 0.5591553449630737, + "learning_rate": 9.76511737777583e-05, + "loss": 1.7994, + "step": 2028 + }, + { + "epoch": 0.11309291566802296, + "grad_norm": 0.533530592918396, + "learning_rate": 9.764847596066111e-05, + "loss": 1.5192, + "step": 2029 + }, + { + "epoch": 0.11314865392118612, + "grad_norm": 0.5049347281455994, + "learning_rate": 9.764577663243209e-05, + "loss": 1.5906, + "step": 2030 + }, + { + "epoch": 0.11320439217434926, + "grad_norm": 0.4710226058959961, + "learning_rate": 9.764307579315681e-05, + "loss": 1.4503, + "step": 2031 + }, + { + "epoch": 0.1132601304275124, + "grad_norm": 0.5490729212760925, + "learning_rate": 9.764037344292096e-05, + "loss": 1.7865, + "step": 2032 + }, + { + "epoch": 0.11331586868067554, + "grad_norm": 0.5714886784553528, + "learning_rate": 9.763766958181022e-05, + "loss": 1.6803, + "step": 2033 + }, + { + "epoch": 0.11337160693383869, + "grad_norm": 0.5637816786766052, + "learning_rate": 9.763496420991037e-05, + "loss": 1.902, + "step": 2034 + }, + { + "epoch": 0.11342734518700184, + "grad_norm": 0.5324851870536804, + "learning_rate": 9.763225732730716e-05, + "loss": 1.7774, + "step": 2035 + }, + { + "epoch": 0.11348308344016499, + "grad_norm": 0.542209267616272, + "learning_rate": 9.762954893408646e-05, + "loss": 1.7369, + "step": 2036 + }, + { + "epoch": 0.11353882169332813, + "grad_norm": 0.5353888273239136, + "learning_rate": 9.762683903033419e-05, + "loss": 1.914, + "step": 2037 + }, + { + "epoch": 0.11359455994649127, + "grad_norm": 0.5152493119239807, + "learning_rate": 9.762412761613624e-05, + "loss": 1.8155, + "step": 2038 + }, + { + "epoch": 0.11365029819965443, + "grad_norm": 0.4723453223705292, + "learning_rate": 9.762141469157865e-05, + "loss": 1.6183, + "step": 2039 + }, + { + "epoch": 0.11370603645281757, + "grad_norm": 0.5671008229255676, + "learning_rate": 9.761870025674743e-05, + "loss": 1.887, + "step": 2040 + }, + { + "epoch": 0.11376177470598071, + "grad_norm": 0.5240710377693176, + "learning_rate": 9.761598431172868e-05, + "loss": 1.7928, + "step": 2041 + }, + { + "epoch": 0.11381751295914386, + "grad_norm": 0.4852540194988251, + "learning_rate": 9.761326685660852e-05, + "loss": 1.6132, + "step": 2042 + }, + { + "epoch": 0.113873251212307, + "grad_norm": 0.46512627601623535, + "learning_rate": 9.761054789147315e-05, + "loss": 1.4053, + "step": 2043 + }, + { + "epoch": 0.11392898946547016, + "grad_norm": 0.5127692222595215, + "learning_rate": 9.760782741640879e-05, + "loss": 1.649, + "step": 2044 + }, + { + "epoch": 0.1139847277186333, + "grad_norm": 0.5368222594261169, + "learning_rate": 9.76051054315017e-05, + "loss": 1.7286, + "step": 2045 + }, + { + "epoch": 0.11404046597179644, + "grad_norm": 0.5699864625930786, + "learning_rate": 9.760238193683824e-05, + "loss": 1.7911, + "step": 2046 + }, + { + "epoch": 0.11409620422495959, + "grad_norm": 0.59310382604599, + "learning_rate": 9.759965693250477e-05, + "loss": 1.7731, + "step": 2047 + }, + { + "epoch": 0.11415194247812274, + "grad_norm": 0.5524492859840393, + "learning_rate": 9.75969304185877e-05, + "loss": 1.7917, + "step": 2048 + }, + { + "epoch": 0.11420768073128588, + "grad_norm": 0.529346227645874, + "learning_rate": 9.75942023951735e-05, + "loss": 1.7298, + "step": 2049 + }, + { + "epoch": 0.11426341898444903, + "grad_norm": 0.5188475847244263, + "learning_rate": 9.75914728623487e-05, + "loss": 1.8422, + "step": 2050 + }, + { + "epoch": 0.11431915723761217, + "grad_norm": 0.5141621232032776, + "learning_rate": 9.758874182019986e-05, + "loss": 1.7194, + "step": 2051 + }, + { + "epoch": 0.11437489549077531, + "grad_norm": 0.5103389024734497, + "learning_rate": 9.758600926881358e-05, + "loss": 1.782, + "step": 2052 + }, + { + "epoch": 0.11443063374393847, + "grad_norm": 0.5371511578559875, + "learning_rate": 9.758327520827654e-05, + "loss": 1.8925, + "step": 2053 + }, + { + "epoch": 0.11448637199710161, + "grad_norm": 0.528293788433075, + "learning_rate": 9.758053963867544e-05, + "loss": 1.5632, + "step": 2054 + }, + { + "epoch": 0.11454211025026476, + "grad_norm": 0.5670381784439087, + "learning_rate": 9.757780256009704e-05, + "loss": 2.0612, + "step": 2055 + }, + { + "epoch": 0.1145978485034279, + "grad_norm": 0.4997304677963257, + "learning_rate": 9.757506397262814e-05, + "loss": 1.4963, + "step": 2056 + }, + { + "epoch": 0.11465358675659104, + "grad_norm": 0.5154783129692078, + "learning_rate": 9.757232387635559e-05, + "loss": 1.7024, + "step": 2057 + }, + { + "epoch": 0.1147093250097542, + "grad_norm": 0.5076404213905334, + "learning_rate": 9.75695822713663e-05, + "loss": 1.7356, + "step": 2058 + }, + { + "epoch": 0.11476506326291734, + "grad_norm": 0.5490261912345886, + "learning_rate": 9.75668391577472e-05, + "loss": 1.9454, + "step": 2059 + }, + { + "epoch": 0.11482080151608048, + "grad_norm": 0.49244236946105957, + "learning_rate": 9.756409453558531e-05, + "loss": 1.7741, + "step": 2060 + }, + { + "epoch": 0.11487653976924363, + "grad_norm": 0.5007554292678833, + "learning_rate": 9.756134840496763e-05, + "loss": 1.6877, + "step": 2061 + }, + { + "epoch": 0.11493227802240678, + "grad_norm": 0.5688347816467285, + "learning_rate": 9.75586007659813e-05, + "loss": 1.8947, + "step": 2062 + }, + { + "epoch": 0.11498801627556993, + "grad_norm": 0.49076688289642334, + "learning_rate": 9.755585161871344e-05, + "loss": 1.632, + "step": 2063 + }, + { + "epoch": 0.11504375452873307, + "grad_norm": 0.5263219475746155, + "learning_rate": 9.755310096325123e-05, + "loss": 1.8176, + "step": 2064 + }, + { + "epoch": 0.11509949278189621, + "grad_norm": 0.5379471778869629, + "learning_rate": 9.755034879968193e-05, + "loss": 1.9844, + "step": 2065 + }, + { + "epoch": 0.11515523103505935, + "grad_norm": 0.6128638982772827, + "learning_rate": 9.754759512809277e-05, + "loss": 2.0891, + "step": 2066 + }, + { + "epoch": 0.11521096928822251, + "grad_norm": 0.513877272605896, + "learning_rate": 9.754483994857115e-05, + "loss": 1.7906, + "step": 2067 + }, + { + "epoch": 0.11526670754138565, + "grad_norm": 0.5699423551559448, + "learning_rate": 9.75420832612044e-05, + "loss": 1.9245, + "step": 2068 + }, + { + "epoch": 0.1153224457945488, + "grad_norm": 0.49974846839904785, + "learning_rate": 9.753932506607995e-05, + "loss": 1.5529, + "step": 2069 + }, + { + "epoch": 0.11537818404771194, + "grad_norm": 0.5551686882972717, + "learning_rate": 9.753656536328528e-05, + "loss": 1.7138, + "step": 2070 + }, + { + "epoch": 0.1154339223008751, + "grad_norm": 0.5302468538284302, + "learning_rate": 9.753380415290792e-05, + "loss": 1.7991, + "step": 2071 + }, + { + "epoch": 0.11548966055403824, + "grad_norm": 0.5461943745613098, + "learning_rate": 9.753104143503544e-05, + "loss": 1.6249, + "step": 2072 + }, + { + "epoch": 0.11554539880720138, + "grad_norm": 0.5242646336555481, + "learning_rate": 9.752827720975544e-05, + "loss": 1.7194, + "step": 2073 + }, + { + "epoch": 0.11560113706036453, + "grad_norm": 0.5647328495979309, + "learning_rate": 9.75255114771556e-05, + "loss": 1.6221, + "step": 2074 + }, + { + "epoch": 0.11565687531352767, + "grad_norm": 0.5108300447463989, + "learning_rate": 9.752274423732364e-05, + "loss": 1.5454, + "step": 2075 + }, + { + "epoch": 0.11571261356669083, + "grad_norm": 0.5370137691497803, + "learning_rate": 9.75199754903473e-05, + "loss": 1.8162, + "step": 2076 + }, + { + "epoch": 0.11576835181985397, + "grad_norm": 0.5308608412742615, + "learning_rate": 9.75172052363144e-05, + "loss": 1.8913, + "step": 2077 + }, + { + "epoch": 0.11582409007301711, + "grad_norm": 0.5060725808143616, + "learning_rate": 9.751443347531279e-05, + "loss": 1.6392, + "step": 2078 + }, + { + "epoch": 0.11587982832618025, + "grad_norm": 0.5402329564094543, + "learning_rate": 9.751166020743037e-05, + "loss": 1.6481, + "step": 2079 + }, + { + "epoch": 0.1159355665793434, + "grad_norm": 0.5728126168251038, + "learning_rate": 9.750888543275511e-05, + "loss": 1.7507, + "step": 2080 + }, + { + "epoch": 0.11599130483250655, + "grad_norm": 0.5055838227272034, + "learning_rate": 9.750610915137502e-05, + "loss": 1.7667, + "step": 2081 + }, + { + "epoch": 0.1160470430856697, + "grad_norm": 0.5178690552711487, + "learning_rate": 9.750333136337811e-05, + "loss": 1.7303, + "step": 2082 + }, + { + "epoch": 0.11610278133883284, + "grad_norm": 0.5922085642814636, + "learning_rate": 9.750055206885249e-05, + "loss": 1.9936, + "step": 2083 + }, + { + "epoch": 0.11615851959199598, + "grad_norm": 0.5285540223121643, + "learning_rate": 9.74977712678863e-05, + "loss": 1.8642, + "step": 2084 + }, + { + "epoch": 0.11621425784515914, + "grad_norm": 0.5517610907554626, + "learning_rate": 9.749498896056775e-05, + "loss": 1.8, + "step": 2085 + }, + { + "epoch": 0.11626999609832228, + "grad_norm": 0.519136905670166, + "learning_rate": 9.749220514698505e-05, + "loss": 1.8553, + "step": 2086 + }, + { + "epoch": 0.11632573435148542, + "grad_norm": 0.47392770648002625, + "learning_rate": 9.748941982722652e-05, + "loss": 1.5635, + "step": 2087 + }, + { + "epoch": 0.11638147260464857, + "grad_norm": 0.5580193400382996, + "learning_rate": 9.748663300138046e-05, + "loss": 2.0887, + "step": 2088 + }, + { + "epoch": 0.11643721085781171, + "grad_norm": 0.5110911726951599, + "learning_rate": 9.748384466953529e-05, + "loss": 1.7254, + "step": 2089 + }, + { + "epoch": 0.11649294911097487, + "grad_norm": 0.5411677360534668, + "learning_rate": 9.748105483177939e-05, + "loss": 2.0895, + "step": 2090 + }, + { + "epoch": 0.11654868736413801, + "grad_norm": 0.5149423480033875, + "learning_rate": 9.747826348820129e-05, + "loss": 1.6339, + "step": 2091 + }, + { + "epoch": 0.11660442561730115, + "grad_norm": 0.48806729912757874, + "learning_rate": 9.747547063888947e-05, + "loss": 1.8714, + "step": 2092 + }, + { + "epoch": 0.1166601638704643, + "grad_norm": 0.5147302746772766, + "learning_rate": 9.747267628393252e-05, + "loss": 1.8269, + "step": 2093 + }, + { + "epoch": 0.11671590212362745, + "grad_norm": 0.512217104434967, + "learning_rate": 9.746988042341906e-05, + "loss": 1.7604, + "step": 2094 + }, + { + "epoch": 0.1167716403767906, + "grad_norm": 0.66917484998703, + "learning_rate": 9.746708305743778e-05, + "loss": 2.2348, + "step": 2095 + }, + { + "epoch": 0.11682737862995374, + "grad_norm": 0.5376080870628357, + "learning_rate": 9.746428418607737e-05, + "loss": 1.811, + "step": 2096 + }, + { + "epoch": 0.11688311688311688, + "grad_norm": 0.5490595102310181, + "learning_rate": 9.746148380942661e-05, + "loss": 1.7822, + "step": 2097 + }, + { + "epoch": 0.11693885513628002, + "grad_norm": 0.5195513367652893, + "learning_rate": 9.745868192757429e-05, + "loss": 1.815, + "step": 2098 + }, + { + "epoch": 0.11699459338944318, + "grad_norm": 0.4978055953979492, + "learning_rate": 9.745587854060929e-05, + "loss": 1.6799, + "step": 2099 + }, + { + "epoch": 0.11705033164260632, + "grad_norm": 0.47539737820625305, + "learning_rate": 9.74530736486205e-05, + "loss": 1.3444, + "step": 2100 + }, + { + "epoch": 0.11710606989576947, + "grad_norm": 0.49834421277046204, + "learning_rate": 9.74502672516969e-05, + "loss": 1.6343, + "step": 2101 + }, + { + "epoch": 0.11716180814893261, + "grad_norm": 0.5414234399795532, + "learning_rate": 9.744745934992747e-05, + "loss": 1.8732, + "step": 2102 + }, + { + "epoch": 0.11721754640209577, + "grad_norm": 0.55171799659729, + "learning_rate": 9.744464994340126e-05, + "loss": 1.823, + "step": 2103 + }, + { + "epoch": 0.11727328465525891, + "grad_norm": 0.545732319355011, + "learning_rate": 9.744183903220738e-05, + "loss": 1.6152, + "step": 2104 + }, + { + "epoch": 0.11732902290842205, + "grad_norm": 0.5116435885429382, + "learning_rate": 9.743902661643498e-05, + "loss": 1.8159, + "step": 2105 + }, + { + "epoch": 0.1173847611615852, + "grad_norm": 0.5736915469169617, + "learning_rate": 9.743621269617324e-05, + "loss": 2.0891, + "step": 2106 + }, + { + "epoch": 0.11744049941474834, + "grad_norm": 0.5401880741119385, + "learning_rate": 9.74333972715114e-05, + "loss": 1.6851, + "step": 2107 + }, + { + "epoch": 0.1174962376679115, + "grad_norm": 0.4980708658695221, + "learning_rate": 9.743058034253876e-05, + "loss": 1.7487, + "step": 2108 + }, + { + "epoch": 0.11755197592107464, + "grad_norm": 0.5513383150100708, + "learning_rate": 9.742776190934464e-05, + "loss": 1.7077, + "step": 2109 + }, + { + "epoch": 0.11760771417423778, + "grad_norm": 0.48612821102142334, + "learning_rate": 9.742494197201845e-05, + "loss": 1.7193, + "step": 2110 + }, + { + "epoch": 0.11766345242740092, + "grad_norm": 0.5319970846176147, + "learning_rate": 9.742212053064959e-05, + "loss": 1.8341, + "step": 2111 + }, + { + "epoch": 0.11771919068056406, + "grad_norm": 0.5188704133033752, + "learning_rate": 9.741929758532758e-05, + "loss": 1.7452, + "step": 2112 + }, + { + "epoch": 0.11777492893372722, + "grad_norm": 0.569303035736084, + "learning_rate": 9.741647313614191e-05, + "loss": 1.7242, + "step": 2113 + }, + { + "epoch": 0.11783066718689036, + "grad_norm": 0.5230869650840759, + "learning_rate": 9.741364718318216e-05, + "loss": 1.7484, + "step": 2114 + }, + { + "epoch": 0.11788640544005351, + "grad_norm": 0.5458916425704956, + "learning_rate": 9.741081972653798e-05, + "loss": 1.8975, + "step": 2115 + }, + { + "epoch": 0.11794214369321665, + "grad_norm": 0.5454350113868713, + "learning_rate": 9.740799076629902e-05, + "loss": 1.7848, + "step": 2116 + }, + { + "epoch": 0.1179978819463798, + "grad_norm": 0.5229981541633606, + "learning_rate": 9.7405160302555e-05, + "loss": 1.7087, + "step": 2117 + }, + { + "epoch": 0.11805362019954295, + "grad_norm": 0.5540334582328796, + "learning_rate": 9.740232833539567e-05, + "loss": 1.712, + "step": 2118 + }, + { + "epoch": 0.11810935845270609, + "grad_norm": 0.5371966361999512, + "learning_rate": 9.739949486491088e-05, + "loss": 1.6682, + "step": 2119 + }, + { + "epoch": 0.11816509670586924, + "grad_norm": 0.5578680038452148, + "learning_rate": 9.739665989119047e-05, + "loss": 1.7035, + "step": 2120 + }, + { + "epoch": 0.11822083495903238, + "grad_norm": 0.49404215812683105, + "learning_rate": 9.739382341432434e-05, + "loss": 1.6535, + "step": 2121 + }, + { + "epoch": 0.11827657321219553, + "grad_norm": 0.5198866724967957, + "learning_rate": 9.739098543440246e-05, + "loss": 1.9483, + "step": 2122 + }, + { + "epoch": 0.11833231146535868, + "grad_norm": 0.5561308860778809, + "learning_rate": 9.738814595151481e-05, + "loss": 1.6287, + "step": 2123 + }, + { + "epoch": 0.11838804971852182, + "grad_norm": 0.5929575562477112, + "learning_rate": 9.73853049657515e-05, + "loss": 1.8991, + "step": 2124 + }, + { + "epoch": 0.11844378797168496, + "grad_norm": 0.5198292136192322, + "learning_rate": 9.738246247720257e-05, + "loss": 1.7004, + "step": 2125 + }, + { + "epoch": 0.11849952622484812, + "grad_norm": 0.4800911247730255, + "learning_rate": 9.73796184859582e-05, + "loss": 1.8126, + "step": 2126 + }, + { + "epoch": 0.11855526447801126, + "grad_norm": 0.5122108459472656, + "learning_rate": 9.737677299210857e-05, + "loss": 1.6761, + "step": 2127 + }, + { + "epoch": 0.1186110027311744, + "grad_norm": 0.5015464425086975, + "learning_rate": 9.737392599574391e-05, + "loss": 1.6405, + "step": 2128 + }, + { + "epoch": 0.11866674098433755, + "grad_norm": 0.560658872127533, + "learning_rate": 9.737107749695456e-05, + "loss": 1.8458, + "step": 2129 + }, + { + "epoch": 0.11872247923750069, + "grad_norm": 0.5312667489051819, + "learning_rate": 9.73682274958308e-05, + "loss": 1.9419, + "step": 2130 + }, + { + "epoch": 0.11877821749066385, + "grad_norm": 0.5537664294242859, + "learning_rate": 9.736537599246305e-05, + "loss": 2.0495, + "step": 2131 + }, + { + "epoch": 0.11883395574382699, + "grad_norm": 0.5166563391685486, + "learning_rate": 9.736252298694172e-05, + "loss": 1.7997, + "step": 2132 + }, + { + "epoch": 0.11888969399699013, + "grad_norm": 0.5567119121551514, + "learning_rate": 9.735966847935732e-05, + "loss": 2.0086, + "step": 2133 + }, + { + "epoch": 0.11894543225015328, + "grad_norm": 0.5614973306655884, + "learning_rate": 9.735681246980035e-05, + "loss": 1.8669, + "step": 2134 + }, + { + "epoch": 0.11900117050331642, + "grad_norm": 0.4755729138851166, + "learning_rate": 9.73539549583614e-05, + "loss": 1.4678, + "step": 2135 + }, + { + "epoch": 0.11905690875647958, + "grad_norm": 0.5338446497917175, + "learning_rate": 9.73510959451311e-05, + "loss": 1.758, + "step": 2136 + }, + { + "epoch": 0.11911264700964272, + "grad_norm": 0.5301800966262817, + "learning_rate": 9.734823543020009e-05, + "loss": 1.6377, + "step": 2137 + }, + { + "epoch": 0.11916838526280586, + "grad_norm": 0.5584478378295898, + "learning_rate": 9.734537341365914e-05, + "loss": 1.8973, + "step": 2138 + }, + { + "epoch": 0.119224123515969, + "grad_norm": 0.5499609112739563, + "learning_rate": 9.734250989559896e-05, + "loss": 1.8316, + "step": 2139 + }, + { + "epoch": 0.11927986176913216, + "grad_norm": 0.5567249655723572, + "learning_rate": 9.733964487611042e-05, + "loss": 1.9231, + "step": 2140 + }, + { + "epoch": 0.1193356000222953, + "grad_norm": 0.5121795535087585, + "learning_rate": 9.733677835528434e-05, + "loss": 1.7316, + "step": 2141 + }, + { + "epoch": 0.11939133827545845, + "grad_norm": 0.5235653519630432, + "learning_rate": 9.733391033321164e-05, + "loss": 1.7328, + "step": 2142 + }, + { + "epoch": 0.11944707652862159, + "grad_norm": 0.5482314229011536, + "learning_rate": 9.733104080998329e-05, + "loss": 1.9832, + "step": 2143 + }, + { + "epoch": 0.11950281478178473, + "grad_norm": 0.4945628345012665, + "learning_rate": 9.732816978569028e-05, + "loss": 1.6102, + "step": 2144 + }, + { + "epoch": 0.11955855303494789, + "grad_norm": 0.532642126083374, + "learning_rate": 9.732529726042365e-05, + "loss": 1.6543, + "step": 2145 + }, + { + "epoch": 0.11961429128811103, + "grad_norm": 0.5531574487686157, + "learning_rate": 9.732242323427455e-05, + "loss": 1.8017, + "step": 2146 + }, + { + "epoch": 0.11967002954127418, + "grad_norm": 0.595876932144165, + "learning_rate": 9.731954770733407e-05, + "loss": 2.0041, + "step": 2147 + }, + { + "epoch": 0.11972576779443732, + "grad_norm": 0.5025404095649719, + "learning_rate": 9.731667067969344e-05, + "loss": 1.716, + "step": 2148 + }, + { + "epoch": 0.11978150604760048, + "grad_norm": 0.5070561766624451, + "learning_rate": 9.731379215144388e-05, + "loss": 1.8201, + "step": 2149 + }, + { + "epoch": 0.11983724430076362, + "grad_norm": 0.5182836651802063, + "learning_rate": 9.73109121226767e-05, + "loss": 1.51, + "step": 2150 + }, + { + "epoch": 0.11989298255392676, + "grad_norm": 0.5657908320426941, + "learning_rate": 9.730803059348323e-05, + "loss": 2.0817, + "step": 2151 + }, + { + "epoch": 0.1199487208070899, + "grad_norm": 0.5556692481040955, + "learning_rate": 9.730514756395485e-05, + "loss": 1.854, + "step": 2152 + }, + { + "epoch": 0.12000445906025305, + "grad_norm": 0.4503386616706848, + "learning_rate": 9.7302263034183e-05, + "loss": 1.4719, + "step": 2153 + }, + { + "epoch": 0.1200601973134162, + "grad_norm": 0.5425733327865601, + "learning_rate": 9.729937700425916e-05, + "loss": 1.8686, + "step": 2154 + }, + { + "epoch": 0.12011593556657935, + "grad_norm": 0.5144285559654236, + "learning_rate": 9.729648947427484e-05, + "loss": 1.8232, + "step": 2155 + }, + { + "epoch": 0.12017167381974249, + "grad_norm": 0.5346119999885559, + "learning_rate": 9.729360044432166e-05, + "loss": 1.7735, + "step": 2156 + }, + { + "epoch": 0.12022741207290563, + "grad_norm": 0.5558546185493469, + "learning_rate": 9.729070991449119e-05, + "loss": 1.9485, + "step": 2157 + }, + { + "epoch": 0.12028315032606877, + "grad_norm": 0.495919406414032, + "learning_rate": 9.728781788487513e-05, + "loss": 1.6713, + "step": 2158 + }, + { + "epoch": 0.12033888857923193, + "grad_norm": 0.5348759889602661, + "learning_rate": 9.72849243555652e-05, + "loss": 1.6913, + "step": 2159 + }, + { + "epoch": 0.12039462683239507, + "grad_norm": 0.5228710174560547, + "learning_rate": 9.728202932665316e-05, + "loss": 1.6557, + "step": 2160 + }, + { + "epoch": 0.12045036508555822, + "grad_norm": 0.49766623973846436, + "learning_rate": 9.727913279823081e-05, + "loss": 1.6087, + "step": 2161 + }, + { + "epoch": 0.12050610333872136, + "grad_norm": 0.5042500495910645, + "learning_rate": 9.727623477039005e-05, + "loss": 1.8017, + "step": 2162 + }, + { + "epoch": 0.12056184159188452, + "grad_norm": 0.5221708416938782, + "learning_rate": 9.727333524322274e-05, + "loss": 1.7577, + "step": 2163 + }, + { + "epoch": 0.12061757984504766, + "grad_norm": 0.5310743451118469, + "learning_rate": 9.727043421682087e-05, + "loss": 1.7025, + "step": 2164 + }, + { + "epoch": 0.1206733180982108, + "grad_norm": 0.5771050453186035, + "learning_rate": 9.726753169127643e-05, + "loss": 1.8185, + "step": 2165 + }, + { + "epoch": 0.12072905635137395, + "grad_norm": 0.4827874004840851, + "learning_rate": 9.726462766668147e-05, + "loss": 1.5869, + "step": 2166 + }, + { + "epoch": 0.12078479460453709, + "grad_norm": 0.5001873970031738, + "learning_rate": 9.72617221431281e-05, + "loss": 1.6207, + "step": 2167 + }, + { + "epoch": 0.12084053285770024, + "grad_norm": 0.47895923256874084, + "learning_rate": 9.725881512070845e-05, + "loss": 1.5611, + "step": 2168 + }, + { + "epoch": 0.12089627111086339, + "grad_norm": 0.5227773785591125, + "learning_rate": 9.725590659951473e-05, + "loss": 1.7524, + "step": 2169 + }, + { + "epoch": 0.12095200936402653, + "grad_norm": 0.5513851046562195, + "learning_rate": 9.725299657963916e-05, + "loss": 1.9093, + "step": 2170 + }, + { + "epoch": 0.12100774761718967, + "grad_norm": 0.5206924080848694, + "learning_rate": 9.725008506117405e-05, + "loss": 1.6196, + "step": 2171 + }, + { + "epoch": 0.12106348587035283, + "grad_norm": 0.5124804377555847, + "learning_rate": 9.724717204421175e-05, + "loss": 1.5592, + "step": 2172 + }, + { + "epoch": 0.12111922412351597, + "grad_norm": 0.49579185247421265, + "learning_rate": 9.724425752884458e-05, + "loss": 1.7796, + "step": 2173 + }, + { + "epoch": 0.12117496237667912, + "grad_norm": 0.4806743562221527, + "learning_rate": 9.724134151516504e-05, + "loss": 1.5684, + "step": 2174 + }, + { + "epoch": 0.12123070062984226, + "grad_norm": 0.5735479593276978, + "learning_rate": 9.72384240032656e-05, + "loss": 1.9183, + "step": 2175 + }, + { + "epoch": 0.1212864388830054, + "grad_norm": 0.49125027656555176, + "learning_rate": 9.723550499323874e-05, + "loss": 1.5609, + "step": 2176 + }, + { + "epoch": 0.12134217713616856, + "grad_norm": 0.5535476207733154, + "learning_rate": 9.723258448517707e-05, + "loss": 1.8593, + "step": 2177 + }, + { + "epoch": 0.1213979153893317, + "grad_norm": 0.5923840403556824, + "learning_rate": 9.722966247917322e-05, + "loss": 1.8673, + "step": 2178 + }, + { + "epoch": 0.12145365364249484, + "grad_norm": 0.5120698809623718, + "learning_rate": 9.722673897531983e-05, + "loss": 1.6219, + "step": 2179 + }, + { + "epoch": 0.12150939189565799, + "grad_norm": 0.5636369585990906, + "learning_rate": 9.722381397370963e-05, + "loss": 1.9298, + "step": 2180 + }, + { + "epoch": 0.12156513014882113, + "grad_norm": 0.5421077609062195, + "learning_rate": 9.722088747443539e-05, + "loss": 1.4028, + "step": 2181 + }, + { + "epoch": 0.12162086840198429, + "grad_norm": 0.5058643817901611, + "learning_rate": 9.721795947758991e-05, + "loss": 1.6988, + "step": 2182 + }, + { + "epoch": 0.12167660665514743, + "grad_norm": 0.5012438297271729, + "learning_rate": 9.721502998326607e-05, + "loss": 1.6624, + "step": 2183 + }, + { + "epoch": 0.12173234490831057, + "grad_norm": 0.47187769412994385, + "learning_rate": 9.721209899155675e-05, + "loss": 1.5275, + "step": 2184 + }, + { + "epoch": 0.12178808316147371, + "grad_norm": 0.525303065776825, + "learning_rate": 9.720916650255492e-05, + "loss": 1.7458, + "step": 2185 + }, + { + "epoch": 0.12184382141463687, + "grad_norm": 0.586681604385376, + "learning_rate": 9.720623251635357e-05, + "loss": 1.7205, + "step": 2186 + }, + { + "epoch": 0.12189955966780001, + "grad_norm": 0.5550994873046875, + "learning_rate": 9.720329703304577e-05, + "loss": 1.6508, + "step": 2187 + }, + { + "epoch": 0.12195529792096316, + "grad_norm": 0.5518259406089783, + "learning_rate": 9.720036005272459e-05, + "loss": 1.7847, + "step": 2188 + }, + { + "epoch": 0.1220110361741263, + "grad_norm": 0.4833231270313263, + "learning_rate": 9.719742157548319e-05, + "loss": 1.578, + "step": 2189 + }, + { + "epoch": 0.12206677442728944, + "grad_norm": 0.5002262592315674, + "learning_rate": 9.719448160141476e-05, + "loss": 1.7526, + "step": 2190 + }, + { + "epoch": 0.1221225126804526, + "grad_norm": 0.4701862335205078, + "learning_rate": 9.719154013061253e-05, + "loss": 1.369, + "step": 2191 + }, + { + "epoch": 0.12217825093361574, + "grad_norm": 0.5255539417266846, + "learning_rate": 9.71885971631698e-05, + "loss": 1.9266, + "step": 2192 + }, + { + "epoch": 0.12223398918677889, + "grad_norm": 0.5181805491447449, + "learning_rate": 9.71856526991799e-05, + "loss": 1.8049, + "step": 2193 + }, + { + "epoch": 0.12228972743994203, + "grad_norm": 0.5119277834892273, + "learning_rate": 9.71827067387362e-05, + "loss": 1.6141, + "step": 2194 + }, + { + "epoch": 0.12234546569310518, + "grad_norm": 0.46822264790534973, + "learning_rate": 9.717975928193214e-05, + "loss": 1.4462, + "step": 2195 + }, + { + "epoch": 0.12240120394626833, + "grad_norm": 0.5520098209381104, + "learning_rate": 9.717681032886119e-05, + "loss": 1.7872, + "step": 2196 + }, + { + "epoch": 0.12245694219943147, + "grad_norm": 0.5204572677612305, + "learning_rate": 9.717385987961686e-05, + "loss": 1.7539, + "step": 2197 + }, + { + "epoch": 0.12251268045259461, + "grad_norm": 0.5343250036239624, + "learning_rate": 9.717090793429276e-05, + "loss": 1.8575, + "step": 2198 + }, + { + "epoch": 0.12256841870575776, + "grad_norm": 0.521108865737915, + "learning_rate": 9.716795449298248e-05, + "loss": 1.9104, + "step": 2199 + }, + { + "epoch": 0.12262415695892091, + "grad_norm": 0.49352675676345825, + "learning_rate": 9.71649995557797e-05, + "loss": 1.6201, + "step": 2200 + }, + { + "epoch": 0.12267989521208406, + "grad_norm": 0.5716384649276733, + "learning_rate": 9.716204312277812e-05, + "loss": 1.928, + "step": 2201 + }, + { + "epoch": 0.1227356334652472, + "grad_norm": 0.5332071781158447, + "learning_rate": 9.715908519407149e-05, + "loss": 1.6348, + "step": 2202 + }, + { + "epoch": 0.12279137171841034, + "grad_norm": 0.5008523464202881, + "learning_rate": 9.715612576975366e-05, + "loss": 1.8211, + "step": 2203 + }, + { + "epoch": 0.12284710997157348, + "grad_norm": 0.5112088322639465, + "learning_rate": 9.715316484991845e-05, + "loss": 1.8334, + "step": 2204 + }, + { + "epoch": 0.12290284822473664, + "grad_norm": 0.5519534349441528, + "learning_rate": 9.715020243465976e-05, + "loss": 1.8001, + "step": 2205 + }, + { + "epoch": 0.12295858647789978, + "grad_norm": 0.46493321657180786, + "learning_rate": 9.714723852407157e-05, + "loss": 1.4173, + "step": 2206 + }, + { + "epoch": 0.12301432473106293, + "grad_norm": 0.5702951550483704, + "learning_rate": 9.714427311824786e-05, + "loss": 1.7186, + "step": 2207 + }, + { + "epoch": 0.12307006298422607, + "grad_norm": 0.5255847573280334, + "learning_rate": 9.714130621728266e-05, + "loss": 1.6884, + "step": 2208 + }, + { + "epoch": 0.12312580123738923, + "grad_norm": 0.581146776676178, + "learning_rate": 9.713833782127008e-05, + "loss": 1.8707, + "step": 2209 + }, + { + "epoch": 0.12318153949055237, + "grad_norm": 0.5044531226158142, + "learning_rate": 9.713536793030429e-05, + "loss": 1.555, + "step": 2210 + }, + { + "epoch": 0.12323727774371551, + "grad_norm": 0.543787956237793, + "learning_rate": 9.713239654447943e-05, + "loss": 1.8188, + "step": 2211 + }, + { + "epoch": 0.12329301599687865, + "grad_norm": 0.6438772678375244, + "learning_rate": 9.712942366388975e-05, + "loss": 1.8096, + "step": 2212 + }, + { + "epoch": 0.1233487542500418, + "grad_norm": 0.5758397579193115, + "learning_rate": 9.712644928862953e-05, + "loss": 1.8329, + "step": 2213 + }, + { + "epoch": 0.12340449250320495, + "grad_norm": 0.5573188066482544, + "learning_rate": 9.712347341879311e-05, + "loss": 1.8994, + "step": 2214 + }, + { + "epoch": 0.1234602307563681, + "grad_norm": 0.5477108359336853, + "learning_rate": 9.712049605447486e-05, + "loss": 1.8856, + "step": 2215 + }, + { + "epoch": 0.12351596900953124, + "grad_norm": 0.5133275985717773, + "learning_rate": 9.711751719576922e-05, + "loss": 1.7319, + "step": 2216 + }, + { + "epoch": 0.12357170726269438, + "grad_norm": 0.5406665802001953, + "learning_rate": 9.711453684277063e-05, + "loss": 1.9889, + "step": 2217 + }, + { + "epoch": 0.12362744551585754, + "grad_norm": 0.48421719670295715, + "learning_rate": 9.711155499557364e-05, + "loss": 1.5177, + "step": 2218 + }, + { + "epoch": 0.12368318376902068, + "grad_norm": 0.5295604467391968, + "learning_rate": 9.710857165427281e-05, + "loss": 1.5376, + "step": 2219 + }, + { + "epoch": 0.12373892202218383, + "grad_norm": 0.5241243243217468, + "learning_rate": 9.710558681896274e-05, + "loss": 1.7389, + "step": 2220 + }, + { + "epoch": 0.12379466027534697, + "grad_norm": 0.48620593547821045, + "learning_rate": 9.71026004897381e-05, + "loss": 1.7281, + "step": 2221 + }, + { + "epoch": 0.12385039852851011, + "grad_norm": 0.5162755846977234, + "learning_rate": 9.70996126666936e-05, + "loss": 1.6421, + "step": 2222 + }, + { + "epoch": 0.12390613678167327, + "grad_norm": 0.5603106021881104, + "learning_rate": 9.7096623349924e-05, + "loss": 2.0405, + "step": 2223 + }, + { + "epoch": 0.12396187503483641, + "grad_norm": 0.5636157393455505, + "learning_rate": 9.70936325395241e-05, + "loss": 1.7629, + "step": 2224 + }, + { + "epoch": 0.12401761328799955, + "grad_norm": 0.5287961363792419, + "learning_rate": 9.709064023558874e-05, + "loss": 1.7357, + "step": 2225 + }, + { + "epoch": 0.1240733515411627, + "grad_norm": 0.5584306120872498, + "learning_rate": 9.708764643821284e-05, + "loss": 1.905, + "step": 2226 + }, + { + "epoch": 0.12412908979432584, + "grad_norm": 0.5021309852600098, + "learning_rate": 9.708465114749132e-05, + "loss": 1.7439, + "step": 2227 + }, + { + "epoch": 0.124184828047489, + "grad_norm": 0.5482348799705505, + "learning_rate": 9.708165436351921e-05, + "loss": 1.7851, + "step": 2228 + }, + { + "epoch": 0.12424056630065214, + "grad_norm": 0.498470276594162, + "learning_rate": 9.707865608639152e-05, + "loss": 1.494, + "step": 2229 + }, + { + "epoch": 0.12429630455381528, + "grad_norm": 0.5526018142700195, + "learning_rate": 9.707565631620334e-05, + "loss": 1.973, + "step": 2230 + }, + { + "epoch": 0.12435204280697842, + "grad_norm": 0.5773054957389832, + "learning_rate": 9.707265505304982e-05, + "loss": 1.9693, + "step": 2231 + }, + { + "epoch": 0.12440778106014158, + "grad_norm": 0.5307757258415222, + "learning_rate": 9.706965229702614e-05, + "loss": 1.8978, + "step": 2232 + }, + { + "epoch": 0.12446351931330472, + "grad_norm": 0.5740475654602051, + "learning_rate": 9.70666480482275e-05, + "loss": 2.0298, + "step": 2233 + }, + { + "epoch": 0.12451925756646787, + "grad_norm": 0.5156608819961548, + "learning_rate": 9.706364230674923e-05, + "loss": 1.5383, + "step": 2234 + }, + { + "epoch": 0.12457499581963101, + "grad_norm": 0.4921102225780487, + "learning_rate": 9.706063507268661e-05, + "loss": 1.6472, + "step": 2235 + }, + { + "epoch": 0.12463073407279415, + "grad_norm": 0.5701449513435364, + "learning_rate": 9.705762634613502e-05, + "loss": 1.7692, + "step": 2236 + }, + { + "epoch": 0.12468647232595731, + "grad_norm": 0.49713411927223206, + "learning_rate": 9.705461612718991e-05, + "loss": 1.5998, + "step": 2237 + }, + { + "epoch": 0.12474221057912045, + "grad_norm": 0.5252828598022461, + "learning_rate": 9.705160441594671e-05, + "loss": 1.6545, + "step": 2238 + }, + { + "epoch": 0.1247979488322836, + "grad_norm": 0.543063759803772, + "learning_rate": 9.704859121250095e-05, + "loss": 1.8984, + "step": 2239 + }, + { + "epoch": 0.12485368708544674, + "grad_norm": 0.5450255274772644, + "learning_rate": 9.704557651694818e-05, + "loss": 1.7794, + "step": 2240 + }, + { + "epoch": 0.1249094253386099, + "grad_norm": 0.4936400353908539, + "learning_rate": 9.704256032938403e-05, + "loss": 1.4191, + "step": 2241 + }, + { + "epoch": 0.12496516359177304, + "grad_norm": 0.5075535774230957, + "learning_rate": 9.703954264990414e-05, + "loss": 1.7634, + "step": 2242 + }, + { + "epoch": 0.12502090184493617, + "grad_norm": 0.5337166786193848, + "learning_rate": 9.703652347860422e-05, + "loss": 1.9257, + "step": 2243 + }, + { + "epoch": 0.12507664009809932, + "grad_norm": 0.5265361666679382, + "learning_rate": 9.703350281558002e-05, + "loss": 1.8102, + "step": 2244 + }, + { + "epoch": 0.12513237835126248, + "grad_norm": 0.5706486701965332, + "learning_rate": 9.703048066092733e-05, + "loss": 2.1658, + "step": 2245 + }, + { + "epoch": 0.1251881166044256, + "grad_norm": 0.5012516975402832, + "learning_rate": 9.7027457014742e-05, + "loss": 1.6586, + "step": 2246 + }, + { + "epoch": 0.12524385485758877, + "grad_norm": 0.5617608428001404, + "learning_rate": 9.702443187711992e-05, + "loss": 1.7678, + "step": 2247 + }, + { + "epoch": 0.12529959311075192, + "grad_norm": 0.5820160508155823, + "learning_rate": 9.702140524815704e-05, + "loss": 1.848, + "step": 2248 + }, + { + "epoch": 0.12535533136391505, + "grad_norm": 0.5511069297790527, + "learning_rate": 9.701837712794932e-05, + "loss": 1.8369, + "step": 2249 + }, + { + "epoch": 0.1254110696170782, + "grad_norm": 0.5301650166511536, + "learning_rate": 9.701534751659283e-05, + "loss": 1.8621, + "step": 2250 + }, + { + "epoch": 0.12546680787024134, + "grad_norm": 0.519693911075592, + "learning_rate": 9.701231641418363e-05, + "loss": 1.7069, + "step": 2251 + }, + { + "epoch": 0.1255225461234045, + "grad_norm": 0.5177733302116394, + "learning_rate": 9.700928382081786e-05, + "loss": 1.7311, + "step": 2252 + }, + { + "epoch": 0.12557828437656765, + "grad_norm": 0.5452710390090942, + "learning_rate": 9.700624973659169e-05, + "loss": 1.6022, + "step": 2253 + }, + { + "epoch": 0.12563402262973078, + "grad_norm": 0.49126002192497253, + "learning_rate": 9.700321416160134e-05, + "loss": 1.6004, + "step": 2254 + }, + { + "epoch": 0.12568976088289394, + "grad_norm": 0.4859536290168762, + "learning_rate": 9.70001770959431e-05, + "loss": 1.627, + "step": 2255 + }, + { + "epoch": 0.12574549913605707, + "grad_norm": 0.5808461308479309, + "learning_rate": 9.699713853971324e-05, + "loss": 1.9893, + "step": 2256 + }, + { + "epoch": 0.12580123738922022, + "grad_norm": 0.5044426321983337, + "learning_rate": 9.699409849300818e-05, + "loss": 1.6269, + "step": 2257 + }, + { + "epoch": 0.12585697564238338, + "grad_norm": 0.5458354353904724, + "learning_rate": 9.69910569559243e-05, + "loss": 1.6803, + "step": 2258 + }, + { + "epoch": 0.1259127138955465, + "grad_norm": 0.5350721478462219, + "learning_rate": 9.698801392855808e-05, + "loss": 1.7217, + "step": 2259 + }, + { + "epoch": 0.12596845214870966, + "grad_norm": 0.511223554611206, + "learning_rate": 9.698496941100601e-05, + "loss": 1.6904, + "step": 2260 + }, + { + "epoch": 0.1260241904018728, + "grad_norm": 0.46969008445739746, + "learning_rate": 9.698192340336468e-05, + "loss": 1.5411, + "step": 2261 + }, + { + "epoch": 0.12607992865503595, + "grad_norm": 0.5638684630393982, + "learning_rate": 9.697887590573063e-05, + "loss": 1.6144, + "step": 2262 + }, + { + "epoch": 0.1261356669081991, + "grad_norm": 0.5146279335021973, + "learning_rate": 9.697582691820054e-05, + "loss": 1.605, + "step": 2263 + }, + { + "epoch": 0.12619140516136224, + "grad_norm": 0.46321019530296326, + "learning_rate": 9.697277644087113e-05, + "loss": 1.0444, + "step": 2264 + }, + { + "epoch": 0.1262471434145254, + "grad_norm": 0.5038657784461975, + "learning_rate": 9.69697244738391e-05, + "loss": 1.7319, + "step": 2265 + }, + { + "epoch": 0.12630288166768852, + "grad_norm": 0.593559205532074, + "learning_rate": 9.696667101720127e-05, + "loss": 1.9173, + "step": 2266 + }, + { + "epoch": 0.12635861992085168, + "grad_norm": 0.5412843227386475, + "learning_rate": 9.696361607105448e-05, + "loss": 1.6603, + "step": 2267 + }, + { + "epoch": 0.12641435817401483, + "grad_norm": 0.5422548055648804, + "learning_rate": 9.69605596354956e-05, + "loss": 1.7048, + "step": 2268 + }, + { + "epoch": 0.12647009642717796, + "grad_norm": 0.5455138087272644, + "learning_rate": 9.695750171062156e-05, + "loss": 1.669, + "step": 2269 + }, + { + "epoch": 0.12652583468034112, + "grad_norm": 0.5468176007270813, + "learning_rate": 9.695444229652935e-05, + "loss": 1.6744, + "step": 2270 + }, + { + "epoch": 0.12658157293350428, + "grad_norm": 0.49385011196136475, + "learning_rate": 9.6951381393316e-05, + "loss": 1.6182, + "step": 2271 + }, + { + "epoch": 0.1266373111866674, + "grad_norm": 0.5301021933555603, + "learning_rate": 9.694831900107857e-05, + "loss": 1.7818, + "step": 2272 + }, + { + "epoch": 0.12669304943983056, + "grad_norm": 0.6178646087646484, + "learning_rate": 9.69452551199142e-05, + "loss": 1.9646, + "step": 2273 + }, + { + "epoch": 0.1267487876929937, + "grad_norm": 0.5421885848045349, + "learning_rate": 9.694218974992005e-05, + "loss": 1.6862, + "step": 2274 + }, + { + "epoch": 0.12680452594615685, + "grad_norm": 0.5251665115356445, + "learning_rate": 9.693912289119332e-05, + "loss": 1.7259, + "step": 2275 + }, + { + "epoch": 0.12686026419932, + "grad_norm": 0.5069818496704102, + "learning_rate": 9.693605454383128e-05, + "loss": 1.8426, + "step": 2276 + }, + { + "epoch": 0.12691600245248313, + "grad_norm": 0.5525764226913452, + "learning_rate": 9.693298470793126e-05, + "loss": 1.9999, + "step": 2277 + }, + { + "epoch": 0.1269717407056463, + "grad_norm": 0.5717039108276367, + "learning_rate": 9.69299133835906e-05, + "loss": 1.736, + "step": 2278 + }, + { + "epoch": 0.12702747895880942, + "grad_norm": 0.4768933057785034, + "learning_rate": 9.69268405709067e-05, + "loss": 1.4284, + "step": 2279 + }, + { + "epoch": 0.12708321721197258, + "grad_norm": 0.5677302479743958, + "learning_rate": 9.692376626997703e-05, + "loss": 1.8972, + "step": 2280 + }, + { + "epoch": 0.12713895546513573, + "grad_norm": 0.5202549695968628, + "learning_rate": 9.692069048089907e-05, + "loss": 1.6173, + "step": 2281 + }, + { + "epoch": 0.12719469371829886, + "grad_norm": 0.5106683373451233, + "learning_rate": 9.691761320377037e-05, + "loss": 1.5599, + "step": 2282 + }, + { + "epoch": 0.12725043197146202, + "grad_norm": 0.5042096376419067, + "learning_rate": 9.691453443868854e-05, + "loss": 1.7705, + "step": 2283 + }, + { + "epoch": 0.12730617022462515, + "grad_norm": 0.5391340255737305, + "learning_rate": 9.691145418575122e-05, + "loss": 1.9065, + "step": 2284 + }, + { + "epoch": 0.1273619084777883, + "grad_norm": 0.5074059963226318, + "learning_rate": 9.690837244505607e-05, + "loss": 1.7623, + "step": 2285 + }, + { + "epoch": 0.12741764673095146, + "grad_norm": 0.5277912616729736, + "learning_rate": 9.690528921670084e-05, + "loss": 1.7758, + "step": 2286 + }, + { + "epoch": 0.1274733849841146, + "grad_norm": 0.5068628787994385, + "learning_rate": 9.69022045007833e-05, + "loss": 1.6409, + "step": 2287 + }, + { + "epoch": 0.12752912323727775, + "grad_norm": 0.5209136009216309, + "learning_rate": 9.689911829740133e-05, + "loss": 1.6144, + "step": 2288 + }, + { + "epoch": 0.12758486149044088, + "grad_norm": 0.5280535221099854, + "learning_rate": 9.689603060665273e-05, + "loss": 1.8711, + "step": 2289 + }, + { + "epoch": 0.12764059974360403, + "grad_norm": 0.5511658191680908, + "learning_rate": 9.689294142863548e-05, + "loss": 1.8228, + "step": 2290 + }, + { + "epoch": 0.1276963379967672, + "grad_norm": 0.5436153411865234, + "learning_rate": 9.688985076344754e-05, + "loss": 1.696, + "step": 2291 + }, + { + "epoch": 0.12775207624993032, + "grad_norm": 0.5065414309501648, + "learning_rate": 9.68867586111869e-05, + "loss": 1.6989, + "step": 2292 + }, + { + "epoch": 0.12780781450309348, + "grad_norm": 0.5280441045761108, + "learning_rate": 9.688366497195166e-05, + "loss": 1.6764, + "step": 2293 + }, + { + "epoch": 0.12786355275625663, + "grad_norm": 0.46777546405792236, + "learning_rate": 9.68805698458399e-05, + "loss": 1.4595, + "step": 2294 + }, + { + "epoch": 0.12791929100941976, + "grad_norm": 0.5001897811889648, + "learning_rate": 9.687747323294982e-05, + "loss": 1.4642, + "step": 2295 + }, + { + "epoch": 0.12797502926258292, + "grad_norm": 0.5615783929824829, + "learning_rate": 9.687437513337961e-05, + "loss": 1.7116, + "step": 2296 + }, + { + "epoch": 0.12803076751574605, + "grad_norm": 0.5208621621131897, + "learning_rate": 9.687127554722749e-05, + "loss": 1.637, + "step": 2297 + }, + { + "epoch": 0.1280865057689092, + "grad_norm": 0.5435874462127686, + "learning_rate": 9.68681744745918e-05, + "loss": 1.7629, + "step": 2298 + }, + { + "epoch": 0.12814224402207236, + "grad_norm": 0.5296335220336914, + "learning_rate": 9.686507191557089e-05, + "loss": 1.827, + "step": 2299 + }, + { + "epoch": 0.1281979822752355, + "grad_norm": 0.5191251635551453, + "learning_rate": 9.686196787026311e-05, + "loss": 1.9385, + "step": 2300 + }, + { + "epoch": 0.12825372052839865, + "grad_norm": 0.5494365096092224, + "learning_rate": 9.685886233876695e-05, + "loss": 1.8378, + "step": 2301 + }, + { + "epoch": 0.12830945878156177, + "grad_norm": 0.583207905292511, + "learning_rate": 9.685575532118089e-05, + "loss": 1.6812, + "step": 2302 + }, + { + "epoch": 0.12836519703472493, + "grad_norm": 0.5473710894584656, + "learning_rate": 9.685264681760345e-05, + "loss": 1.9602, + "step": 2303 + }, + { + "epoch": 0.1284209352878881, + "grad_norm": 0.567272424697876, + "learning_rate": 9.684953682813322e-05, + "loss": 1.8125, + "step": 2304 + }, + { + "epoch": 0.12847667354105122, + "grad_norm": 0.4732169806957245, + "learning_rate": 9.684642535286885e-05, + "loss": 1.5566, + "step": 2305 + }, + { + "epoch": 0.12853241179421437, + "grad_norm": 0.516720712184906, + "learning_rate": 9.684331239190899e-05, + "loss": 1.5688, + "step": 2306 + }, + { + "epoch": 0.1285881500473775, + "grad_norm": 0.5574965476989746, + "learning_rate": 9.684019794535237e-05, + "loss": 1.7452, + "step": 2307 + }, + { + "epoch": 0.12864388830054066, + "grad_norm": 0.5443317294120789, + "learning_rate": 9.683708201329777e-05, + "loss": 1.6624, + "step": 2308 + }, + { + "epoch": 0.12869962655370382, + "grad_norm": 0.5809649229049683, + "learning_rate": 9.683396459584404e-05, + "loss": 1.7721, + "step": 2309 + }, + { + "epoch": 0.12875536480686695, + "grad_norm": 0.5913598537445068, + "learning_rate": 9.683084569308997e-05, + "loss": 2.1623, + "step": 2310 + }, + { + "epoch": 0.1288111030600301, + "grad_norm": 0.5404501557350159, + "learning_rate": 9.682772530513453e-05, + "loss": 1.7165, + "step": 2311 + }, + { + "epoch": 0.12886684131319323, + "grad_norm": 0.4902174174785614, + "learning_rate": 9.682460343207669e-05, + "loss": 1.6391, + "step": 2312 + }, + { + "epoch": 0.1289225795663564, + "grad_norm": 0.5791998505592346, + "learning_rate": 9.682148007401541e-05, + "loss": 1.891, + "step": 2313 + }, + { + "epoch": 0.12897831781951954, + "grad_norm": 0.5695587992668152, + "learning_rate": 9.681835523104978e-05, + "loss": 1.9901, + "step": 2314 + }, + { + "epoch": 0.12903405607268267, + "grad_norm": 0.6025593876838684, + "learning_rate": 9.681522890327889e-05, + "loss": 1.7748, + "step": 2315 + }, + { + "epoch": 0.12908979432584583, + "grad_norm": 0.5111005902290344, + "learning_rate": 9.681210109080189e-05, + "loss": 1.6, + "step": 2316 + }, + { + "epoch": 0.129145532579009, + "grad_norm": 0.533204972743988, + "learning_rate": 9.680897179371798e-05, + "loss": 1.6863, + "step": 2317 + }, + { + "epoch": 0.12920127083217212, + "grad_norm": 0.5172824859619141, + "learning_rate": 9.68058410121264e-05, + "loss": 1.7456, + "step": 2318 + }, + { + "epoch": 0.12925700908533527, + "grad_norm": 0.5905986428260803, + "learning_rate": 9.680270874612643e-05, + "loss": 1.572, + "step": 2319 + }, + { + "epoch": 0.1293127473384984, + "grad_norm": 0.5090576410293579, + "learning_rate": 9.679957499581742e-05, + "loss": 1.7946, + "step": 2320 + }, + { + "epoch": 0.12936848559166156, + "grad_norm": 0.5587893724441528, + "learning_rate": 9.679643976129876e-05, + "loss": 1.7792, + "step": 2321 + }, + { + "epoch": 0.12942422384482472, + "grad_norm": 0.6383116841316223, + "learning_rate": 9.679330304266988e-05, + "loss": 2.0051, + "step": 2322 + }, + { + "epoch": 0.12947996209798784, + "grad_norm": 0.5700294375419617, + "learning_rate": 9.679016484003023e-05, + "loss": 1.8419, + "step": 2323 + }, + { + "epoch": 0.129535700351151, + "grad_norm": 0.6416967511177063, + "learning_rate": 9.678702515347938e-05, + "loss": 1.7893, + "step": 2324 + }, + { + "epoch": 0.12959143860431413, + "grad_norm": 0.5761459469795227, + "learning_rate": 9.678388398311686e-05, + "loss": 1.8868, + "step": 2325 + }, + { + "epoch": 0.1296471768574773, + "grad_norm": 0.5779362320899963, + "learning_rate": 9.678074132904231e-05, + "loss": 1.6472, + "step": 2326 + }, + { + "epoch": 0.12970291511064044, + "grad_norm": 0.5250251293182373, + "learning_rate": 9.677759719135542e-05, + "loss": 1.8353, + "step": 2327 + }, + { + "epoch": 0.12975865336380357, + "grad_norm": 0.5306884050369263, + "learning_rate": 9.677445157015585e-05, + "loss": 1.8419, + "step": 2328 + }, + { + "epoch": 0.12981439161696673, + "grad_norm": 0.5761096477508545, + "learning_rate": 9.67713044655434e-05, + "loss": 1.846, + "step": 2329 + }, + { + "epoch": 0.12987012987012986, + "grad_norm": 0.5438225269317627, + "learning_rate": 9.676815587761787e-05, + "loss": 1.734, + "step": 2330 + }, + { + "epoch": 0.12992586812329301, + "grad_norm": 0.5154998898506165, + "learning_rate": 9.676500580647912e-05, + "loss": 1.8124, + "step": 2331 + }, + { + "epoch": 0.12998160637645617, + "grad_norm": 0.5288179516792297, + "learning_rate": 9.676185425222704e-05, + "loss": 2.0132, + "step": 2332 + }, + { + "epoch": 0.1300373446296193, + "grad_norm": 0.5507707595825195, + "learning_rate": 9.675870121496158e-05, + "loss": 1.7686, + "step": 2333 + }, + { + "epoch": 0.13009308288278246, + "grad_norm": 0.4893222451210022, + "learning_rate": 9.675554669478272e-05, + "loss": 1.8113, + "step": 2334 + }, + { + "epoch": 0.13014882113594559, + "grad_norm": 0.5455611944198608, + "learning_rate": 9.675239069179056e-05, + "loss": 1.7593, + "step": 2335 + }, + { + "epoch": 0.13020455938910874, + "grad_norm": 0.5068415403366089, + "learning_rate": 9.674923320608513e-05, + "loss": 1.5302, + "step": 2336 + }, + { + "epoch": 0.1302602976422719, + "grad_norm": 0.5160056948661804, + "learning_rate": 9.674607423776661e-05, + "loss": 1.5793, + "step": 2337 + }, + { + "epoch": 0.13031603589543503, + "grad_norm": 0.5414824485778809, + "learning_rate": 9.674291378693515e-05, + "loss": 1.6392, + "step": 2338 + }, + { + "epoch": 0.13037177414859819, + "grad_norm": 0.5210713744163513, + "learning_rate": 9.673975185369098e-05, + "loss": 1.9403, + "step": 2339 + }, + { + "epoch": 0.13042751240176134, + "grad_norm": 0.5296798944473267, + "learning_rate": 9.673658843813442e-05, + "loss": 1.7093, + "step": 2340 + }, + { + "epoch": 0.13048325065492447, + "grad_norm": 0.5705276131629944, + "learning_rate": 9.673342354036574e-05, + "loss": 1.7645, + "step": 2341 + }, + { + "epoch": 0.13053898890808763, + "grad_norm": 0.5289913415908813, + "learning_rate": 9.673025716048536e-05, + "loss": 1.81, + "step": 2342 + }, + { + "epoch": 0.13059472716125076, + "grad_norm": 0.5237072706222534, + "learning_rate": 9.672708929859368e-05, + "loss": 2.0053, + "step": 2343 + }, + { + "epoch": 0.1306504654144139, + "grad_norm": 0.5144554376602173, + "learning_rate": 9.672391995479115e-05, + "loss": 1.7236, + "step": 2344 + }, + { + "epoch": 0.13070620366757707, + "grad_norm": 0.5384603142738342, + "learning_rate": 9.672074912917831e-05, + "loss": 1.7492, + "step": 2345 + }, + { + "epoch": 0.1307619419207402, + "grad_norm": 0.5475570559501648, + "learning_rate": 9.67175768218557e-05, + "loss": 1.9068, + "step": 2346 + }, + { + "epoch": 0.13081768017390336, + "grad_norm": 0.512937068939209, + "learning_rate": 9.671440303292395e-05, + "loss": 1.7364, + "step": 2347 + }, + { + "epoch": 0.13087341842706648, + "grad_norm": 0.48609036207199097, + "learning_rate": 9.67112277624837e-05, + "loss": 1.5916, + "step": 2348 + }, + { + "epoch": 0.13092915668022964, + "grad_norm": 0.5132019519805908, + "learning_rate": 9.670805101063563e-05, + "loss": 1.7222, + "step": 2349 + }, + { + "epoch": 0.1309848949333928, + "grad_norm": 0.5112780928611755, + "learning_rate": 9.670487277748052e-05, + "loss": 1.6418, + "step": 2350 + }, + { + "epoch": 0.13104063318655593, + "grad_norm": 0.531306803226471, + "learning_rate": 9.670169306311916e-05, + "loss": 1.7323, + "step": 2351 + }, + { + "epoch": 0.13109637143971908, + "grad_norm": 0.48118212819099426, + "learning_rate": 9.669851186765238e-05, + "loss": 1.4822, + "step": 2352 + }, + { + "epoch": 0.1311521096928822, + "grad_norm": 0.5309464931488037, + "learning_rate": 9.669532919118108e-05, + "loss": 1.767, + "step": 2353 + }, + { + "epoch": 0.13120784794604537, + "grad_norm": 0.532576322555542, + "learning_rate": 9.669214503380617e-05, + "loss": 1.7228, + "step": 2354 + }, + { + "epoch": 0.13126358619920853, + "grad_norm": 0.49597617983818054, + "learning_rate": 9.668895939562868e-05, + "loss": 1.4792, + "step": 2355 + }, + { + "epoch": 0.13131932445237166, + "grad_norm": 0.5480032563209534, + "learning_rate": 9.66857722767496e-05, + "loss": 1.7285, + "step": 2356 + }, + { + "epoch": 0.1313750627055348, + "grad_norm": 0.5191400647163391, + "learning_rate": 9.668258367727002e-05, + "loss": 1.5942, + "step": 2357 + }, + { + "epoch": 0.13143080095869794, + "grad_norm": 0.5335458517074585, + "learning_rate": 9.667939359729109e-05, + "loss": 1.8991, + "step": 2358 + }, + { + "epoch": 0.1314865392118611, + "grad_norm": 0.5872248411178589, + "learning_rate": 9.667620203691393e-05, + "loss": 1.8247, + "step": 2359 + }, + { + "epoch": 0.13154227746502425, + "grad_norm": 0.5811527967453003, + "learning_rate": 9.667300899623976e-05, + "loss": 2.0837, + "step": 2360 + }, + { + "epoch": 0.13159801571818738, + "grad_norm": 0.5214108824729919, + "learning_rate": 9.66698144753699e-05, + "loss": 1.681, + "step": 2361 + }, + { + "epoch": 0.13165375397135054, + "grad_norm": 0.5067755579948425, + "learning_rate": 9.666661847440563e-05, + "loss": 1.7168, + "step": 2362 + }, + { + "epoch": 0.1317094922245137, + "grad_norm": 0.5883169770240784, + "learning_rate": 9.666342099344829e-05, + "loss": 1.8355, + "step": 2363 + }, + { + "epoch": 0.13176523047767683, + "grad_norm": 0.5047624111175537, + "learning_rate": 9.666022203259931e-05, + "loss": 1.6872, + "step": 2364 + }, + { + "epoch": 0.13182096873083998, + "grad_norm": 0.5165308117866516, + "learning_rate": 9.665702159196013e-05, + "loss": 1.6867, + "step": 2365 + }, + { + "epoch": 0.1318767069840031, + "grad_norm": 0.5131801962852478, + "learning_rate": 9.665381967163227e-05, + "loss": 1.5836, + "step": 2366 + }, + { + "epoch": 0.13193244523716627, + "grad_norm": 0.5561967492103577, + "learning_rate": 9.665061627171726e-05, + "loss": 1.6933, + "step": 2367 + }, + { + "epoch": 0.13198818349032942, + "grad_norm": 0.6118646860122681, + "learning_rate": 9.664741139231668e-05, + "loss": 2.0988, + "step": 2368 + }, + { + "epoch": 0.13204392174349255, + "grad_norm": 0.5255211591720581, + "learning_rate": 9.664420503353218e-05, + "loss": 1.7087, + "step": 2369 + }, + { + "epoch": 0.1320996599966557, + "grad_norm": 0.555664598941803, + "learning_rate": 9.664099719546547e-05, + "loss": 1.8029, + "step": 2370 + }, + { + "epoch": 0.13215539824981884, + "grad_norm": 0.5417226552963257, + "learning_rate": 9.663778787821825e-05, + "loss": 1.7483, + "step": 2371 + }, + { + "epoch": 0.132211136502982, + "grad_norm": 0.5773631930351257, + "learning_rate": 9.663457708189232e-05, + "loss": 1.7137, + "step": 2372 + }, + { + "epoch": 0.13226687475614515, + "grad_norm": 0.5354270935058594, + "learning_rate": 9.66313648065895e-05, + "loss": 1.8748, + "step": 2373 + }, + { + "epoch": 0.13232261300930828, + "grad_norm": 0.5149551033973694, + "learning_rate": 9.662815105241168e-05, + "loss": 1.5948, + "step": 2374 + }, + { + "epoch": 0.13237835126247144, + "grad_norm": 0.5566468238830566, + "learning_rate": 9.662493581946074e-05, + "loss": 1.7724, + "step": 2375 + }, + { + "epoch": 0.13243408951563457, + "grad_norm": 0.5304192304611206, + "learning_rate": 9.66217191078387e-05, + "loss": 1.8068, + "step": 2376 + }, + { + "epoch": 0.13248982776879772, + "grad_norm": 0.5885264873504639, + "learning_rate": 9.661850091764756e-05, + "loss": 1.9129, + "step": 2377 + }, + { + "epoch": 0.13254556602196088, + "grad_norm": 0.4796747863292694, + "learning_rate": 9.661528124898937e-05, + "loss": 1.6931, + "step": 2378 + }, + { + "epoch": 0.132601304275124, + "grad_norm": 0.49771320819854736, + "learning_rate": 9.661206010196624e-05, + "loss": 1.5938, + "step": 2379 + }, + { + "epoch": 0.13265704252828717, + "grad_norm": 0.530432939529419, + "learning_rate": 9.660883747668034e-05, + "loss": 2.0283, + "step": 2380 + }, + { + "epoch": 0.1327127807814503, + "grad_norm": 0.515631914138794, + "learning_rate": 9.660561337323385e-05, + "loss": 1.8549, + "step": 2381 + }, + { + "epoch": 0.13276851903461345, + "grad_norm": 0.6954619288444519, + "learning_rate": 9.660238779172905e-05, + "loss": 2.0152, + "step": 2382 + }, + { + "epoch": 0.1328242572877766, + "grad_norm": 0.5233824253082275, + "learning_rate": 9.65991607322682e-05, + "loss": 1.7353, + "step": 2383 + }, + { + "epoch": 0.13287999554093974, + "grad_norm": 0.5527575016021729, + "learning_rate": 9.659593219495368e-05, + "loss": 1.6361, + "step": 2384 + }, + { + "epoch": 0.1329357337941029, + "grad_norm": 0.48741617798805237, + "learning_rate": 9.659270217988786e-05, + "loss": 1.682, + "step": 2385 + }, + { + "epoch": 0.13299147204726605, + "grad_norm": 0.5804024338722229, + "learning_rate": 9.658947068717316e-05, + "loss": 1.5736, + "step": 2386 + }, + { + "epoch": 0.13304721030042918, + "grad_norm": 0.5614018440246582, + "learning_rate": 9.658623771691211e-05, + "loss": 1.9172, + "step": 2387 + }, + { + "epoch": 0.13310294855359234, + "grad_norm": 0.5239617824554443, + "learning_rate": 9.658300326920722e-05, + "loss": 1.7751, + "step": 2388 + }, + { + "epoch": 0.13315868680675547, + "grad_norm": 0.5195541381835938, + "learning_rate": 9.657976734416106e-05, + "loss": 1.875, + "step": 2389 + }, + { + "epoch": 0.13321442505991862, + "grad_norm": 0.531480610370636, + "learning_rate": 9.657652994187625e-05, + "loss": 1.7631, + "step": 2390 + }, + { + "epoch": 0.13327016331308178, + "grad_norm": 0.5037621259689331, + "learning_rate": 9.657329106245547e-05, + "loss": 1.6134, + "step": 2391 + }, + { + "epoch": 0.1333259015662449, + "grad_norm": 0.4974221885204315, + "learning_rate": 9.657005070600144e-05, + "loss": 1.7501, + "step": 2392 + }, + { + "epoch": 0.13338163981940807, + "grad_norm": 0.5308098196983337, + "learning_rate": 9.656680887261693e-05, + "loss": 1.7283, + "step": 2393 + }, + { + "epoch": 0.1334373780725712, + "grad_norm": 0.4996281862258911, + "learning_rate": 9.656356556240473e-05, + "loss": 1.7897, + "step": 2394 + }, + { + "epoch": 0.13349311632573435, + "grad_norm": 0.6450517773628235, + "learning_rate": 9.656032077546772e-05, + "loss": 1.7089, + "step": 2395 + }, + { + "epoch": 0.1335488545788975, + "grad_norm": 0.5968025326728821, + "learning_rate": 9.655707451190883e-05, + "loss": 1.8664, + "step": 2396 + }, + { + "epoch": 0.13360459283206064, + "grad_norm": 0.470813512802124, + "learning_rate": 9.655382677183095e-05, + "loss": 1.5199, + "step": 2397 + }, + { + "epoch": 0.1336603310852238, + "grad_norm": 0.5651730298995972, + "learning_rate": 9.655057755533712e-05, + "loss": 1.9733, + "step": 2398 + }, + { + "epoch": 0.13371606933838692, + "grad_norm": 0.5370044112205505, + "learning_rate": 9.654732686253039e-05, + "loss": 1.8281, + "step": 2399 + }, + { + "epoch": 0.13377180759155008, + "grad_norm": 0.5285357236862183, + "learning_rate": 9.654407469351383e-05, + "loss": 1.592, + "step": 2400 + }, + { + "epoch": 0.13382754584471324, + "grad_norm": 0.5265277624130249, + "learning_rate": 9.654082104839059e-05, + "loss": 1.8503, + "step": 2401 + }, + { + "epoch": 0.13388328409787636, + "grad_norm": 0.5449655652046204, + "learning_rate": 9.653756592726386e-05, + "loss": 1.8579, + "step": 2402 + }, + { + "epoch": 0.13393902235103952, + "grad_norm": 0.5737154483795166, + "learning_rate": 9.653430933023689e-05, + "loss": 1.8618, + "step": 2403 + }, + { + "epoch": 0.13399476060420265, + "grad_norm": 0.5164530873298645, + "learning_rate": 9.653105125741292e-05, + "loss": 1.6213, + "step": 2404 + }, + { + "epoch": 0.1340504988573658, + "grad_norm": 0.5017974376678467, + "learning_rate": 9.65277917088953e-05, + "loss": 1.6255, + "step": 2405 + }, + { + "epoch": 0.13410623711052896, + "grad_norm": 0.5122340321540833, + "learning_rate": 9.652453068478741e-05, + "loss": 1.5653, + "step": 2406 + }, + { + "epoch": 0.1341619753636921, + "grad_norm": 0.6067832708358765, + "learning_rate": 9.652126818519266e-05, + "loss": 2.0985, + "step": 2407 + }, + { + "epoch": 0.13421771361685525, + "grad_norm": 0.5796366333961487, + "learning_rate": 9.651800421021453e-05, + "loss": 1.9636, + "step": 2408 + }, + { + "epoch": 0.1342734518700184, + "grad_norm": 0.5619643926620483, + "learning_rate": 9.651473875995651e-05, + "loss": 1.7129, + "step": 2409 + }, + { + "epoch": 0.13432919012318154, + "grad_norm": 0.5060097575187683, + "learning_rate": 9.651147183452219e-05, + "loss": 1.5304, + "step": 2410 + }, + { + "epoch": 0.1343849283763447, + "grad_norm": 0.532145619392395, + "learning_rate": 9.650820343401515e-05, + "loss": 1.7844, + "step": 2411 + }, + { + "epoch": 0.13444066662950782, + "grad_norm": 0.5342923402786255, + "learning_rate": 9.650493355853906e-05, + "loss": 1.8585, + "step": 2412 + }, + { + "epoch": 0.13449640488267098, + "grad_norm": 0.49805736541748047, + "learning_rate": 9.650166220819764e-05, + "loss": 1.4576, + "step": 2413 + }, + { + "epoch": 0.13455214313583413, + "grad_norm": 0.5234712362289429, + "learning_rate": 9.64983893830946e-05, + "loss": 1.6994, + "step": 2414 + }, + { + "epoch": 0.13460788138899726, + "grad_norm": 0.5124284029006958, + "learning_rate": 9.649511508333375e-05, + "loss": 1.6614, + "step": 2415 + }, + { + "epoch": 0.13466361964216042, + "grad_norm": 0.4958679676055908, + "learning_rate": 9.649183930901895e-05, + "loss": 1.56, + "step": 2416 + }, + { + "epoch": 0.13471935789532355, + "grad_norm": 0.5191091895103455, + "learning_rate": 9.648856206025407e-05, + "loss": 1.7004, + "step": 2417 + }, + { + "epoch": 0.1347750961484867, + "grad_norm": 0.5366125702857971, + "learning_rate": 9.648528333714304e-05, + "loss": 1.7206, + "step": 2418 + }, + { + "epoch": 0.13483083440164986, + "grad_norm": 0.5979599952697754, + "learning_rate": 9.648200313978986e-05, + "loss": 1.757, + "step": 2419 + }, + { + "epoch": 0.134886572654813, + "grad_norm": 0.5878745317459106, + "learning_rate": 9.647872146829855e-05, + "loss": 1.7236, + "step": 2420 + }, + { + "epoch": 0.13494231090797615, + "grad_norm": 0.5160901546478271, + "learning_rate": 9.647543832277317e-05, + "loss": 1.7274, + "step": 2421 + }, + { + "epoch": 0.13499804916113928, + "grad_norm": 0.5626492500305176, + "learning_rate": 9.647215370331786e-05, + "loss": 1.9507, + "step": 2422 + }, + { + "epoch": 0.13505378741430243, + "grad_norm": 0.5624846816062927, + "learning_rate": 9.646886761003679e-05, + "loss": 1.9476, + "step": 2423 + }, + { + "epoch": 0.1351095256674656, + "grad_norm": 0.5468912720680237, + "learning_rate": 9.646558004303419e-05, + "loss": 1.7836, + "step": 2424 + }, + { + "epoch": 0.13516526392062872, + "grad_norm": 0.5446691513061523, + "learning_rate": 9.646229100241429e-05, + "loss": 1.7664, + "step": 2425 + }, + { + "epoch": 0.13522100217379188, + "grad_norm": 0.5568925738334656, + "learning_rate": 9.64590004882814e-05, + "loss": 2.0063, + "step": 2426 + }, + { + "epoch": 0.135276740426955, + "grad_norm": 0.560264527797699, + "learning_rate": 9.64557085007399e-05, + "loss": 1.8132, + "step": 2427 + }, + { + "epoch": 0.13533247868011816, + "grad_norm": 0.5093153715133667, + "learning_rate": 9.64524150398942e-05, + "loss": 1.4198, + "step": 2428 + }, + { + "epoch": 0.13538821693328132, + "grad_norm": 0.5184745192527771, + "learning_rate": 9.64491201058487e-05, + "loss": 1.6062, + "step": 2429 + }, + { + "epoch": 0.13544395518644445, + "grad_norm": 0.5188031792640686, + "learning_rate": 9.644582369870794e-05, + "loss": 1.8179, + "step": 2430 + }, + { + "epoch": 0.1354996934396076, + "grad_norm": 0.537381112575531, + "learning_rate": 9.644252581857647e-05, + "loss": 1.9697, + "step": 2431 + }, + { + "epoch": 0.13555543169277076, + "grad_norm": 0.5132935047149658, + "learning_rate": 9.643922646555883e-05, + "loss": 1.6746, + "step": 2432 + }, + { + "epoch": 0.1356111699459339, + "grad_norm": 0.5265336036682129, + "learning_rate": 9.64359256397597e-05, + "loss": 1.6561, + "step": 2433 + }, + { + "epoch": 0.13566690819909705, + "grad_norm": 0.5241510272026062, + "learning_rate": 9.643262334128374e-05, + "loss": 1.577, + "step": 2434 + }, + { + "epoch": 0.13572264645226018, + "grad_norm": 0.5073732137680054, + "learning_rate": 9.642931957023569e-05, + "loss": 1.6821, + "step": 2435 + }, + { + "epoch": 0.13577838470542333, + "grad_norm": 0.4868320822715759, + "learning_rate": 9.642601432672034e-05, + "loss": 1.4476, + "step": 2436 + }, + { + "epoch": 0.1358341229585865, + "grad_norm": 0.5248389840126038, + "learning_rate": 9.642270761084249e-05, + "loss": 1.9406, + "step": 2437 + }, + { + "epoch": 0.13588986121174962, + "grad_norm": 0.492227166891098, + "learning_rate": 9.641939942270701e-05, + "loss": 1.6538, + "step": 2438 + }, + { + "epoch": 0.13594559946491278, + "grad_norm": 0.5446291565895081, + "learning_rate": 9.641608976241883e-05, + "loss": 1.8208, + "step": 2439 + }, + { + "epoch": 0.1360013377180759, + "grad_norm": 0.5214070677757263, + "learning_rate": 9.64127786300829e-05, + "loss": 1.6889, + "step": 2440 + }, + { + "epoch": 0.13605707597123906, + "grad_norm": 0.5892273187637329, + "learning_rate": 9.640946602580426e-05, + "loss": 2.0888, + "step": 2441 + }, + { + "epoch": 0.13611281422440222, + "grad_norm": 0.5230244994163513, + "learning_rate": 9.640615194968791e-05, + "loss": 1.7068, + "step": 2442 + }, + { + "epoch": 0.13616855247756535, + "grad_norm": 0.5090706944465637, + "learning_rate": 9.640283640183903e-05, + "loss": 1.7328, + "step": 2443 + }, + { + "epoch": 0.1362242907307285, + "grad_norm": 0.5167303681373596, + "learning_rate": 9.639951938236269e-05, + "loss": 1.7062, + "step": 2444 + }, + { + "epoch": 0.13628002898389163, + "grad_norm": 0.5717843770980835, + "learning_rate": 9.639620089136413e-05, + "loss": 1.8633, + "step": 2445 + }, + { + "epoch": 0.1363357672370548, + "grad_norm": 0.514242947101593, + "learning_rate": 9.63928809289486e-05, + "loss": 1.9126, + "step": 2446 + }, + { + "epoch": 0.13639150549021795, + "grad_norm": 0.5159420371055603, + "learning_rate": 9.638955949522137e-05, + "loss": 1.6795, + "step": 2447 + }, + { + "epoch": 0.13644724374338107, + "grad_norm": 0.4026312828063965, + "learning_rate": 9.638623659028779e-05, + "loss": 1.008, + "step": 2448 + }, + { + "epoch": 0.13650298199654423, + "grad_norm": 0.5365085601806641, + "learning_rate": 9.63829122142532e-05, + "loss": 1.9597, + "step": 2449 + }, + { + "epoch": 0.13655872024970736, + "grad_norm": 0.528103768825531, + "learning_rate": 9.637958636722311e-05, + "loss": 1.8801, + "step": 2450 + }, + { + "epoch": 0.13661445850287052, + "grad_norm": 0.5581492185592651, + "learning_rate": 9.637625904930292e-05, + "loss": 1.6802, + "step": 2451 + }, + { + "epoch": 0.13667019675603367, + "grad_norm": 0.5182628631591797, + "learning_rate": 9.63729302605982e-05, + "loss": 1.8041, + "step": 2452 + }, + { + "epoch": 0.1367259350091968, + "grad_norm": 0.48804765939712524, + "learning_rate": 9.636960000121451e-05, + "loss": 1.7381, + "step": 2453 + }, + { + "epoch": 0.13678167326235996, + "grad_norm": 0.5185055136680603, + "learning_rate": 9.636626827125745e-05, + "loss": 1.8356, + "step": 2454 + }, + { + "epoch": 0.13683741151552312, + "grad_norm": 0.5890060663223267, + "learning_rate": 9.63629350708327e-05, + "loss": 1.8636, + "step": 2455 + }, + { + "epoch": 0.13689314976868625, + "grad_norm": 0.5501379370689392, + "learning_rate": 9.635960040004597e-05, + "loss": 2.0967, + "step": 2456 + }, + { + "epoch": 0.1369488880218494, + "grad_norm": 0.5753256678581238, + "learning_rate": 9.635626425900301e-05, + "loss": 1.8931, + "step": 2457 + }, + { + "epoch": 0.13700462627501253, + "grad_norm": 0.5230208039283752, + "learning_rate": 9.635292664780962e-05, + "loss": 1.6546, + "step": 2458 + }, + { + "epoch": 0.1370603645281757, + "grad_norm": 0.507422149181366, + "learning_rate": 9.634958756657165e-05, + "loss": 1.7135, + "step": 2459 + }, + { + "epoch": 0.13711610278133884, + "grad_norm": 0.48532143235206604, + "learning_rate": 9.634624701539498e-05, + "loss": 1.5297, + "step": 2460 + }, + { + "epoch": 0.13717184103450197, + "grad_norm": 0.5039069652557373, + "learning_rate": 9.63429049943856e-05, + "loss": 1.9089, + "step": 2461 + }, + { + "epoch": 0.13722757928766513, + "grad_norm": 0.5480893850326538, + "learning_rate": 9.633956150364947e-05, + "loss": 1.7987, + "step": 2462 + }, + { + "epoch": 0.13728331754082826, + "grad_norm": 0.5339971780776978, + "learning_rate": 9.633621654329261e-05, + "loss": 1.7035, + "step": 2463 + }, + { + "epoch": 0.13733905579399142, + "grad_norm": 0.5058174133300781, + "learning_rate": 9.633287011342113e-05, + "loss": 1.6676, + "step": 2464 + }, + { + "epoch": 0.13739479404715457, + "grad_norm": 0.5697671175003052, + "learning_rate": 9.632952221414116e-05, + "loss": 1.9683, + "step": 2465 + }, + { + "epoch": 0.1374505323003177, + "grad_norm": 0.5071194767951965, + "learning_rate": 9.632617284555886e-05, + "loss": 1.9232, + "step": 2466 + }, + { + "epoch": 0.13750627055348086, + "grad_norm": 0.5929427742958069, + "learning_rate": 9.632282200778045e-05, + "loss": 1.8352, + "step": 2467 + }, + { + "epoch": 0.137562008806644, + "grad_norm": 0.528889000415802, + "learning_rate": 9.631946970091221e-05, + "loss": 1.7636, + "step": 2468 + }, + { + "epoch": 0.13761774705980714, + "grad_norm": 1.3195804357528687, + "learning_rate": 9.631611592506046e-05, + "loss": 1.7929, + "step": 2469 + }, + { + "epoch": 0.1376734853129703, + "grad_norm": 0.5272727608680725, + "learning_rate": 9.631276068033154e-05, + "loss": 1.89, + "step": 2470 + }, + { + "epoch": 0.13772922356613343, + "grad_norm": 0.5453211665153503, + "learning_rate": 9.630940396683188e-05, + "loss": 1.6766, + "step": 2471 + }, + { + "epoch": 0.1377849618192966, + "grad_norm": 0.5383656620979309, + "learning_rate": 9.630604578466794e-05, + "loss": 1.6168, + "step": 2472 + }, + { + "epoch": 0.13784070007245972, + "grad_norm": 0.5008901953697205, + "learning_rate": 9.63026861339462e-05, + "loss": 1.5592, + "step": 2473 + }, + { + "epoch": 0.13789643832562287, + "grad_norm": 0.5986757874488831, + "learning_rate": 9.629932501477321e-05, + "loss": 2.0793, + "step": 2474 + }, + { + "epoch": 0.13795217657878603, + "grad_norm": 0.5368151664733887, + "learning_rate": 9.629596242725558e-05, + "loss": 1.6693, + "step": 2475 + }, + { + "epoch": 0.13800791483194916, + "grad_norm": 0.5330533385276794, + "learning_rate": 9.629259837149995e-05, + "loss": 1.7398, + "step": 2476 + }, + { + "epoch": 0.13806365308511231, + "grad_norm": 0.5093852877616882, + "learning_rate": 9.6289232847613e-05, + "loss": 1.6665, + "step": 2477 + }, + { + "epoch": 0.13811939133827547, + "grad_norm": 0.5469667911529541, + "learning_rate": 9.628586585570149e-05, + "loss": 1.8411, + "step": 2478 + }, + { + "epoch": 0.1381751295914386, + "grad_norm": 0.5832191705703735, + "learning_rate": 9.628249739587217e-05, + "loss": 1.8821, + "step": 2479 + }, + { + "epoch": 0.13823086784460176, + "grad_norm": 0.5154137015342712, + "learning_rate": 9.627912746823187e-05, + "loss": 1.6075, + "step": 2480 + }, + { + "epoch": 0.13828660609776489, + "grad_norm": 0.5499826669692993, + "learning_rate": 9.627575607288745e-05, + "loss": 1.735, + "step": 2481 + }, + { + "epoch": 0.13834234435092804, + "grad_norm": 0.6152673959732056, + "learning_rate": 9.627238320994589e-05, + "loss": 2.0207, + "step": 2482 + }, + { + "epoch": 0.1383980826040912, + "grad_norm": 0.49340128898620605, + "learning_rate": 9.626900887951412e-05, + "loss": 1.64, + "step": 2483 + }, + { + "epoch": 0.13845382085725433, + "grad_norm": 0.5563956499099731, + "learning_rate": 9.626563308169914e-05, + "loss": 1.9062, + "step": 2484 + }, + { + "epoch": 0.13850955911041749, + "grad_norm": 0.4945386052131653, + "learning_rate": 9.626225581660803e-05, + "loss": 1.4852, + "step": 2485 + }, + { + "epoch": 0.13856529736358061, + "grad_norm": 0.5170808434486389, + "learning_rate": 9.625887708434788e-05, + "loss": 1.7517, + "step": 2486 + }, + { + "epoch": 0.13862103561674377, + "grad_norm": 0.5459514260292053, + "learning_rate": 9.625549688502589e-05, + "loss": 1.6785, + "step": 2487 + }, + { + "epoch": 0.13867677386990693, + "grad_norm": 0.5073458552360535, + "learning_rate": 9.62521152187492e-05, + "loss": 1.7213, + "step": 2488 + }, + { + "epoch": 0.13873251212307006, + "grad_norm": 0.4946017563343048, + "learning_rate": 9.624873208562509e-05, + "loss": 1.6256, + "step": 2489 + }, + { + "epoch": 0.1387882503762332, + "grad_norm": 0.5971960425376892, + "learning_rate": 9.624534748576085e-05, + "loss": 1.9997, + "step": 2490 + }, + { + "epoch": 0.13884398862939634, + "grad_norm": 0.5135798454284668, + "learning_rate": 9.624196141926381e-05, + "loss": 1.6544, + "step": 2491 + }, + { + "epoch": 0.1388997268825595, + "grad_norm": 0.5550069212913513, + "learning_rate": 9.623857388624138e-05, + "loss": 1.8297, + "step": 2492 + }, + { + "epoch": 0.13895546513572266, + "grad_norm": 0.5476080179214478, + "learning_rate": 9.623518488680095e-05, + "loss": 1.9136, + "step": 2493 + }, + { + "epoch": 0.13901120338888578, + "grad_norm": 0.5327604413032532, + "learning_rate": 9.623179442105004e-05, + "loss": 1.7471, + "step": 2494 + }, + { + "epoch": 0.13906694164204894, + "grad_norm": 0.5192773938179016, + "learning_rate": 9.622840248909617e-05, + "loss": 1.6395, + "step": 2495 + }, + { + "epoch": 0.13912267989521207, + "grad_norm": 0.5261735916137695, + "learning_rate": 9.622500909104689e-05, + "loss": 1.6751, + "step": 2496 + }, + { + "epoch": 0.13917841814837523, + "grad_norm": 0.5256398916244507, + "learning_rate": 9.622161422700984e-05, + "loss": 1.7681, + "step": 2497 + }, + { + "epoch": 0.13923415640153838, + "grad_norm": 0.5021438002586365, + "learning_rate": 9.621821789709267e-05, + "loss": 1.6317, + "step": 2498 + }, + { + "epoch": 0.1392898946547015, + "grad_norm": 0.5900087952613831, + "learning_rate": 9.62148201014031e-05, + "loss": 1.8691, + "step": 2499 + }, + { + "epoch": 0.13934563290786467, + "grad_norm": 0.492544025182724, + "learning_rate": 9.621142084004889e-05, + "loss": 1.6061, + "step": 2500 + }, + { + "epoch": 0.13940137116102783, + "grad_norm": 0.5590608716011047, + "learning_rate": 9.620802011313785e-05, + "loss": 1.9551, + "step": 2501 + }, + { + "epoch": 0.13945710941419096, + "grad_norm": 0.5163889527320862, + "learning_rate": 9.620461792077782e-05, + "loss": 1.8419, + "step": 2502 + }, + { + "epoch": 0.1395128476673541, + "grad_norm": 0.5565062165260315, + "learning_rate": 9.620121426307669e-05, + "loss": 1.9454, + "step": 2503 + }, + { + "epoch": 0.13956858592051724, + "grad_norm": 0.5010280013084412, + "learning_rate": 9.619780914014242e-05, + "loss": 1.6189, + "step": 2504 + }, + { + "epoch": 0.1396243241736804, + "grad_norm": 0.5342069268226624, + "learning_rate": 9.619440255208301e-05, + "loss": 1.7667, + "step": 2505 + }, + { + "epoch": 0.13968006242684355, + "grad_norm": 0.5092571377754211, + "learning_rate": 9.619099449900646e-05, + "loss": 1.6797, + "step": 2506 + }, + { + "epoch": 0.13973580068000668, + "grad_norm": 0.5784452557563782, + "learning_rate": 9.618758498102089e-05, + "loss": 1.9559, + "step": 2507 + }, + { + "epoch": 0.13979153893316984, + "grad_norm": 0.5389965176582336, + "learning_rate": 9.618417399823441e-05, + "loss": 1.7971, + "step": 2508 + }, + { + "epoch": 0.13984727718633297, + "grad_norm": 0.5197558999061584, + "learning_rate": 9.618076155075521e-05, + "loss": 1.8631, + "step": 2509 + }, + { + "epoch": 0.13990301543949613, + "grad_norm": 0.5198122262954712, + "learning_rate": 9.617734763869151e-05, + "loss": 1.7487, + "step": 2510 + }, + { + "epoch": 0.13995875369265928, + "grad_norm": 0.515998363494873, + "learning_rate": 9.617393226215157e-05, + "loss": 1.6849, + "step": 2511 + }, + { + "epoch": 0.1400144919458224, + "grad_norm": 0.5627748370170593, + "learning_rate": 9.617051542124371e-05, + "loss": 1.7637, + "step": 2512 + }, + { + "epoch": 0.14007023019898557, + "grad_norm": 0.49436190724372864, + "learning_rate": 9.61670971160763e-05, + "loss": 1.6303, + "step": 2513 + }, + { + "epoch": 0.1401259684521487, + "grad_norm": 0.5101426839828491, + "learning_rate": 9.616367734675772e-05, + "loss": 1.5709, + "step": 2514 + }, + { + "epoch": 0.14018170670531185, + "grad_norm": 0.5416966080665588, + "learning_rate": 9.616025611339647e-05, + "loss": 1.8456, + "step": 2515 + }, + { + "epoch": 0.140237444958475, + "grad_norm": 0.5797568559646606, + "learning_rate": 9.615683341610103e-05, + "loss": 1.7499, + "step": 2516 + }, + { + "epoch": 0.14029318321163814, + "grad_norm": 0.5696927905082703, + "learning_rate": 9.615340925497995e-05, + "loss": 1.6875, + "step": 2517 + }, + { + "epoch": 0.1403489214648013, + "grad_norm": 0.49985361099243164, + "learning_rate": 9.61499836301418e-05, + "loss": 1.6336, + "step": 2518 + }, + { + "epoch": 0.14040465971796443, + "grad_norm": 0.5426433086395264, + "learning_rate": 9.614655654169527e-05, + "loss": 1.8164, + "step": 2519 + }, + { + "epoch": 0.14046039797112758, + "grad_norm": 0.562021017074585, + "learning_rate": 9.6143127989749e-05, + "loss": 1.626, + "step": 2520 + }, + { + "epoch": 0.14051613622429074, + "grad_norm": 0.5873587727546692, + "learning_rate": 9.613969797441173e-05, + "loss": 2.0087, + "step": 2521 + }, + { + "epoch": 0.14057187447745387, + "grad_norm": 0.5239251852035522, + "learning_rate": 9.613626649579229e-05, + "loss": 1.74, + "step": 2522 + }, + { + "epoch": 0.14062761273061702, + "grad_norm": 0.613498330116272, + "learning_rate": 9.613283355399945e-05, + "loss": 1.7088, + "step": 2523 + }, + { + "epoch": 0.14068335098378018, + "grad_norm": 0.5224273800849915, + "learning_rate": 9.61293991491421e-05, + "loss": 1.5665, + "step": 2524 + }, + { + "epoch": 0.1407390892369433, + "grad_norm": 0.5063479542732239, + "learning_rate": 9.612596328132915e-05, + "loss": 1.3456, + "step": 2525 + }, + { + "epoch": 0.14079482749010647, + "grad_norm": 0.5042296648025513, + "learning_rate": 9.61225259506696e-05, + "loss": 1.6111, + "step": 2526 + }, + { + "epoch": 0.1408505657432696, + "grad_norm": 0.5116347670555115, + "learning_rate": 9.611908715727244e-05, + "loss": 1.9546, + "step": 2527 + }, + { + "epoch": 0.14090630399643275, + "grad_norm": 0.5643008351325989, + "learning_rate": 9.611564690124672e-05, + "loss": 1.8488, + "step": 2528 + }, + { + "epoch": 0.1409620422495959, + "grad_norm": 0.5275754332542419, + "learning_rate": 9.611220518270155e-05, + "loss": 1.7367, + "step": 2529 + }, + { + "epoch": 0.14101778050275904, + "grad_norm": 0.523114800453186, + "learning_rate": 9.61087620017461e-05, + "loss": 1.5207, + "step": 2530 + }, + { + "epoch": 0.1410735187559222, + "grad_norm": 0.5141943693161011, + "learning_rate": 9.610531735848953e-05, + "loss": 1.6592, + "step": 2531 + }, + { + "epoch": 0.14112925700908532, + "grad_norm": 0.5485236048698425, + "learning_rate": 9.610187125304111e-05, + "loss": 1.7567, + "step": 2532 + }, + { + "epoch": 0.14118499526224848, + "grad_norm": 0.537264347076416, + "learning_rate": 9.609842368551014e-05, + "loss": 1.7151, + "step": 2533 + }, + { + "epoch": 0.14124073351541164, + "grad_norm": 0.588664174079895, + "learning_rate": 9.609497465600595e-05, + "loss": 1.9591, + "step": 2534 + }, + { + "epoch": 0.14129647176857477, + "grad_norm": 0.5192539691925049, + "learning_rate": 9.60915241646379e-05, + "loss": 1.7296, + "step": 2535 + }, + { + "epoch": 0.14135221002173792, + "grad_norm": 0.543268620967865, + "learning_rate": 9.608807221151543e-05, + "loss": 1.7645, + "step": 2536 + }, + { + "epoch": 0.14140794827490105, + "grad_norm": 0.534324049949646, + "learning_rate": 9.608461879674802e-05, + "loss": 1.8227, + "step": 2537 + }, + { + "epoch": 0.1414636865280642, + "grad_norm": 0.5177492499351501, + "learning_rate": 9.608116392044521e-05, + "loss": 1.6495, + "step": 2538 + }, + { + "epoch": 0.14151942478122737, + "grad_norm": 0.5617666840553284, + "learning_rate": 9.607770758271655e-05, + "loss": 1.9329, + "step": 2539 + }, + { + "epoch": 0.1415751630343905, + "grad_norm": 0.5591059327125549, + "learning_rate": 9.607424978367165e-05, + "loss": 1.8535, + "step": 2540 + }, + { + "epoch": 0.14163090128755365, + "grad_norm": 0.5114865899085999, + "learning_rate": 9.607079052342018e-05, + "loss": 1.6956, + "step": 2541 + }, + { + "epoch": 0.1416866395407168, + "grad_norm": 0.5444316864013672, + "learning_rate": 9.606732980207184e-05, + "loss": 1.6842, + "step": 2542 + }, + { + "epoch": 0.14174237779387994, + "grad_norm": 0.5291377305984497, + "learning_rate": 9.606386761973641e-05, + "loss": 1.778, + "step": 2543 + }, + { + "epoch": 0.1417981160470431, + "grad_norm": 0.5469574332237244, + "learning_rate": 9.606040397652365e-05, + "loss": 1.8492, + "step": 2544 + }, + { + "epoch": 0.14185385430020622, + "grad_norm": 0.5374149084091187, + "learning_rate": 9.605693887254343e-05, + "loss": 1.8428, + "step": 2545 + }, + { + "epoch": 0.14190959255336938, + "grad_norm": 0.5556001663208008, + "learning_rate": 9.605347230790565e-05, + "loss": 1.786, + "step": 2546 + }, + { + "epoch": 0.14196533080653254, + "grad_norm": 0.5268534421920776, + "learning_rate": 9.605000428272023e-05, + "loss": 1.5936, + "step": 2547 + }, + { + "epoch": 0.14202106905969566, + "grad_norm": 0.5348252058029175, + "learning_rate": 9.604653479709717e-05, + "loss": 1.8033, + "step": 2548 + }, + { + "epoch": 0.14207680731285882, + "grad_norm": 0.47919270396232605, + "learning_rate": 9.60430638511465e-05, + "loss": 1.5892, + "step": 2549 + }, + { + "epoch": 0.14213254556602195, + "grad_norm": 0.5066027045249939, + "learning_rate": 9.603959144497827e-05, + "loss": 1.6489, + "step": 2550 + }, + { + "epoch": 0.1421882838191851, + "grad_norm": 0.512729823589325, + "learning_rate": 9.603611757870266e-05, + "loss": 1.4806, + "step": 2551 + }, + { + "epoch": 0.14224402207234826, + "grad_norm": 0.5020458102226257, + "learning_rate": 9.603264225242978e-05, + "loss": 1.7944, + "step": 2552 + }, + { + "epoch": 0.1422997603255114, + "grad_norm": 0.5788121819496155, + "learning_rate": 9.60291654662699e-05, + "loss": 1.828, + "step": 2553 + }, + { + "epoch": 0.14235549857867455, + "grad_norm": 0.5426775217056274, + "learning_rate": 9.602568722033326e-05, + "loss": 1.8621, + "step": 2554 + }, + { + "epoch": 0.14241123683183768, + "grad_norm": 0.5158776044845581, + "learning_rate": 9.602220751473015e-05, + "loss": 1.8829, + "step": 2555 + }, + { + "epoch": 0.14246697508500084, + "grad_norm": 0.48226305842399597, + "learning_rate": 9.601872634957096e-05, + "loss": 1.6547, + "step": 2556 + }, + { + "epoch": 0.142522713338164, + "grad_norm": 0.5081673860549927, + "learning_rate": 9.601524372496608e-05, + "loss": 1.6629, + "step": 2557 + }, + { + "epoch": 0.14257845159132712, + "grad_norm": 0.5080944299697876, + "learning_rate": 9.601175964102596e-05, + "loss": 1.8285, + "step": 2558 + }, + { + "epoch": 0.14263418984449028, + "grad_norm": 0.5221143364906311, + "learning_rate": 9.600827409786107e-05, + "loss": 1.9544, + "step": 2559 + }, + { + "epoch": 0.1426899280976534, + "grad_norm": 0.5045720338821411, + "learning_rate": 9.600478709558199e-05, + "loss": 1.5243, + "step": 2560 + }, + { + "epoch": 0.14274566635081656, + "grad_norm": 0.5300230383872986, + "learning_rate": 9.600129863429929e-05, + "loss": 1.6888, + "step": 2561 + }, + { + "epoch": 0.14280140460397972, + "grad_norm": 0.5262769460678101, + "learning_rate": 9.599780871412359e-05, + "loss": 1.8205, + "step": 2562 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 0.5437910556793213, + "learning_rate": 9.59943173351656e-05, + "loss": 1.69, + "step": 2563 + }, + { + "epoch": 0.142912881110306, + "grad_norm": 0.5781261324882507, + "learning_rate": 9.599082449753602e-05, + "loss": 1.918, + "step": 2564 + }, + { + "epoch": 0.14296861936346916, + "grad_norm": 0.5519402623176575, + "learning_rate": 9.598733020134562e-05, + "loss": 1.7039, + "step": 2565 + }, + { + "epoch": 0.1430243576166323, + "grad_norm": 0.5874602198600769, + "learning_rate": 9.598383444670526e-05, + "loss": 1.6948, + "step": 2566 + }, + { + "epoch": 0.14308009586979545, + "grad_norm": 0.5131939649581909, + "learning_rate": 9.598033723372575e-05, + "loss": 1.6666, + "step": 2567 + }, + { + "epoch": 0.14313583412295858, + "grad_norm": 0.6441419124603271, + "learning_rate": 9.597683856251804e-05, + "loss": 1.9023, + "step": 2568 + }, + { + "epoch": 0.14319157237612173, + "grad_norm": 0.48139771819114685, + "learning_rate": 9.597333843319309e-05, + "loss": 1.6297, + "step": 2569 + }, + { + "epoch": 0.1432473106292849, + "grad_norm": 0.4975999891757965, + "learning_rate": 9.596983684586186e-05, + "loss": 1.6558, + "step": 2570 + }, + { + "epoch": 0.14330304888244802, + "grad_norm": 0.5479779839515686, + "learning_rate": 9.596633380063544e-05, + "loss": 1.78, + "step": 2571 + }, + { + "epoch": 0.14335878713561118, + "grad_norm": 0.5358686447143555, + "learning_rate": 9.596282929762492e-05, + "loss": 1.848, + "step": 2572 + }, + { + "epoch": 0.1434145253887743, + "grad_norm": 0.5355905890464783, + "learning_rate": 9.595932333694142e-05, + "loss": 1.847, + "step": 2573 + }, + { + "epoch": 0.14347026364193746, + "grad_norm": 0.5640880465507507, + "learning_rate": 9.595581591869616e-05, + "loss": 1.713, + "step": 2574 + }, + { + "epoch": 0.14352600189510062, + "grad_norm": 0.5763548016548157, + "learning_rate": 9.595230704300035e-05, + "loss": 1.9647, + "step": 2575 + }, + { + "epoch": 0.14358174014826375, + "grad_norm": 0.5426276922225952, + "learning_rate": 9.594879670996528e-05, + "loss": 1.7378, + "step": 2576 + }, + { + "epoch": 0.1436374784014269, + "grad_norm": 0.5128087997436523, + "learning_rate": 9.594528491970228e-05, + "loss": 1.7663, + "step": 2577 + }, + { + "epoch": 0.14369321665459003, + "grad_norm": 0.5331497192382812, + "learning_rate": 9.594177167232273e-05, + "loss": 1.6068, + "step": 2578 + }, + { + "epoch": 0.1437489549077532, + "grad_norm": 0.5513312220573425, + "learning_rate": 9.593825696793803e-05, + "loss": 1.6527, + "step": 2579 + }, + { + "epoch": 0.14380469316091635, + "grad_norm": 0.5069592595100403, + "learning_rate": 9.593474080665968e-05, + "loss": 1.5839, + "step": 2580 + }, + { + "epoch": 0.14386043141407948, + "grad_norm": 0.5478212237358093, + "learning_rate": 9.593122318859915e-05, + "loss": 1.8217, + "step": 2581 + }, + { + "epoch": 0.14391616966724263, + "grad_norm": 0.5398098230361938, + "learning_rate": 9.592770411386802e-05, + "loss": 1.8395, + "step": 2582 + }, + { + "epoch": 0.14397190792040576, + "grad_norm": 0.535152792930603, + "learning_rate": 9.592418358257789e-05, + "loss": 1.8477, + "step": 2583 + }, + { + "epoch": 0.14402764617356892, + "grad_norm": 0.5321324467658997, + "learning_rate": 9.592066159484043e-05, + "loss": 1.6152, + "step": 2584 + }, + { + "epoch": 0.14408338442673208, + "grad_norm": 0.525637686252594, + "learning_rate": 9.59171381507673e-05, + "loss": 1.8558, + "step": 2585 + }, + { + "epoch": 0.1441391226798952, + "grad_norm": 0.5971347689628601, + "learning_rate": 9.591361325047028e-05, + "loss": 1.8752, + "step": 2586 + }, + { + "epoch": 0.14419486093305836, + "grad_norm": 0.5029361844062805, + "learning_rate": 9.591008689406114e-05, + "loss": 1.6977, + "step": 2587 + }, + { + "epoch": 0.14425059918622152, + "grad_norm": 0.5642208456993103, + "learning_rate": 9.59065590816517e-05, + "loss": 1.8379, + "step": 2588 + }, + { + "epoch": 0.14430633743938465, + "grad_norm": 0.5269021391868591, + "learning_rate": 9.590302981335387e-05, + "loss": 1.98, + "step": 2589 + }, + { + "epoch": 0.1443620756925478, + "grad_norm": 0.5572815537452698, + "learning_rate": 9.589949908927957e-05, + "loss": 1.7123, + "step": 2590 + }, + { + "epoch": 0.14441781394571093, + "grad_norm": 0.5520729422569275, + "learning_rate": 9.589596690954077e-05, + "loss": 1.8578, + "step": 2591 + }, + { + "epoch": 0.1444735521988741, + "grad_norm": 0.5181688070297241, + "learning_rate": 9.589243327424951e-05, + "loss": 1.7641, + "step": 2592 + }, + { + "epoch": 0.14452929045203725, + "grad_norm": 0.5066071152687073, + "learning_rate": 9.588889818351781e-05, + "loss": 1.6991, + "step": 2593 + }, + { + "epoch": 0.14458502870520037, + "grad_norm": 0.5530059933662415, + "learning_rate": 9.588536163745782e-05, + "loss": 1.7019, + "step": 2594 + }, + { + "epoch": 0.14464076695836353, + "grad_norm": 0.5519603490829468, + "learning_rate": 9.58818236361817e-05, + "loss": 1.6645, + "step": 2595 + }, + { + "epoch": 0.14469650521152666, + "grad_norm": 0.6039948463439941, + "learning_rate": 9.587828417980163e-05, + "loss": 2.0606, + "step": 2596 + }, + { + "epoch": 0.14475224346468982, + "grad_norm": 0.5822129845619202, + "learning_rate": 9.587474326842987e-05, + "loss": 1.8879, + "step": 2597 + }, + { + "epoch": 0.14480798171785297, + "grad_norm": 0.5391368865966797, + "learning_rate": 9.587120090217874e-05, + "loss": 1.6668, + "step": 2598 + }, + { + "epoch": 0.1448637199710161, + "grad_norm": 0.505940854549408, + "learning_rate": 9.586765708116056e-05, + "loss": 1.6322, + "step": 2599 + }, + { + "epoch": 0.14491945822417926, + "grad_norm": 0.5613484978675842, + "learning_rate": 9.586411180548771e-05, + "loss": 1.7002, + "step": 2600 + }, + { + "epoch": 0.1449751964773424, + "grad_norm": 0.5343160629272461, + "learning_rate": 9.586056507527266e-05, + "loss": 1.8232, + "step": 2601 + }, + { + "epoch": 0.14503093473050555, + "grad_norm": 0.5221366286277771, + "learning_rate": 9.585701689062785e-05, + "loss": 1.7799, + "step": 2602 + }, + { + "epoch": 0.1450866729836687, + "grad_norm": 0.503301739692688, + "learning_rate": 9.585346725166584e-05, + "loss": 1.5724, + "step": 2603 + }, + { + "epoch": 0.14514241123683183, + "grad_norm": 0.5650082230567932, + "learning_rate": 9.584991615849921e-05, + "loss": 1.898, + "step": 2604 + }, + { + "epoch": 0.145198149489995, + "grad_norm": 0.4780997633934021, + "learning_rate": 9.584636361124054e-05, + "loss": 1.5643, + "step": 2605 + }, + { + "epoch": 0.14525388774315812, + "grad_norm": 0.5057533979415894, + "learning_rate": 9.584280961000253e-05, + "loss": 1.575, + "step": 2606 + }, + { + "epoch": 0.14530962599632127, + "grad_norm": 0.530737578868866, + "learning_rate": 9.583925415489787e-05, + "loss": 1.7932, + "step": 2607 + }, + { + "epoch": 0.14536536424948443, + "grad_norm": 0.603374719619751, + "learning_rate": 9.583569724603934e-05, + "loss": 2.0627, + "step": 2608 + }, + { + "epoch": 0.14542110250264756, + "grad_norm": 0.5549886226654053, + "learning_rate": 9.583213888353972e-05, + "loss": 1.7767, + "step": 2609 + }, + { + "epoch": 0.14547684075581072, + "grad_norm": 0.6217805743217468, + "learning_rate": 9.582857906751191e-05, + "loss": 2.05, + "step": 2610 + }, + { + "epoch": 0.14553257900897387, + "grad_norm": 0.5606620907783508, + "learning_rate": 9.582501779806874e-05, + "loss": 1.7722, + "step": 2611 + }, + { + "epoch": 0.145588317262137, + "grad_norm": 0.5387722253799438, + "learning_rate": 9.582145507532319e-05, + "loss": 1.6958, + "step": 2612 + }, + { + "epoch": 0.14564405551530016, + "grad_norm": 0.557847797870636, + "learning_rate": 9.581789089938825e-05, + "loss": 1.8401, + "step": 2613 + }, + { + "epoch": 0.1456997937684633, + "grad_norm": 0.5201898217201233, + "learning_rate": 9.581432527037693e-05, + "loss": 1.7684, + "step": 2614 + }, + { + "epoch": 0.14575553202162644, + "grad_norm": 0.5138794183731079, + "learning_rate": 9.581075818840234e-05, + "loss": 1.7435, + "step": 2615 + }, + { + "epoch": 0.1458112702747896, + "grad_norm": 0.5721390247344971, + "learning_rate": 9.58071896535776e-05, + "loss": 1.8191, + "step": 2616 + }, + { + "epoch": 0.14586700852795273, + "grad_norm": 0.5593292117118835, + "learning_rate": 9.580361966601588e-05, + "loss": 1.877, + "step": 2617 + }, + { + "epoch": 0.1459227467811159, + "grad_norm": 0.5009481906890869, + "learning_rate": 9.580004822583038e-05, + "loss": 1.6282, + "step": 2618 + }, + { + "epoch": 0.14597848503427902, + "grad_norm": 0.4969474673271179, + "learning_rate": 9.579647533313439e-05, + "loss": 1.7076, + "step": 2619 + }, + { + "epoch": 0.14603422328744217, + "grad_norm": 0.5316969156265259, + "learning_rate": 9.579290098804122e-05, + "loss": 1.6271, + "step": 2620 + }, + { + "epoch": 0.14608996154060533, + "grad_norm": 0.5574962496757507, + "learning_rate": 9.578932519066422e-05, + "loss": 1.8687, + "step": 2621 + }, + { + "epoch": 0.14614569979376846, + "grad_norm": 0.499491423368454, + "learning_rate": 9.57857479411168e-05, + "loss": 1.6985, + "step": 2622 + }, + { + "epoch": 0.14620143804693161, + "grad_norm": 0.654602587223053, + "learning_rate": 9.57821692395124e-05, + "loss": 1.7291, + "step": 2623 + }, + { + "epoch": 0.14625717630009474, + "grad_norm": 0.5459001660346985, + "learning_rate": 9.577858908596451e-05, + "loss": 1.729, + "step": 2624 + }, + { + "epoch": 0.1463129145532579, + "grad_norm": 0.5157297849655151, + "learning_rate": 9.57750074805867e-05, + "loss": 1.4164, + "step": 2625 + }, + { + "epoch": 0.14636865280642106, + "grad_norm": 0.5205078125, + "learning_rate": 9.577142442349254e-05, + "loss": 1.7282, + "step": 2626 + }, + { + "epoch": 0.14642439105958419, + "grad_norm": 0.563706636428833, + "learning_rate": 9.576783991479565e-05, + "loss": 1.8092, + "step": 2627 + }, + { + "epoch": 0.14648012931274734, + "grad_norm": 0.5385141968727112, + "learning_rate": 9.576425395460973e-05, + "loss": 1.8241, + "step": 2628 + }, + { + "epoch": 0.14653586756591047, + "grad_norm": 0.6100838780403137, + "learning_rate": 9.576066654304849e-05, + "loss": 1.9425, + "step": 2629 + }, + { + "epoch": 0.14659160581907363, + "grad_norm": 0.5153439044952393, + "learning_rate": 9.575707768022572e-05, + "loss": 1.4287, + "step": 2630 + }, + { + "epoch": 0.14664734407223678, + "grad_norm": 0.5562304258346558, + "learning_rate": 9.575348736625523e-05, + "loss": 1.9308, + "step": 2631 + }, + { + "epoch": 0.14670308232539991, + "grad_norm": 0.5785409808158875, + "learning_rate": 9.574989560125087e-05, + "loss": 1.8831, + "step": 2632 + }, + { + "epoch": 0.14675882057856307, + "grad_norm": 0.5315858721733093, + "learning_rate": 9.574630238532658e-05, + "loss": 1.5871, + "step": 2633 + }, + { + "epoch": 0.14681455883172623, + "grad_norm": 0.5748802423477173, + "learning_rate": 9.574270771859628e-05, + "loss": 1.8394, + "step": 2634 + }, + { + "epoch": 0.14687029708488936, + "grad_norm": 0.5130333304405212, + "learning_rate": 9.5739111601174e-05, + "loss": 1.8598, + "step": 2635 + }, + { + "epoch": 0.1469260353380525, + "grad_norm": 0.5098990201950073, + "learning_rate": 9.573551403317378e-05, + "loss": 1.5862, + "step": 2636 + }, + { + "epoch": 0.14698177359121564, + "grad_norm": 0.5426929593086243, + "learning_rate": 9.573191501470971e-05, + "loss": 1.8026, + "step": 2637 + }, + { + "epoch": 0.1470375118443788, + "grad_norm": 0.5652133226394653, + "learning_rate": 9.572831454589592e-05, + "loss": 1.7529, + "step": 2638 + }, + { + "epoch": 0.14709325009754196, + "grad_norm": 0.5370623469352722, + "learning_rate": 9.572471262684662e-05, + "loss": 1.7851, + "step": 2639 + }, + { + "epoch": 0.14714898835070508, + "grad_norm": 0.5871500372886658, + "learning_rate": 9.572110925767601e-05, + "loss": 1.7617, + "step": 2640 + }, + { + "epoch": 0.14720472660386824, + "grad_norm": 0.5181992053985596, + "learning_rate": 9.571750443849841e-05, + "loss": 1.6418, + "step": 2641 + }, + { + "epoch": 0.14726046485703137, + "grad_norm": 0.5635068416595459, + "learning_rate": 9.571389816942811e-05, + "loss": 2.0309, + "step": 2642 + }, + { + "epoch": 0.14731620311019453, + "grad_norm": 0.5830138921737671, + "learning_rate": 9.571029045057948e-05, + "loss": 1.8764, + "step": 2643 + }, + { + "epoch": 0.14737194136335768, + "grad_norm": 0.5109788179397583, + "learning_rate": 9.570668128206697e-05, + "loss": 1.6183, + "step": 2644 + }, + { + "epoch": 0.1474276796165208, + "grad_norm": 0.5681736469268799, + "learning_rate": 9.5703070664005e-05, + "loss": 1.738, + "step": 2645 + }, + { + "epoch": 0.14748341786968397, + "grad_norm": 0.5385489463806152, + "learning_rate": 9.56994585965081e-05, + "loss": 1.7379, + "step": 2646 + }, + { + "epoch": 0.1475391561228471, + "grad_norm": 0.5935365557670593, + "learning_rate": 9.569584507969082e-05, + "loss": 1.6596, + "step": 2647 + }, + { + "epoch": 0.14759489437601025, + "grad_norm": 0.5758340358734131, + "learning_rate": 9.569223011366776e-05, + "loss": 1.7998, + "step": 2648 + }, + { + "epoch": 0.1476506326291734, + "grad_norm": 0.5150250196456909, + "learning_rate": 9.568861369855357e-05, + "loss": 1.5843, + "step": 2649 + }, + { + "epoch": 0.14770637088233654, + "grad_norm": 0.549801230430603, + "learning_rate": 9.568499583446293e-05, + "loss": 1.6966, + "step": 2650 + }, + { + "epoch": 0.1477621091354997, + "grad_norm": 0.5092233419418335, + "learning_rate": 9.568137652151059e-05, + "loss": 1.7318, + "step": 2651 + }, + { + "epoch": 0.14781784738866283, + "grad_norm": 0.5549139976501465, + "learning_rate": 9.567775575981133e-05, + "loss": 1.8252, + "step": 2652 + }, + { + "epoch": 0.14787358564182598, + "grad_norm": 0.5805264115333557, + "learning_rate": 9.567413354947997e-05, + "loss": 1.8455, + "step": 2653 + }, + { + "epoch": 0.14792932389498914, + "grad_norm": 0.5241934657096863, + "learning_rate": 9.56705098906314e-05, + "loss": 1.8003, + "step": 2654 + }, + { + "epoch": 0.14798506214815227, + "grad_norm": 0.5738681554794312, + "learning_rate": 9.566688478338053e-05, + "loss": 1.765, + "step": 2655 + }, + { + "epoch": 0.14804080040131543, + "grad_norm": 0.5123993158340454, + "learning_rate": 9.566325822784232e-05, + "loss": 1.686, + "step": 2656 + }, + { + "epoch": 0.14809653865447858, + "grad_norm": 0.5327409505844116, + "learning_rate": 9.56596302241318e-05, + "loss": 1.9386, + "step": 2657 + }, + { + "epoch": 0.1481522769076417, + "grad_norm": 0.4922872483730316, + "learning_rate": 9.565600077236403e-05, + "loss": 1.6464, + "step": 2658 + }, + { + "epoch": 0.14820801516080487, + "grad_norm": 0.5839138031005859, + "learning_rate": 9.565236987265411e-05, + "loss": 2.0237, + "step": 2659 + }, + { + "epoch": 0.148263753413968, + "grad_norm": 0.5407429933547974, + "learning_rate": 9.564873752511718e-05, + "loss": 1.9181, + "step": 2660 + }, + { + "epoch": 0.14831949166713115, + "grad_norm": 0.5354205369949341, + "learning_rate": 9.564510372986845e-05, + "loss": 1.9004, + "step": 2661 + }, + { + "epoch": 0.1483752299202943, + "grad_norm": 0.517620325088501, + "learning_rate": 9.564146848702316e-05, + "loss": 1.4634, + "step": 2662 + }, + { + "epoch": 0.14843096817345744, + "grad_norm": 0.513761579990387, + "learning_rate": 9.56378317966966e-05, + "loss": 1.7994, + "step": 2663 + }, + { + "epoch": 0.1484867064266206, + "grad_norm": 0.520189642906189, + "learning_rate": 9.56341936590041e-05, + "loss": 1.493, + "step": 2664 + }, + { + "epoch": 0.14854244467978373, + "grad_norm": 0.5256882905960083, + "learning_rate": 9.563055407406104e-05, + "loss": 1.747, + "step": 2665 + }, + { + "epoch": 0.14859818293294688, + "grad_norm": 0.5171797871589661, + "learning_rate": 9.562691304198286e-05, + "loss": 1.7043, + "step": 2666 + }, + { + "epoch": 0.14865392118611004, + "grad_norm": 0.5845912098884583, + "learning_rate": 9.5623270562885e-05, + "loss": 1.8348, + "step": 2667 + }, + { + "epoch": 0.14870965943927317, + "grad_norm": 0.5168249011039734, + "learning_rate": 9.561962663688302e-05, + "loss": 1.5255, + "step": 2668 + }, + { + "epoch": 0.14876539769243632, + "grad_norm": 0.5021228790283203, + "learning_rate": 9.561598126409245e-05, + "loss": 1.5113, + "step": 2669 + }, + { + "epoch": 0.14882113594559945, + "grad_norm": 0.5029981732368469, + "learning_rate": 9.561233444462894e-05, + "loss": 1.5927, + "step": 2670 + }, + { + "epoch": 0.1488768741987626, + "grad_norm": 0.5585193634033203, + "learning_rate": 9.56086861786081e-05, + "loss": 1.9007, + "step": 2671 + }, + { + "epoch": 0.14893261245192577, + "grad_norm": 0.4993244409561157, + "learning_rate": 9.560503646614564e-05, + "loss": 1.5592, + "step": 2672 + }, + { + "epoch": 0.1489883507050889, + "grad_norm": 0.4925285875797272, + "learning_rate": 9.560138530735734e-05, + "loss": 1.5822, + "step": 2673 + }, + { + "epoch": 0.14904408895825205, + "grad_norm": 0.5714946985244751, + "learning_rate": 9.559773270235896e-05, + "loss": 1.703, + "step": 2674 + }, + { + "epoch": 0.14909982721141518, + "grad_norm": 0.5588274598121643, + "learning_rate": 9.559407865126636e-05, + "loss": 1.7473, + "step": 2675 + }, + { + "epoch": 0.14915556546457834, + "grad_norm": 0.5327757000923157, + "learning_rate": 9.559042315419542e-05, + "loss": 1.6382, + "step": 2676 + }, + { + "epoch": 0.1492113037177415, + "grad_norm": 0.5377374887466431, + "learning_rate": 9.558676621126206e-05, + "loss": 1.7602, + "step": 2677 + }, + { + "epoch": 0.14926704197090462, + "grad_norm": 0.5468077659606934, + "learning_rate": 9.558310782258227e-05, + "loss": 1.7686, + "step": 2678 + }, + { + "epoch": 0.14932278022406778, + "grad_norm": 0.5344017744064331, + "learning_rate": 9.557944798827205e-05, + "loss": 1.6661, + "step": 2679 + }, + { + "epoch": 0.14937851847723094, + "grad_norm": 0.5011274218559265, + "learning_rate": 9.557578670844751e-05, + "loss": 1.6757, + "step": 2680 + }, + { + "epoch": 0.14943425673039407, + "grad_norm": 0.5330647826194763, + "learning_rate": 9.557212398322473e-05, + "loss": 1.8146, + "step": 2681 + }, + { + "epoch": 0.14948999498355722, + "grad_norm": 0.5211254954338074, + "learning_rate": 9.556845981271989e-05, + "loss": 1.7437, + "step": 2682 + }, + { + "epoch": 0.14954573323672035, + "grad_norm": 0.603344738483429, + "learning_rate": 9.556479419704918e-05, + "loss": 2.0424, + "step": 2683 + }, + { + "epoch": 0.1496014714898835, + "grad_norm": 0.5117289423942566, + "learning_rate": 9.556112713632885e-05, + "loss": 1.6523, + "step": 2684 + }, + { + "epoch": 0.14965720974304667, + "grad_norm": 0.5624164938926697, + "learning_rate": 9.555745863067522e-05, + "loss": 1.8348, + "step": 2685 + }, + { + "epoch": 0.1497129479962098, + "grad_norm": 0.4994141459465027, + "learning_rate": 9.555378868020461e-05, + "loss": 1.6003, + "step": 2686 + }, + { + "epoch": 0.14976868624937295, + "grad_norm": 0.5267731547355652, + "learning_rate": 9.555011728503343e-05, + "loss": 1.6412, + "step": 2687 + }, + { + "epoch": 0.14982442450253608, + "grad_norm": 0.4905613958835602, + "learning_rate": 9.554644444527812e-05, + "loss": 1.6397, + "step": 2688 + }, + { + "epoch": 0.14988016275569924, + "grad_norm": 0.5710086226463318, + "learning_rate": 9.554277016105512e-05, + "loss": 2.0408, + "step": 2689 + }, + { + "epoch": 0.1499359010088624, + "grad_norm": 0.5375673770904541, + "learning_rate": 9.5539094432481e-05, + "loss": 1.7599, + "step": 2690 + }, + { + "epoch": 0.14999163926202552, + "grad_norm": 0.5491001009941101, + "learning_rate": 9.55354172596723e-05, + "loss": 1.6704, + "step": 2691 + }, + { + "epoch": 0.15004737751518868, + "grad_norm": 0.5431581139564514, + "learning_rate": 9.553173864274567e-05, + "loss": 1.7792, + "step": 2692 + }, + { + "epoch": 0.1501031157683518, + "grad_norm": 0.5338147282600403, + "learning_rate": 9.552805858181775e-05, + "loss": 1.7461, + "step": 2693 + }, + { + "epoch": 0.15015885402151496, + "grad_norm": 0.5207554697990417, + "learning_rate": 9.552437707700526e-05, + "loss": 1.7735, + "step": 2694 + }, + { + "epoch": 0.15021459227467812, + "grad_norm": 0.515975296497345, + "learning_rate": 9.552069412842495e-05, + "loss": 1.6318, + "step": 2695 + }, + { + "epoch": 0.15027033052784125, + "grad_norm": 0.5207625031471252, + "learning_rate": 9.551700973619364e-05, + "loss": 1.665, + "step": 2696 + }, + { + "epoch": 0.1503260687810044, + "grad_norm": 0.5158435702323914, + "learning_rate": 9.551332390042816e-05, + "loss": 1.743, + "step": 2697 + }, + { + "epoch": 0.15038180703416754, + "grad_norm": 0.5647339224815369, + "learning_rate": 9.55096366212454e-05, + "loss": 1.9245, + "step": 2698 + }, + { + "epoch": 0.1504375452873307, + "grad_norm": 0.545265793800354, + "learning_rate": 9.55059478987623e-05, + "loss": 1.5553, + "step": 2699 + }, + { + "epoch": 0.15049328354049385, + "grad_norm": 0.5328176617622375, + "learning_rate": 9.550225773309586e-05, + "loss": 1.4489, + "step": 2700 + }, + { + "epoch": 0.15054902179365698, + "grad_norm": 0.5154641270637512, + "learning_rate": 9.54985661243631e-05, + "loss": 1.9052, + "step": 2701 + }, + { + "epoch": 0.15060476004682014, + "grad_norm": 0.5019435286521912, + "learning_rate": 9.54948730726811e-05, + "loss": 1.5049, + "step": 2702 + }, + { + "epoch": 0.1506604982999833, + "grad_norm": 0.557501494884491, + "learning_rate": 9.549117857816697e-05, + "loss": 1.8818, + "step": 2703 + }, + { + "epoch": 0.15071623655314642, + "grad_norm": 0.5352375507354736, + "learning_rate": 9.548748264093789e-05, + "loss": 1.6683, + "step": 2704 + }, + { + "epoch": 0.15077197480630958, + "grad_norm": 0.5106709599494934, + "learning_rate": 9.548378526111108e-05, + "loss": 1.6966, + "step": 2705 + }, + { + "epoch": 0.1508277130594727, + "grad_norm": 0.5565862655639648, + "learning_rate": 9.54800864388038e-05, + "loss": 1.8303, + "step": 2706 + }, + { + "epoch": 0.15088345131263586, + "grad_norm": 0.5492972135543823, + "learning_rate": 9.547638617413333e-05, + "loss": 1.8624, + "step": 2707 + }, + { + "epoch": 0.15093918956579902, + "grad_norm": 0.50017249584198, + "learning_rate": 9.547268446721702e-05, + "loss": 1.5654, + "step": 2708 + }, + { + "epoch": 0.15099492781896215, + "grad_norm": 0.48998236656188965, + "learning_rate": 9.54689813181723e-05, + "loss": 1.6074, + "step": 2709 + }, + { + "epoch": 0.1510506660721253, + "grad_norm": 0.5397832989692688, + "learning_rate": 9.54652767271166e-05, + "loss": 1.8095, + "step": 2710 + }, + { + "epoch": 0.15110640432528843, + "grad_norm": 0.5553854703903198, + "learning_rate": 9.54615706941674e-05, + "loss": 1.8065, + "step": 2711 + }, + { + "epoch": 0.1511621425784516, + "grad_norm": 0.5286390781402588, + "learning_rate": 9.545786321944223e-05, + "loss": 1.5857, + "step": 2712 + }, + { + "epoch": 0.15121788083161475, + "grad_norm": 0.4900679588317871, + "learning_rate": 9.545415430305869e-05, + "loss": 1.5847, + "step": 2713 + }, + { + "epoch": 0.15127361908477788, + "grad_norm": 0.5456913113594055, + "learning_rate": 9.545044394513439e-05, + "loss": 1.7911, + "step": 2714 + }, + { + "epoch": 0.15132935733794103, + "grad_norm": 0.5544347763061523, + "learning_rate": 9.544673214578698e-05, + "loss": 1.7341, + "step": 2715 + }, + { + "epoch": 0.15138509559110416, + "grad_norm": 0.5260149836540222, + "learning_rate": 9.544301890513423e-05, + "loss": 1.6531, + "step": 2716 + }, + { + "epoch": 0.15144083384426732, + "grad_norm": 0.5473960638046265, + "learning_rate": 9.543930422329386e-05, + "loss": 1.7704, + "step": 2717 + }, + { + "epoch": 0.15149657209743048, + "grad_norm": 0.5335630178451538, + "learning_rate": 9.543558810038368e-05, + "loss": 1.6427, + "step": 2718 + }, + { + "epoch": 0.1515523103505936, + "grad_norm": 0.558547854423523, + "learning_rate": 9.543187053652156e-05, + "loss": 1.9572, + "step": 2719 + }, + { + "epoch": 0.15160804860375676, + "grad_norm": 0.5423372983932495, + "learning_rate": 9.54281515318254e-05, + "loss": 1.6761, + "step": 2720 + }, + { + "epoch": 0.1516637868569199, + "grad_norm": 0.5132402181625366, + "learning_rate": 9.542443108641312e-05, + "loss": 1.8216, + "step": 2721 + }, + { + "epoch": 0.15171952511008305, + "grad_norm": 0.491897314786911, + "learning_rate": 9.542070920040274e-05, + "loss": 1.5411, + "step": 2722 + }, + { + "epoch": 0.1517752633632462, + "grad_norm": 0.5645871758460999, + "learning_rate": 9.541698587391229e-05, + "loss": 1.848, + "step": 2723 + }, + { + "epoch": 0.15183100161640933, + "grad_norm": 0.5238233208656311, + "learning_rate": 9.541326110705983e-05, + "loss": 1.7717, + "step": 2724 + }, + { + "epoch": 0.1518867398695725, + "grad_norm": 0.5333484411239624, + "learning_rate": 9.540953489996354e-05, + "loss": 1.6865, + "step": 2725 + }, + { + "epoch": 0.15194247812273565, + "grad_norm": 0.5394174456596375, + "learning_rate": 9.540580725274153e-05, + "loss": 1.7526, + "step": 2726 + }, + { + "epoch": 0.15199821637589878, + "grad_norm": 0.5119402408599854, + "learning_rate": 9.540207816551206e-05, + "loss": 1.7543, + "step": 2727 + }, + { + "epoch": 0.15205395462906193, + "grad_norm": 0.4968518912792206, + "learning_rate": 9.539834763839337e-05, + "loss": 1.4261, + "step": 2728 + }, + { + "epoch": 0.15210969288222506, + "grad_norm": 0.5909052491188049, + "learning_rate": 9.539461567150378e-05, + "loss": 1.9545, + "step": 2729 + }, + { + "epoch": 0.15216543113538822, + "grad_norm": 0.5353077054023743, + "learning_rate": 9.539088226496167e-05, + "loss": 1.7021, + "step": 2730 + }, + { + "epoch": 0.15222116938855138, + "grad_norm": 0.526706874370575, + "learning_rate": 9.538714741888541e-05, + "loss": 1.7132, + "step": 2731 + }, + { + "epoch": 0.1522769076417145, + "grad_norm": 0.5296183228492737, + "learning_rate": 9.538341113339346e-05, + "loss": 1.6896, + "step": 2732 + }, + { + "epoch": 0.15233264589487766, + "grad_norm": 0.5836046934127808, + "learning_rate": 9.537967340860432e-05, + "loss": 1.7815, + "step": 2733 + }, + { + "epoch": 0.1523883841480408, + "grad_norm": 0.5508841872215271, + "learning_rate": 9.537593424463651e-05, + "loss": 1.8918, + "step": 2734 + }, + { + "epoch": 0.15244412240120395, + "grad_norm": 0.522796630859375, + "learning_rate": 9.537219364160863e-05, + "loss": 1.7225, + "step": 2735 + }, + { + "epoch": 0.1524998606543671, + "grad_norm": 0.48475125432014465, + "learning_rate": 9.536845159963932e-05, + "loss": 1.5232, + "step": 2736 + }, + { + "epoch": 0.15255559890753023, + "grad_norm": 0.5141192674636841, + "learning_rate": 9.536470811884723e-05, + "loss": 1.8193, + "step": 2737 + }, + { + "epoch": 0.1526113371606934, + "grad_norm": 0.5721970796585083, + "learning_rate": 9.536096319935108e-05, + "loss": 1.9167, + "step": 2738 + }, + { + "epoch": 0.15266707541385652, + "grad_norm": 0.53280109167099, + "learning_rate": 9.535721684126967e-05, + "loss": 1.8613, + "step": 2739 + }, + { + "epoch": 0.15272281366701967, + "grad_norm": 0.5099390745162964, + "learning_rate": 9.535346904472177e-05, + "loss": 1.6646, + "step": 2740 + }, + { + "epoch": 0.15277855192018283, + "grad_norm": 0.8719338774681091, + "learning_rate": 9.53497198098263e-05, + "loss": 1.7495, + "step": 2741 + }, + { + "epoch": 0.15283429017334596, + "grad_norm": 0.6453019380569458, + "learning_rate": 9.53459691367021e-05, + "loss": 1.9952, + "step": 2742 + }, + { + "epoch": 0.15289002842650912, + "grad_norm": 0.5782769322395325, + "learning_rate": 9.534221702546814e-05, + "loss": 1.9164, + "step": 2743 + }, + { + "epoch": 0.15294576667967225, + "grad_norm": 0.4970633387565613, + "learning_rate": 9.533846347624343e-05, + "loss": 1.7106, + "step": 2744 + }, + { + "epoch": 0.1530015049328354, + "grad_norm": 0.5226539373397827, + "learning_rate": 9.533470848914698e-05, + "loss": 1.6197, + "step": 2745 + }, + { + "epoch": 0.15305724318599856, + "grad_norm": 0.5139595866203308, + "learning_rate": 9.533095206429792e-05, + "loss": 1.7638, + "step": 2746 + }, + { + "epoch": 0.1531129814391617, + "grad_norm": 0.5007668733596802, + "learning_rate": 9.532719420181535e-05, + "loss": 1.5744, + "step": 2747 + }, + { + "epoch": 0.15316871969232485, + "grad_norm": 0.5414915084838867, + "learning_rate": 9.532343490181845e-05, + "loss": 1.748, + "step": 2748 + }, + { + "epoch": 0.153224457945488, + "grad_norm": 0.6250778436660767, + "learning_rate": 9.531967416442646e-05, + "loss": 1.8845, + "step": 2749 + }, + { + "epoch": 0.15328019619865113, + "grad_norm": 0.5204728245735168, + "learning_rate": 9.531591198975863e-05, + "loss": 1.7691, + "step": 2750 + }, + { + "epoch": 0.1533359344518143, + "grad_norm": 0.5631746649742126, + "learning_rate": 9.531214837793429e-05, + "loss": 1.6964, + "step": 2751 + }, + { + "epoch": 0.15339167270497742, + "grad_norm": 0.49102160334587097, + "learning_rate": 9.530838332907278e-05, + "loss": 1.6693, + "step": 2752 + }, + { + "epoch": 0.15344741095814057, + "grad_norm": 0.5530296564102173, + "learning_rate": 9.530461684329352e-05, + "loss": 1.932, + "step": 2753 + }, + { + "epoch": 0.15350314921130373, + "grad_norm": 0.4979936480522156, + "learning_rate": 9.530084892071596e-05, + "loss": 1.6084, + "step": 2754 + }, + { + "epoch": 0.15355888746446686, + "grad_norm": 0.5499585270881653, + "learning_rate": 9.52970795614596e-05, + "loss": 1.8431, + "step": 2755 + }, + { + "epoch": 0.15361462571763002, + "grad_norm": 0.5399606227874756, + "learning_rate": 9.529330876564398e-05, + "loss": 1.7747, + "step": 2756 + }, + { + "epoch": 0.15367036397079314, + "grad_norm": 0.5473707914352417, + "learning_rate": 9.528953653338867e-05, + "loss": 1.7633, + "step": 2757 + }, + { + "epoch": 0.1537261022239563, + "grad_norm": 0.5312392711639404, + "learning_rate": 9.528576286481332e-05, + "loss": 1.7155, + "step": 2758 + }, + { + "epoch": 0.15378184047711946, + "grad_norm": 0.5812214016914368, + "learning_rate": 9.52819877600376e-05, + "loss": 1.7427, + "step": 2759 + }, + { + "epoch": 0.1538375787302826, + "grad_norm": 0.5881000757217407, + "learning_rate": 9.527821121918126e-05, + "loss": 1.9338, + "step": 2760 + }, + { + "epoch": 0.15389331698344574, + "grad_norm": 0.4990249574184418, + "learning_rate": 9.527443324236403e-05, + "loss": 1.6865, + "step": 2761 + }, + { + "epoch": 0.15394905523660887, + "grad_norm": 0.5099406242370605, + "learning_rate": 9.527065382970576e-05, + "loss": 1.4843, + "step": 2762 + }, + { + "epoch": 0.15400479348977203, + "grad_norm": 0.555368959903717, + "learning_rate": 9.52668729813263e-05, + "loss": 1.7174, + "step": 2763 + }, + { + "epoch": 0.1540605317429352, + "grad_norm": 0.5384423136711121, + "learning_rate": 9.526309069734553e-05, + "loss": 1.8855, + "step": 2764 + }, + { + "epoch": 0.15411626999609832, + "grad_norm": 0.5143032073974609, + "learning_rate": 9.525930697788345e-05, + "loss": 1.7095, + "step": 2765 + }, + { + "epoch": 0.15417200824926147, + "grad_norm": 0.4992869794368744, + "learning_rate": 9.525552182306003e-05, + "loss": 1.5436, + "step": 2766 + }, + { + "epoch": 0.1542277465024246, + "grad_norm": 0.5122644901275635, + "learning_rate": 9.525173523299531e-05, + "loss": 1.8488, + "step": 2767 + }, + { + "epoch": 0.15428348475558776, + "grad_norm": 0.49027514457702637, + "learning_rate": 9.524794720780938e-05, + "loss": 1.6764, + "step": 2768 + }, + { + "epoch": 0.15433922300875091, + "grad_norm": 0.5170779824256897, + "learning_rate": 9.524415774762239e-05, + "loss": 1.7393, + "step": 2769 + }, + { + "epoch": 0.15439496126191404, + "grad_norm": 0.5226306319236755, + "learning_rate": 9.52403668525545e-05, + "loss": 1.6587, + "step": 2770 + }, + { + "epoch": 0.1544506995150772, + "grad_norm": 0.5146019458770752, + "learning_rate": 9.523657452272594e-05, + "loss": 1.5704, + "step": 2771 + }, + { + "epoch": 0.15450643776824036, + "grad_norm": 0.5141226649284363, + "learning_rate": 9.5232780758257e-05, + "loss": 1.6701, + "step": 2772 + }, + { + "epoch": 0.15456217602140349, + "grad_norm": 0.5106475353240967, + "learning_rate": 9.522898555926796e-05, + "loss": 1.7997, + "step": 2773 + }, + { + "epoch": 0.15461791427456664, + "grad_norm": 0.4933443069458008, + "learning_rate": 9.52251889258792e-05, + "loss": 1.4629, + "step": 2774 + }, + { + "epoch": 0.15467365252772977, + "grad_norm": 0.547154426574707, + "learning_rate": 9.522139085821113e-05, + "loss": 1.7481, + "step": 2775 + }, + { + "epoch": 0.15472939078089293, + "grad_norm": 0.5420608520507812, + "learning_rate": 9.521759135638422e-05, + "loss": 1.781, + "step": 2776 + }, + { + "epoch": 0.15478512903405608, + "grad_norm": 0.5556414723396301, + "learning_rate": 9.521379042051894e-05, + "loss": 1.5232, + "step": 2777 + }, + { + "epoch": 0.1548408672872192, + "grad_norm": 0.546357274055481, + "learning_rate": 9.520998805073584e-05, + "loss": 1.663, + "step": 2778 + }, + { + "epoch": 0.15489660554038237, + "grad_norm": 0.5195935964584351, + "learning_rate": 9.52061842471555e-05, + "loss": 1.632, + "step": 2779 + }, + { + "epoch": 0.1549523437935455, + "grad_norm": 0.5412857532501221, + "learning_rate": 9.520237900989858e-05, + "loss": 1.7983, + "step": 2780 + }, + { + "epoch": 0.15500808204670866, + "grad_norm": 0.5480208992958069, + "learning_rate": 9.519857233908574e-05, + "loss": 2.0205, + "step": 2781 + }, + { + "epoch": 0.1550638202998718, + "grad_norm": 0.5754556655883789, + "learning_rate": 9.519476423483771e-05, + "loss": 1.9992, + "step": 2782 + }, + { + "epoch": 0.15511955855303494, + "grad_norm": 0.560160756111145, + "learning_rate": 9.519095469727527e-05, + "loss": 1.8583, + "step": 2783 + }, + { + "epoch": 0.1551752968061981, + "grad_norm": 0.5757945775985718, + "learning_rate": 9.518714372651922e-05, + "loss": 1.9257, + "step": 2784 + }, + { + "epoch": 0.15523103505936123, + "grad_norm": 0.861761212348938, + "learning_rate": 9.518333132269043e-05, + "loss": 1.8291, + "step": 2785 + }, + { + "epoch": 0.15528677331252438, + "grad_norm": 0.5081753134727478, + "learning_rate": 9.517951748590983e-05, + "loss": 1.5859, + "step": 2786 + }, + { + "epoch": 0.15534251156568754, + "grad_norm": 0.5519318580627441, + "learning_rate": 9.517570221629833e-05, + "loss": 1.7556, + "step": 2787 + }, + { + "epoch": 0.15539824981885067, + "grad_norm": 0.5754350423812866, + "learning_rate": 9.517188551397695e-05, + "loss": 1.8201, + "step": 2788 + }, + { + "epoch": 0.15545398807201383, + "grad_norm": 0.5522143840789795, + "learning_rate": 9.516806737906674e-05, + "loss": 1.7392, + "step": 2789 + }, + { + "epoch": 0.15550972632517696, + "grad_norm": 0.5845313668251038, + "learning_rate": 9.516424781168877e-05, + "loss": 1.7216, + "step": 2790 + }, + { + "epoch": 0.1555654645783401, + "grad_norm": 0.57271808385849, + "learning_rate": 9.516042681196419e-05, + "loss": 1.561, + "step": 2791 + }, + { + "epoch": 0.15562120283150327, + "grad_norm": 0.5778896808624268, + "learning_rate": 9.515660438001417e-05, + "loss": 2.061, + "step": 2792 + }, + { + "epoch": 0.1556769410846664, + "grad_norm": 0.5089336633682251, + "learning_rate": 9.515278051595996e-05, + "loss": 1.5716, + "step": 2793 + }, + { + "epoch": 0.15573267933782955, + "grad_norm": 0.5174574255943298, + "learning_rate": 9.514895521992278e-05, + "loss": 1.5369, + "step": 2794 + }, + { + "epoch": 0.1557884175909927, + "grad_norm": 0.5474531650543213, + "learning_rate": 9.5145128492024e-05, + "loss": 1.9497, + "step": 2795 + }, + { + "epoch": 0.15584415584415584, + "grad_norm": 0.5397194027900696, + "learning_rate": 9.514130033238494e-05, + "loss": 1.7145, + "step": 2796 + }, + { + "epoch": 0.155899894097319, + "grad_norm": 0.5489051938056946, + "learning_rate": 9.513747074112705e-05, + "loss": 1.599, + "step": 2797 + }, + { + "epoch": 0.15595563235048213, + "grad_norm": 0.5342767834663391, + "learning_rate": 9.513363971837174e-05, + "loss": 1.6787, + "step": 2798 + }, + { + "epoch": 0.15601137060364528, + "grad_norm": 0.5298926830291748, + "learning_rate": 9.512980726424052e-05, + "loss": 1.6852, + "step": 2799 + }, + { + "epoch": 0.15606710885680844, + "grad_norm": 0.5444782376289368, + "learning_rate": 9.512597337885496e-05, + "loss": 1.6972, + "step": 2800 + }, + { + "epoch": 0.15612284710997157, + "grad_norm": 0.5541877150535583, + "learning_rate": 9.51221380623366e-05, + "loss": 1.6794, + "step": 2801 + }, + { + "epoch": 0.15617858536313473, + "grad_norm": 0.6140812039375305, + "learning_rate": 9.511830131480712e-05, + "loss": 1.6826, + "step": 2802 + }, + { + "epoch": 0.15623432361629785, + "grad_norm": 0.5042434930801392, + "learning_rate": 9.511446313638819e-05, + "loss": 1.6276, + "step": 2803 + }, + { + "epoch": 0.156290061869461, + "grad_norm": 0.5544094443321228, + "learning_rate": 9.51106235272015e-05, + "loss": 1.7685, + "step": 2804 + }, + { + "epoch": 0.15634580012262417, + "grad_norm": 0.49621298909187317, + "learning_rate": 9.510678248736887e-05, + "loss": 1.6194, + "step": 2805 + }, + { + "epoch": 0.1564015383757873, + "grad_norm": 0.5988842248916626, + "learning_rate": 9.510294001701208e-05, + "loss": 1.8121, + "step": 2806 + }, + { + "epoch": 0.15645727662895045, + "grad_norm": 0.5324400067329407, + "learning_rate": 9.509909611625298e-05, + "loss": 1.7674, + "step": 2807 + }, + { + "epoch": 0.15651301488211358, + "grad_norm": 0.5413124561309814, + "learning_rate": 9.509525078521353e-05, + "loss": 1.5738, + "step": 2808 + }, + { + "epoch": 0.15656875313527674, + "grad_norm": 0.5253452658653259, + "learning_rate": 9.509140402401563e-05, + "loss": 1.7126, + "step": 2809 + }, + { + "epoch": 0.1566244913884399, + "grad_norm": 0.5672581791877747, + "learning_rate": 9.508755583278131e-05, + "loss": 1.8056, + "step": 2810 + }, + { + "epoch": 0.15668022964160302, + "grad_norm": 0.49362093210220337, + "learning_rate": 9.508370621163259e-05, + "loss": 1.7569, + "step": 2811 + }, + { + "epoch": 0.15673596789476618, + "grad_norm": 0.5672383308410645, + "learning_rate": 9.507985516069154e-05, + "loss": 2.0115, + "step": 2812 + }, + { + "epoch": 0.1567917061479293, + "grad_norm": 0.576835036277771, + "learning_rate": 9.507600268008034e-05, + "loss": 2.0173, + "step": 2813 + }, + { + "epoch": 0.15684744440109247, + "grad_norm": 0.5514403581619263, + "learning_rate": 9.507214876992116e-05, + "loss": 1.711, + "step": 2814 + }, + { + "epoch": 0.15690318265425562, + "grad_norm": 0.5197775363922119, + "learning_rate": 9.506829343033619e-05, + "loss": 1.7613, + "step": 2815 + }, + { + "epoch": 0.15695892090741875, + "grad_norm": 0.5949315428733826, + "learning_rate": 9.506443666144773e-05, + "loss": 1.9146, + "step": 2816 + }, + { + "epoch": 0.1570146591605819, + "grad_norm": 0.5169588923454285, + "learning_rate": 9.506057846337808e-05, + "loss": 1.5925, + "step": 2817 + }, + { + "epoch": 0.15707039741374507, + "grad_norm": 0.5083977580070496, + "learning_rate": 9.505671883624959e-05, + "loss": 1.7269, + "step": 2818 + }, + { + "epoch": 0.1571261356669082, + "grad_norm": 0.5890203714370728, + "learning_rate": 9.505285778018469e-05, + "loss": 1.9239, + "step": 2819 + }, + { + "epoch": 0.15718187392007135, + "grad_norm": 0.5113581418991089, + "learning_rate": 9.504899529530582e-05, + "loss": 1.4883, + "step": 2820 + }, + { + "epoch": 0.15723761217323448, + "grad_norm": 0.5035502314567566, + "learning_rate": 9.504513138173547e-05, + "loss": 1.5673, + "step": 2821 + }, + { + "epoch": 0.15729335042639764, + "grad_norm": 0.5176184773445129, + "learning_rate": 9.504126603959618e-05, + "loss": 1.492, + "step": 2822 + }, + { + "epoch": 0.1573490886795608, + "grad_norm": 0.5595249533653259, + "learning_rate": 9.503739926901055e-05, + "loss": 1.916, + "step": 2823 + }, + { + "epoch": 0.15740482693272392, + "grad_norm": 0.5306408405303955, + "learning_rate": 9.50335310701012e-05, + "loss": 1.8255, + "step": 2824 + }, + { + "epoch": 0.15746056518588708, + "grad_norm": 0.5166139602661133, + "learning_rate": 9.50296614429908e-05, + "loss": 1.9614, + "step": 2825 + }, + { + "epoch": 0.1575163034390502, + "grad_norm": 0.5143607258796692, + "learning_rate": 9.502579038780207e-05, + "loss": 1.5858, + "step": 2826 + }, + { + "epoch": 0.15757204169221337, + "grad_norm": 0.5186240673065186, + "learning_rate": 9.50219179046578e-05, + "loss": 1.6746, + "step": 2827 + }, + { + "epoch": 0.15762777994537652, + "grad_norm": 0.5193765759468079, + "learning_rate": 9.50180439936808e-05, + "loss": 1.5768, + "step": 2828 + }, + { + "epoch": 0.15768351819853965, + "grad_norm": 0.5847373604774475, + "learning_rate": 9.501416865499391e-05, + "loss": 2.0199, + "step": 2829 + }, + { + "epoch": 0.1577392564517028, + "grad_norm": 0.5198137760162354, + "learning_rate": 9.501029188872004e-05, + "loss": 1.6215, + "step": 2830 + }, + { + "epoch": 0.15779499470486594, + "grad_norm": 0.5044419169425964, + "learning_rate": 9.500641369498214e-05, + "loss": 1.6355, + "step": 2831 + }, + { + "epoch": 0.1578507329580291, + "grad_norm": 0.6085756421089172, + "learning_rate": 9.50025340739032e-05, + "loss": 2.107, + "step": 2832 + }, + { + "epoch": 0.15790647121119225, + "grad_norm": 0.5201433300971985, + "learning_rate": 9.499865302560626e-05, + "loss": 1.5787, + "step": 2833 + }, + { + "epoch": 0.15796220946435538, + "grad_norm": 0.5003561973571777, + "learning_rate": 9.49947705502144e-05, + "loss": 1.6343, + "step": 2834 + }, + { + "epoch": 0.15801794771751854, + "grad_norm": 0.5781692862510681, + "learning_rate": 9.499088664785077e-05, + "loss": 1.8281, + "step": 2835 + }, + { + "epoch": 0.15807368597068167, + "grad_norm": 0.5135318636894226, + "learning_rate": 9.498700131863853e-05, + "loss": 1.7294, + "step": 2836 + }, + { + "epoch": 0.15812942422384482, + "grad_norm": 0.5199892520904541, + "learning_rate": 9.49831145627009e-05, + "loss": 1.6611, + "step": 2837 + }, + { + "epoch": 0.15818516247700798, + "grad_norm": 0.49417805671691895, + "learning_rate": 9.497922638016114e-05, + "loss": 1.4057, + "step": 2838 + }, + { + "epoch": 0.1582409007301711, + "grad_norm": 0.5626333951950073, + "learning_rate": 9.497533677114257e-05, + "loss": 1.7803, + "step": 2839 + }, + { + "epoch": 0.15829663898333426, + "grad_norm": 0.5851137042045593, + "learning_rate": 9.497144573576855e-05, + "loss": 1.7828, + "step": 2840 + }, + { + "epoch": 0.15835237723649742, + "grad_norm": 0.5782892107963562, + "learning_rate": 9.496755327416245e-05, + "loss": 1.9224, + "step": 2841 + }, + { + "epoch": 0.15840811548966055, + "grad_norm": 0.519010603427887, + "learning_rate": 9.496365938644775e-05, + "loss": 1.6932, + "step": 2842 + }, + { + "epoch": 0.1584638537428237, + "grad_norm": 0.588720440864563, + "learning_rate": 9.495976407274794e-05, + "loss": 1.7235, + "step": 2843 + }, + { + "epoch": 0.15851959199598684, + "grad_norm": 0.530684769153595, + "learning_rate": 9.495586733318654e-05, + "loss": 1.7368, + "step": 2844 + }, + { + "epoch": 0.15857533024915, + "grad_norm": 0.5223602652549744, + "learning_rate": 9.495196916788714e-05, + "loss": 1.5822, + "step": 2845 + }, + { + "epoch": 0.15863106850231315, + "grad_norm": 0.5282277464866638, + "learning_rate": 9.494806957697337e-05, + "loss": 1.7119, + "step": 2846 + }, + { + "epoch": 0.15868680675547628, + "grad_norm": 0.5861890912055969, + "learning_rate": 9.49441685605689e-05, + "loss": 1.7597, + "step": 2847 + }, + { + "epoch": 0.15874254500863944, + "grad_norm": 0.6072325110435486, + "learning_rate": 9.494026611879744e-05, + "loss": 2.1445, + "step": 2848 + }, + { + "epoch": 0.15879828326180256, + "grad_norm": 0.5348519086837769, + "learning_rate": 9.493636225178276e-05, + "loss": 1.5885, + "step": 2849 + }, + { + "epoch": 0.15885402151496572, + "grad_norm": 0.5133005976676941, + "learning_rate": 9.493245695964866e-05, + "loss": 1.7934, + "step": 2850 + }, + { + "epoch": 0.15890975976812888, + "grad_norm": 0.5469639897346497, + "learning_rate": 9.492855024251901e-05, + "loss": 1.7025, + "step": 2851 + }, + { + "epoch": 0.158965498021292, + "grad_norm": 0.5326577425003052, + "learning_rate": 9.492464210051771e-05, + "loss": 1.6258, + "step": 2852 + }, + { + "epoch": 0.15902123627445516, + "grad_norm": 0.6941805481910706, + "learning_rate": 9.492073253376865e-05, + "loss": 1.9171, + "step": 2853 + }, + { + "epoch": 0.1590769745276183, + "grad_norm": 0.5997553467750549, + "learning_rate": 9.491682154239589e-05, + "loss": 1.9891, + "step": 2854 + }, + { + "epoch": 0.15913271278078145, + "grad_norm": 0.5727251172065735, + "learning_rate": 9.491290912652344e-05, + "loss": 1.9522, + "step": 2855 + }, + { + "epoch": 0.1591884510339446, + "grad_norm": 0.5947685837745667, + "learning_rate": 9.490899528627536e-05, + "loss": 2.0334, + "step": 2856 + }, + { + "epoch": 0.15924418928710773, + "grad_norm": 0.5425087809562683, + "learning_rate": 9.490508002177579e-05, + "loss": 1.8532, + "step": 2857 + }, + { + "epoch": 0.1592999275402709, + "grad_norm": 0.5523599982261658, + "learning_rate": 9.490116333314889e-05, + "loss": 1.6041, + "step": 2858 + }, + { + "epoch": 0.15935566579343402, + "grad_norm": 0.5558710098266602, + "learning_rate": 9.489724522051888e-05, + "loss": 1.9383, + "step": 2859 + }, + { + "epoch": 0.15941140404659718, + "grad_norm": 0.5611505508422852, + "learning_rate": 9.489332568401004e-05, + "loss": 1.8919, + "step": 2860 + }, + { + "epoch": 0.15946714229976033, + "grad_norm": 0.5016571283340454, + "learning_rate": 9.488940472374663e-05, + "loss": 1.8347, + "step": 2861 + }, + { + "epoch": 0.15952288055292346, + "grad_norm": 0.5290272831916809, + "learning_rate": 9.488548233985305e-05, + "loss": 1.697, + "step": 2862 + }, + { + "epoch": 0.15957861880608662, + "grad_norm": 0.5488302707672119, + "learning_rate": 9.488155853245366e-05, + "loss": 1.9557, + "step": 2863 + }, + { + "epoch": 0.15963435705924978, + "grad_norm": 0.5422006845474243, + "learning_rate": 9.487763330167291e-05, + "loss": 1.6364, + "step": 2864 + }, + { + "epoch": 0.1596900953124129, + "grad_norm": 0.5467256307601929, + "learning_rate": 9.487370664763529e-05, + "loss": 1.7917, + "step": 2865 + }, + { + "epoch": 0.15974583356557606, + "grad_norm": 0.538063108921051, + "learning_rate": 9.486977857046532e-05, + "loss": 1.8552, + "step": 2866 + }, + { + "epoch": 0.1598015718187392, + "grad_norm": 0.5502356886863708, + "learning_rate": 9.486584907028758e-05, + "loss": 1.6089, + "step": 2867 + }, + { + "epoch": 0.15985731007190235, + "grad_norm": 0.526684582233429, + "learning_rate": 9.48619181472267e-05, + "loss": 1.5357, + "step": 2868 + }, + { + "epoch": 0.1599130483250655, + "grad_norm": 0.5427432656288147, + "learning_rate": 9.485798580140735e-05, + "loss": 1.7628, + "step": 2869 + }, + { + "epoch": 0.15996878657822863, + "grad_norm": 0.5465673208236694, + "learning_rate": 9.485405203295421e-05, + "loss": 1.6318, + "step": 2870 + }, + { + "epoch": 0.1600245248313918, + "grad_norm": 0.5261492729187012, + "learning_rate": 9.485011684199207e-05, + "loss": 1.6422, + "step": 2871 + }, + { + "epoch": 0.16008026308455492, + "grad_norm": 0.571042001247406, + "learning_rate": 9.484618022864571e-05, + "loss": 1.5466, + "step": 2872 + }, + { + "epoch": 0.16013600133771808, + "grad_norm": 0.5928837656974792, + "learning_rate": 9.484224219304e-05, + "loss": 2.0925, + "step": 2873 + }, + { + "epoch": 0.16019173959088123, + "grad_norm": 0.4875600337982178, + "learning_rate": 9.48383027352998e-05, + "loss": 1.6183, + "step": 2874 + }, + { + "epoch": 0.16024747784404436, + "grad_norm": 0.5074633955955505, + "learning_rate": 9.483436185555007e-05, + "loss": 1.5593, + "step": 2875 + }, + { + "epoch": 0.16030321609720752, + "grad_norm": 0.553817093372345, + "learning_rate": 9.483041955391578e-05, + "loss": 1.7093, + "step": 2876 + }, + { + "epoch": 0.16035895435037065, + "grad_norm": 0.5676888823509216, + "learning_rate": 9.482647583052196e-05, + "loss": 1.7555, + "step": 2877 + }, + { + "epoch": 0.1604146926035338, + "grad_norm": 0.5311009883880615, + "learning_rate": 9.48225306854937e-05, + "loss": 1.7709, + "step": 2878 + }, + { + "epoch": 0.16047043085669696, + "grad_norm": 0.5391182899475098, + "learning_rate": 9.481858411895608e-05, + "loss": 1.7296, + "step": 2879 + }, + { + "epoch": 0.1605261691098601, + "grad_norm": 0.5432226657867432, + "learning_rate": 9.481463613103429e-05, + "loss": 1.7808, + "step": 2880 + }, + { + "epoch": 0.16058190736302325, + "grad_norm": 0.5264506936073303, + "learning_rate": 9.481068672185353e-05, + "loss": 1.6362, + "step": 2881 + }, + { + "epoch": 0.16063764561618638, + "grad_norm": 0.5308744311332703, + "learning_rate": 9.480673589153904e-05, + "loss": 1.5913, + "step": 2882 + }, + { + "epoch": 0.16069338386934953, + "grad_norm": 0.4966695308685303, + "learning_rate": 9.480278364021614e-05, + "loss": 1.6744, + "step": 2883 + }, + { + "epoch": 0.1607491221225127, + "grad_norm": 0.5250310301780701, + "learning_rate": 9.479882996801017e-05, + "loss": 1.5185, + "step": 2884 + }, + { + "epoch": 0.16080486037567582, + "grad_norm": 0.5288892388343811, + "learning_rate": 9.479487487504649e-05, + "loss": 1.5259, + "step": 2885 + }, + { + "epoch": 0.16086059862883897, + "grad_norm": 0.5666532516479492, + "learning_rate": 9.479091836145057e-05, + "loss": 1.7626, + "step": 2886 + }, + { + "epoch": 0.16091633688200213, + "grad_norm": 0.5458130836486816, + "learning_rate": 9.478696042734785e-05, + "loss": 1.6936, + "step": 2887 + }, + { + "epoch": 0.16097207513516526, + "grad_norm": 0.5105459690093994, + "learning_rate": 9.478300107286389e-05, + "loss": 1.4811, + "step": 2888 + }, + { + "epoch": 0.16102781338832842, + "grad_norm": 0.5251494646072388, + "learning_rate": 9.477904029812422e-05, + "loss": 1.7184, + "step": 2889 + }, + { + "epoch": 0.16108355164149155, + "grad_norm": 0.5484756231307983, + "learning_rate": 9.477507810325448e-05, + "loss": 1.4053, + "step": 2890 + }, + { + "epoch": 0.1611392898946547, + "grad_norm": 0.5894975066184998, + "learning_rate": 9.477111448838031e-05, + "loss": 2.0827, + "step": 2891 + }, + { + "epoch": 0.16119502814781786, + "grad_norm": 0.5738565921783447, + "learning_rate": 9.476714945362745e-05, + "loss": 1.8864, + "step": 2892 + }, + { + "epoch": 0.161250766400981, + "grad_norm": 0.6212289333343506, + "learning_rate": 9.47631829991216e-05, + "loss": 1.9475, + "step": 2893 + }, + { + "epoch": 0.16130650465414414, + "grad_norm": 0.6506125330924988, + "learning_rate": 9.475921512498857e-05, + "loss": 1.9044, + "step": 2894 + }, + { + "epoch": 0.16136224290730727, + "grad_norm": 0.5559994578361511, + "learning_rate": 9.475524583135421e-05, + "loss": 1.5211, + "step": 2895 + }, + { + "epoch": 0.16141798116047043, + "grad_norm": 0.5860363841056824, + "learning_rate": 9.475127511834438e-05, + "loss": 1.7724, + "step": 2896 + }, + { + "epoch": 0.1614737194136336, + "grad_norm": 0.5559065341949463, + "learning_rate": 9.474730298608504e-05, + "loss": 1.8392, + "step": 2897 + }, + { + "epoch": 0.16152945766679672, + "grad_norm": 0.5526688694953918, + "learning_rate": 9.474332943470213e-05, + "loss": 1.7909, + "step": 2898 + }, + { + "epoch": 0.16158519591995987, + "grad_norm": 0.5582461357116699, + "learning_rate": 9.47393544643217e-05, + "loss": 1.9106, + "step": 2899 + }, + { + "epoch": 0.161640934173123, + "grad_norm": 0.5841467380523682, + "learning_rate": 9.473537807506977e-05, + "loss": 1.922, + "step": 2900 + }, + { + "epoch": 0.16169667242628616, + "grad_norm": 0.5061233043670654, + "learning_rate": 9.47314002670725e-05, + "loss": 1.5719, + "step": 2901 + }, + { + "epoch": 0.16175241067944932, + "grad_norm": 0.4959016442298889, + "learning_rate": 9.472742104045599e-05, + "loss": 1.6517, + "step": 2902 + }, + { + "epoch": 0.16180814893261244, + "grad_norm": 0.5075359344482422, + "learning_rate": 9.472344039534646e-05, + "loss": 1.7661, + "step": 2903 + }, + { + "epoch": 0.1618638871857756, + "grad_norm": 0.5135536193847656, + "learning_rate": 9.471945833187018e-05, + "loss": 1.6874, + "step": 2904 + }, + { + "epoch": 0.16191962543893873, + "grad_norm": 0.5618202090263367, + "learning_rate": 9.471547485015341e-05, + "loss": 1.6745, + "step": 2905 + }, + { + "epoch": 0.1619753636921019, + "grad_norm": 0.5325173139572144, + "learning_rate": 9.471148995032247e-05, + "loss": 1.7141, + "step": 2906 + }, + { + "epoch": 0.16203110194526504, + "grad_norm": 0.521827220916748, + "learning_rate": 9.470750363250378e-05, + "loss": 1.595, + "step": 2907 + }, + { + "epoch": 0.16208684019842817, + "grad_norm": 0.5489259362220764, + "learning_rate": 9.470351589682372e-05, + "loss": 1.8687, + "step": 2908 + }, + { + "epoch": 0.16214257845159133, + "grad_norm": 0.5823487043380737, + "learning_rate": 9.469952674340877e-05, + "loss": 1.8964, + "step": 2909 + }, + { + "epoch": 0.16219831670475449, + "grad_norm": 0.5378115773200989, + "learning_rate": 9.469553617238546e-05, + "loss": 1.6171, + "step": 2910 + }, + { + "epoch": 0.16225405495791762, + "grad_norm": 0.500411331653595, + "learning_rate": 9.469154418388034e-05, + "loss": 1.7592, + "step": 2911 + }, + { + "epoch": 0.16230979321108077, + "grad_norm": 0.49383944272994995, + "learning_rate": 9.468755077801999e-05, + "loss": 1.6709, + "step": 2912 + }, + { + "epoch": 0.1623655314642439, + "grad_norm": 0.5428176522254944, + "learning_rate": 9.468355595493109e-05, + "loss": 1.7304, + "step": 2913 + }, + { + "epoch": 0.16242126971740706, + "grad_norm": 0.537581205368042, + "learning_rate": 9.467955971474031e-05, + "loss": 1.7252, + "step": 2914 + }, + { + "epoch": 0.16247700797057021, + "grad_norm": 0.5622221231460571, + "learning_rate": 9.46755620575744e-05, + "loss": 1.7643, + "step": 2915 + }, + { + "epoch": 0.16253274622373334, + "grad_norm": 0.5474369525909424, + "learning_rate": 9.467156298356015e-05, + "loss": 1.7263, + "step": 2916 + }, + { + "epoch": 0.1625884844768965, + "grad_norm": 0.5429725646972656, + "learning_rate": 9.466756249282435e-05, + "loss": 1.7771, + "step": 2917 + }, + { + "epoch": 0.16264422273005963, + "grad_norm": 0.5385332107543945, + "learning_rate": 9.466356058549393e-05, + "loss": 1.7372, + "step": 2918 + }, + { + "epoch": 0.16269996098322279, + "grad_norm": 0.5135955214500427, + "learning_rate": 9.465955726169575e-05, + "loss": 1.7296, + "step": 2919 + }, + { + "epoch": 0.16275569923638594, + "grad_norm": 0.5584880709648132, + "learning_rate": 9.46555525215568e-05, + "loss": 1.7907, + "step": 2920 + }, + { + "epoch": 0.16281143748954907, + "grad_norm": 0.5609123706817627, + "learning_rate": 9.46515463652041e-05, + "loss": 1.8558, + "step": 2921 + }, + { + "epoch": 0.16286717574271223, + "grad_norm": 0.5887969732284546, + "learning_rate": 9.464753879276467e-05, + "loss": 1.8673, + "step": 2922 + }, + { + "epoch": 0.16292291399587536, + "grad_norm": 0.5207127332687378, + "learning_rate": 9.464352980436562e-05, + "loss": 1.8252, + "step": 2923 + }, + { + "epoch": 0.1629786522490385, + "grad_norm": 0.4879356622695923, + "learning_rate": 9.463951940013411e-05, + "loss": 1.564, + "step": 2924 + }, + { + "epoch": 0.16303439050220167, + "grad_norm": 0.5253145098686218, + "learning_rate": 9.46355075801973e-05, + "loss": 1.731, + "step": 2925 + }, + { + "epoch": 0.1630901287553648, + "grad_norm": 0.5216013789176941, + "learning_rate": 9.463149434468244e-05, + "loss": 1.7954, + "step": 2926 + }, + { + "epoch": 0.16314586700852796, + "grad_norm": 0.5162796974182129, + "learning_rate": 9.46274796937168e-05, + "loss": 1.6639, + "step": 2927 + }, + { + "epoch": 0.16320160526169109, + "grad_norm": 0.5164597630500793, + "learning_rate": 9.462346362742767e-05, + "loss": 1.5104, + "step": 2928 + }, + { + "epoch": 0.16325734351485424, + "grad_norm": 0.5458294153213501, + "learning_rate": 9.461944614594248e-05, + "loss": 1.7081, + "step": 2929 + }, + { + "epoch": 0.1633130817680174, + "grad_norm": 0.525484025478363, + "learning_rate": 9.461542724938859e-05, + "loss": 1.8709, + "step": 2930 + }, + { + "epoch": 0.16336882002118053, + "grad_norm": 0.5675646662712097, + "learning_rate": 9.461140693789349e-05, + "loss": 1.7861, + "step": 2931 + }, + { + "epoch": 0.16342455827434368, + "grad_norm": 0.5174034833908081, + "learning_rate": 9.460738521158466e-05, + "loss": 1.745, + "step": 2932 + }, + { + "epoch": 0.16348029652750684, + "grad_norm": 0.5687560439109802, + "learning_rate": 9.460336207058964e-05, + "loss": 1.8071, + "step": 2933 + }, + { + "epoch": 0.16353603478066997, + "grad_norm": 0.5177374482154846, + "learning_rate": 9.459933751503604e-05, + "loss": 1.7359, + "step": 2934 + }, + { + "epoch": 0.16359177303383313, + "grad_norm": 0.5742724537849426, + "learning_rate": 9.459531154505147e-05, + "loss": 1.6545, + "step": 2935 + }, + { + "epoch": 0.16364751128699626, + "grad_norm": 0.555439293384552, + "learning_rate": 9.459128416076365e-05, + "loss": 1.5666, + "step": 2936 + }, + { + "epoch": 0.1637032495401594, + "grad_norm": 0.5305073857307434, + "learning_rate": 9.458725536230027e-05, + "loss": 1.8546, + "step": 2937 + }, + { + "epoch": 0.16375898779332257, + "grad_norm": 0.517587423324585, + "learning_rate": 9.458322514978912e-05, + "loss": 1.6707, + "step": 2938 + }, + { + "epoch": 0.1638147260464857, + "grad_norm": 0.5396296977996826, + "learning_rate": 9.4579193523358e-05, + "loss": 1.6807, + "step": 2939 + }, + { + "epoch": 0.16387046429964885, + "grad_norm": 0.545603334903717, + "learning_rate": 9.457516048313478e-05, + "loss": 1.7966, + "step": 2940 + }, + { + "epoch": 0.16392620255281198, + "grad_norm": 0.5535080432891846, + "learning_rate": 9.457112602924735e-05, + "loss": 1.8103, + "step": 2941 + }, + { + "epoch": 0.16398194080597514, + "grad_norm": 0.5278719663619995, + "learning_rate": 9.456709016182368e-05, + "loss": 1.7992, + "step": 2942 + }, + { + "epoch": 0.1640376790591383, + "grad_norm": 0.5094558000564575, + "learning_rate": 9.456305288099174e-05, + "loss": 1.8232, + "step": 2943 + }, + { + "epoch": 0.16409341731230143, + "grad_norm": 0.5989511013031006, + "learning_rate": 9.45590141868796e-05, + "loss": 1.8106, + "step": 2944 + }, + { + "epoch": 0.16414915556546458, + "grad_norm": 0.5221716165542603, + "learning_rate": 9.455497407961532e-05, + "loss": 1.6316, + "step": 2945 + }, + { + "epoch": 0.1642048938186277, + "grad_norm": 0.4996791481971741, + "learning_rate": 9.455093255932704e-05, + "loss": 1.4846, + "step": 2946 + }, + { + "epoch": 0.16426063207179087, + "grad_norm": 0.5217500329017639, + "learning_rate": 9.454688962614293e-05, + "loss": 1.7717, + "step": 2947 + }, + { + "epoch": 0.16431637032495403, + "grad_norm": 0.5416474938392639, + "learning_rate": 9.45428452801912e-05, + "loss": 1.8829, + "step": 2948 + }, + { + "epoch": 0.16437210857811715, + "grad_norm": 0.5558078289031982, + "learning_rate": 9.453879952160013e-05, + "loss": 1.8933, + "step": 2949 + }, + { + "epoch": 0.1644278468312803, + "grad_norm": 0.5439289808273315, + "learning_rate": 9.4534752350498e-05, + "loss": 1.6009, + "step": 2950 + }, + { + "epoch": 0.16448358508444344, + "grad_norm": 0.5921631455421448, + "learning_rate": 9.45307037670132e-05, + "loss": 1.9932, + "step": 2951 + }, + { + "epoch": 0.1645393233376066, + "grad_norm": 0.5491567850112915, + "learning_rate": 9.452665377127409e-05, + "loss": 1.9729, + "step": 2952 + }, + { + "epoch": 0.16459506159076975, + "grad_norm": 0.6129978895187378, + "learning_rate": 9.452260236340915e-05, + "loss": 1.8995, + "step": 2953 + }, + { + "epoch": 0.16465079984393288, + "grad_norm": 0.6029583215713501, + "learning_rate": 9.451854954354684e-05, + "loss": 1.8313, + "step": 2954 + }, + { + "epoch": 0.16470653809709604, + "grad_norm": 0.5197410583496094, + "learning_rate": 9.451449531181572e-05, + "loss": 1.6307, + "step": 2955 + }, + { + "epoch": 0.1647622763502592, + "grad_norm": 0.5214848518371582, + "learning_rate": 9.451043966834431e-05, + "loss": 1.7253, + "step": 2956 + }, + { + "epoch": 0.16481801460342232, + "grad_norm": 0.48953381180763245, + "learning_rate": 9.450638261326128e-05, + "loss": 1.5122, + "step": 2957 + }, + { + "epoch": 0.16487375285658548, + "grad_norm": 0.5038783550262451, + "learning_rate": 9.450232414669528e-05, + "loss": 1.7602, + "step": 2958 + }, + { + "epoch": 0.1649294911097486, + "grad_norm": 0.5723398327827454, + "learning_rate": 9.449826426877504e-05, + "loss": 1.9841, + "step": 2959 + }, + { + "epoch": 0.16498522936291177, + "grad_norm": 0.5200619101524353, + "learning_rate": 9.44942029796293e-05, + "loss": 1.7965, + "step": 2960 + }, + { + "epoch": 0.16504096761607492, + "grad_norm": 0.6376471519470215, + "learning_rate": 9.449014027938685e-05, + "loss": 2.1267, + "step": 2961 + }, + { + "epoch": 0.16509670586923805, + "grad_norm": 0.5397600531578064, + "learning_rate": 9.448607616817655e-05, + "loss": 1.7952, + "step": 2962 + }, + { + "epoch": 0.1651524441224012, + "grad_norm": 0.5907739996910095, + "learning_rate": 9.448201064612728e-05, + "loss": 1.8026, + "step": 2963 + }, + { + "epoch": 0.16520818237556434, + "grad_norm": 0.5700837969779968, + "learning_rate": 9.447794371336799e-05, + "loss": 2.1377, + "step": 2964 + }, + { + "epoch": 0.1652639206287275, + "grad_norm": 0.5404232740402222, + "learning_rate": 9.447387537002765e-05, + "loss": 1.9586, + "step": 2965 + }, + { + "epoch": 0.16531965888189065, + "grad_norm": 0.5181935429573059, + "learning_rate": 9.446980561623527e-05, + "loss": 1.4828, + "step": 2966 + }, + { + "epoch": 0.16537539713505378, + "grad_norm": 0.6044127941131592, + "learning_rate": 9.446573445211994e-05, + "loss": 1.789, + "step": 2967 + }, + { + "epoch": 0.16543113538821694, + "grad_norm": 0.5353678464889526, + "learning_rate": 9.446166187781077e-05, + "loss": 1.709, + "step": 2968 + }, + { + "epoch": 0.16548687364138007, + "grad_norm": 0.5155282020568848, + "learning_rate": 9.445758789343691e-05, + "loss": 1.6335, + "step": 2969 + }, + { + "epoch": 0.16554261189454322, + "grad_norm": 0.5247118473052979, + "learning_rate": 9.445351249912757e-05, + "loss": 1.6666, + "step": 2970 + }, + { + "epoch": 0.16559835014770638, + "grad_norm": 0.5768206119537354, + "learning_rate": 9.4449435695012e-05, + "loss": 1.9109, + "step": 2971 + }, + { + "epoch": 0.1656540884008695, + "grad_norm": 0.5591040849685669, + "learning_rate": 9.444535748121949e-05, + "loss": 1.781, + "step": 2972 + }, + { + "epoch": 0.16570982665403267, + "grad_norm": 0.5098216533660889, + "learning_rate": 9.444127785787938e-05, + "loss": 1.7213, + "step": 2973 + }, + { + "epoch": 0.1657655649071958, + "grad_norm": 0.5072734355926514, + "learning_rate": 9.443719682512102e-05, + "loss": 1.8224, + "step": 2974 + }, + { + "epoch": 0.16582130316035895, + "grad_norm": 0.5172891020774841, + "learning_rate": 9.443311438307389e-05, + "loss": 1.8449, + "step": 2975 + }, + { + "epoch": 0.1658770414135221, + "grad_norm": 0.557597815990448, + "learning_rate": 9.442903053186743e-05, + "loss": 1.6679, + "step": 2976 + }, + { + "epoch": 0.16593277966668524, + "grad_norm": 0.518157422542572, + "learning_rate": 9.442494527163115e-05, + "loss": 1.6812, + "step": 2977 + }, + { + "epoch": 0.1659885179198484, + "grad_norm": 0.5476084351539612, + "learning_rate": 9.442085860249461e-05, + "loss": 1.7849, + "step": 2978 + }, + { + "epoch": 0.16604425617301155, + "grad_norm": 0.5458279252052307, + "learning_rate": 9.441677052458745e-05, + "loss": 1.8582, + "step": 2979 + }, + { + "epoch": 0.16609999442617468, + "grad_norm": 0.592612624168396, + "learning_rate": 9.441268103803928e-05, + "loss": 2.0226, + "step": 2980 + }, + { + "epoch": 0.16615573267933784, + "grad_norm": 0.5498427748680115, + "learning_rate": 9.440859014297982e-05, + "loss": 1.577, + "step": 2981 + }, + { + "epoch": 0.16621147093250097, + "grad_norm": 0.5673382878303528, + "learning_rate": 9.440449783953883e-05, + "loss": 1.7272, + "step": 2982 + }, + { + "epoch": 0.16626720918566412, + "grad_norm": 0.565617024898529, + "learning_rate": 9.440040412784603e-05, + "loss": 1.7481, + "step": 2983 + }, + { + "epoch": 0.16632294743882728, + "grad_norm": 0.6157540678977966, + "learning_rate": 9.439630900803129e-05, + "loss": 1.9244, + "step": 2984 + }, + { + "epoch": 0.1663786856919904, + "grad_norm": 0.4916851818561554, + "learning_rate": 9.439221248022447e-05, + "loss": 1.5845, + "step": 2985 + }, + { + "epoch": 0.16643442394515356, + "grad_norm": 0.573154091835022, + "learning_rate": 9.43881145445555e-05, + "loss": 1.8841, + "step": 2986 + }, + { + "epoch": 0.1664901621983167, + "grad_norm": 0.5438728332519531, + "learning_rate": 9.438401520115434e-05, + "loss": 1.7537, + "step": 2987 + }, + { + "epoch": 0.16654590045147985, + "grad_norm": 0.5793212652206421, + "learning_rate": 9.4379914450151e-05, + "loss": 1.9331, + "step": 2988 + }, + { + "epoch": 0.166601638704643, + "grad_norm": 0.5194965600967407, + "learning_rate": 9.437581229167551e-05, + "loss": 1.5948, + "step": 2989 + }, + { + "epoch": 0.16665737695780614, + "grad_norm": 0.5872880816459656, + "learning_rate": 9.4371708725858e-05, + "loss": 1.7629, + "step": 2990 + }, + { + "epoch": 0.1667131152109693, + "grad_norm": 0.519842803478241, + "learning_rate": 9.436760375282859e-05, + "loss": 1.766, + "step": 2991 + }, + { + "epoch": 0.16676885346413242, + "grad_norm": 0.5351104736328125, + "learning_rate": 9.436349737271745e-05, + "loss": 1.8319, + "step": 2992 + }, + { + "epoch": 0.16682459171729558, + "grad_norm": 0.5584455728530884, + "learning_rate": 9.435938958565487e-05, + "loss": 1.7975, + "step": 2993 + }, + { + "epoch": 0.16688032997045874, + "grad_norm": 0.4804225564002991, + "learning_rate": 9.435528039177105e-05, + "loss": 1.7058, + "step": 2994 + }, + { + "epoch": 0.16693606822362186, + "grad_norm": 0.5311334133148193, + "learning_rate": 9.435116979119635e-05, + "loss": 1.7305, + "step": 2995 + }, + { + "epoch": 0.16699180647678502, + "grad_norm": 0.5292813777923584, + "learning_rate": 9.434705778406114e-05, + "loss": 1.6901, + "step": 2996 + }, + { + "epoch": 0.16704754472994815, + "grad_norm": 0.5105124711990356, + "learning_rate": 9.434294437049582e-05, + "loss": 1.7462, + "step": 2997 + }, + { + "epoch": 0.1671032829831113, + "grad_norm": 0.5604652762413025, + "learning_rate": 9.433882955063084e-05, + "loss": 1.7997, + "step": 2998 + }, + { + "epoch": 0.16715902123627446, + "grad_norm": 0.555237889289856, + "learning_rate": 9.43347133245967e-05, + "loss": 1.923, + "step": 2999 + }, + { + "epoch": 0.1672147594894376, + "grad_norm": 0.5382326245307922, + "learning_rate": 9.433059569252394e-05, + "loss": 1.7263, + "step": 3000 + }, + { + "epoch": 0.16727049774260075, + "grad_norm": 0.6488143801689148, + "learning_rate": 9.432647665454315e-05, + "loss": 1.5881, + "step": 3001 + }, + { + "epoch": 0.1673262359957639, + "grad_norm": 0.55712890625, + "learning_rate": 9.432235621078497e-05, + "loss": 1.9409, + "step": 3002 + }, + { + "epoch": 0.16738197424892703, + "grad_norm": 0.5540611147880554, + "learning_rate": 9.431823436138005e-05, + "loss": 1.8471, + "step": 3003 + }, + { + "epoch": 0.1674377125020902, + "grad_norm": 0.5297248959541321, + "learning_rate": 9.431411110645915e-05, + "loss": 1.6844, + "step": 3004 + }, + { + "epoch": 0.16749345075525332, + "grad_norm": 0.5368382334709167, + "learning_rate": 9.4309986446153e-05, + "loss": 1.7333, + "step": 3005 + }, + { + "epoch": 0.16754918900841648, + "grad_norm": 0.5433456897735596, + "learning_rate": 9.430586038059244e-05, + "loss": 1.9837, + "step": 3006 + }, + { + "epoch": 0.16760492726157963, + "grad_norm": 0.5077199339866638, + "learning_rate": 9.430173290990829e-05, + "loss": 1.7391, + "step": 3007 + }, + { + "epoch": 0.16766066551474276, + "grad_norm": 0.49970632791519165, + "learning_rate": 9.429760403423148e-05, + "loss": 1.5325, + "step": 3008 + }, + { + "epoch": 0.16771640376790592, + "grad_norm": 0.5068593621253967, + "learning_rate": 9.429347375369295e-05, + "loss": 1.5849, + "step": 3009 + }, + { + "epoch": 0.16777214202106905, + "grad_norm": 0.5405229330062866, + "learning_rate": 9.428934206842365e-05, + "loss": 1.7995, + "step": 3010 + }, + { + "epoch": 0.1678278802742322, + "grad_norm": 0.5368816256523132, + "learning_rate": 9.428520897855469e-05, + "loss": 1.7941, + "step": 3011 + }, + { + "epoch": 0.16788361852739536, + "grad_norm": 0.5910351872444153, + "learning_rate": 9.428107448421708e-05, + "loss": 1.8987, + "step": 3012 + }, + { + "epoch": 0.1679393567805585, + "grad_norm": 0.5387074947357178, + "learning_rate": 9.427693858554196e-05, + "loss": 1.2377, + "step": 3013 + }, + { + "epoch": 0.16799509503372165, + "grad_norm": 0.5382748246192932, + "learning_rate": 9.42728012826605e-05, + "loss": 1.8915, + "step": 3014 + }, + { + "epoch": 0.16805083328688478, + "grad_norm": 0.5706035494804382, + "learning_rate": 9.426866257570391e-05, + "loss": 1.9298, + "step": 3015 + }, + { + "epoch": 0.16810657154004793, + "grad_norm": 0.517613410949707, + "learning_rate": 9.426452246480347e-05, + "loss": 1.6459, + "step": 3016 + }, + { + "epoch": 0.1681623097932111, + "grad_norm": 0.5248231291770935, + "learning_rate": 9.426038095009042e-05, + "loss": 1.8506, + "step": 3017 + }, + { + "epoch": 0.16821804804637422, + "grad_norm": 0.49280843138694763, + "learning_rate": 9.425623803169616e-05, + "loss": 1.5642, + "step": 3018 + }, + { + "epoch": 0.16827378629953738, + "grad_norm": 0.5404548048973083, + "learning_rate": 9.425209370975208e-05, + "loss": 1.7475, + "step": 3019 + }, + { + "epoch": 0.1683295245527005, + "grad_norm": 0.5196406245231628, + "learning_rate": 9.424794798438958e-05, + "loss": 1.8123, + "step": 3020 + }, + { + "epoch": 0.16838526280586366, + "grad_norm": 0.5767018795013428, + "learning_rate": 9.424380085574015e-05, + "loss": 1.9773, + "step": 3021 + }, + { + "epoch": 0.16844100105902682, + "grad_norm": 0.5589628219604492, + "learning_rate": 9.423965232393532e-05, + "loss": 1.8269, + "step": 3022 + }, + { + "epoch": 0.16849673931218995, + "grad_norm": 0.5162323117256165, + "learning_rate": 9.423550238910666e-05, + "loss": 1.7838, + "step": 3023 + }, + { + "epoch": 0.1685524775653531, + "grad_norm": 0.5301263332366943, + "learning_rate": 9.423135105138577e-05, + "loss": 1.7805, + "step": 3024 + }, + { + "epoch": 0.16860821581851626, + "grad_norm": 0.5383440256118774, + "learning_rate": 9.42271983109043e-05, + "loss": 1.8054, + "step": 3025 + }, + { + "epoch": 0.1686639540716794, + "grad_norm": 0.572410523891449, + "learning_rate": 9.422304416779397e-05, + "loss": 1.7666, + "step": 3026 + }, + { + "epoch": 0.16871969232484255, + "grad_norm": 0.5496928691864014, + "learning_rate": 9.421888862218651e-05, + "loss": 1.8725, + "step": 3027 + }, + { + "epoch": 0.16877543057800568, + "grad_norm": 0.5649563670158386, + "learning_rate": 9.421473167421373e-05, + "loss": 1.873, + "step": 3028 + }, + { + "epoch": 0.16883116883116883, + "grad_norm": 0.5560464262962341, + "learning_rate": 9.421057332400744e-05, + "loss": 1.6385, + "step": 3029 + }, + { + "epoch": 0.168886907084332, + "grad_norm": 0.5245364904403687, + "learning_rate": 9.420641357169954e-05, + "loss": 1.758, + "step": 3030 + }, + { + "epoch": 0.16894264533749512, + "grad_norm": 0.5251185297966003, + "learning_rate": 9.420225241742193e-05, + "loss": 1.829, + "step": 3031 + }, + { + "epoch": 0.16899838359065827, + "grad_norm": 0.5360503792762756, + "learning_rate": 9.419808986130661e-05, + "loss": 1.7447, + "step": 3032 + }, + { + "epoch": 0.1690541218438214, + "grad_norm": 0.579368531703949, + "learning_rate": 9.419392590348555e-05, + "loss": 1.7367, + "step": 3033 + }, + { + "epoch": 0.16910986009698456, + "grad_norm": 0.5943927764892578, + "learning_rate": 9.418976054409084e-05, + "loss": 1.8542, + "step": 3034 + }, + { + "epoch": 0.16916559835014772, + "grad_norm": 0.5310322642326355, + "learning_rate": 9.418559378325457e-05, + "loss": 1.5941, + "step": 3035 + }, + { + "epoch": 0.16922133660331085, + "grad_norm": 0.5201945304870605, + "learning_rate": 9.418142562110888e-05, + "loss": 1.6894, + "step": 3036 + }, + { + "epoch": 0.169277074856474, + "grad_norm": 0.49601128697395325, + "learning_rate": 9.417725605778598e-05, + "loss": 1.5647, + "step": 3037 + }, + { + "epoch": 0.16933281310963713, + "grad_norm": 0.5370486378669739, + "learning_rate": 9.417308509341806e-05, + "loss": 1.7843, + "step": 3038 + }, + { + "epoch": 0.1693885513628003, + "grad_norm": 0.5515000820159912, + "learning_rate": 9.416891272813747e-05, + "loss": 1.8156, + "step": 3039 + }, + { + "epoch": 0.16944428961596344, + "grad_norm": 0.5245648622512817, + "learning_rate": 9.416473896207645e-05, + "loss": 1.7029, + "step": 3040 + }, + { + "epoch": 0.16950002786912657, + "grad_norm": 0.6024215817451477, + "learning_rate": 9.416056379536744e-05, + "loss": 1.8892, + "step": 3041 + }, + { + "epoch": 0.16955576612228973, + "grad_norm": 0.5456023812294006, + "learning_rate": 9.415638722814279e-05, + "loss": 1.7344, + "step": 3042 + }, + { + "epoch": 0.16961150437545286, + "grad_norm": 0.47283026576042175, + "learning_rate": 9.415220926053501e-05, + "loss": 1.4281, + "step": 3043 + }, + { + "epoch": 0.16966724262861602, + "grad_norm": 0.5906921029090881, + "learning_rate": 9.414802989267657e-05, + "loss": 1.772, + "step": 3044 + }, + { + "epoch": 0.16972298088177917, + "grad_norm": 0.5549463033676147, + "learning_rate": 9.414384912470002e-05, + "loss": 1.6814, + "step": 3045 + }, + { + "epoch": 0.1697787191349423, + "grad_norm": 0.5007080435752869, + "learning_rate": 9.413966695673795e-05, + "loss": 1.7041, + "step": 3046 + }, + { + "epoch": 0.16983445738810546, + "grad_norm": 0.5527877807617188, + "learning_rate": 9.413548338892301e-05, + "loss": 1.8597, + "step": 3047 + }, + { + "epoch": 0.16989019564126862, + "grad_norm": 0.5755193829536438, + "learning_rate": 9.413129842138786e-05, + "loss": 2.115, + "step": 3048 + }, + { + "epoch": 0.16994593389443174, + "grad_norm": 0.5897433161735535, + "learning_rate": 9.412711205426521e-05, + "loss": 1.5559, + "step": 3049 + }, + { + "epoch": 0.1700016721475949, + "grad_norm": 0.5253439545631409, + "learning_rate": 9.412292428768787e-05, + "loss": 1.8423, + "step": 3050 + }, + { + "epoch": 0.17005741040075803, + "grad_norm": 0.5220539569854736, + "learning_rate": 9.411873512178862e-05, + "loss": 1.6792, + "step": 3051 + }, + { + "epoch": 0.1701131486539212, + "grad_norm": 0.5669887661933899, + "learning_rate": 9.41145445567003e-05, + "loss": 1.8432, + "step": 3052 + }, + { + "epoch": 0.17016888690708434, + "grad_norm": 0.5661007761955261, + "learning_rate": 9.411035259255585e-05, + "loss": 1.9316, + "step": 3053 + }, + { + "epoch": 0.17022462516024747, + "grad_norm": 0.5614895820617676, + "learning_rate": 9.41061592294882e-05, + "loss": 1.8668, + "step": 3054 + }, + { + "epoch": 0.17028036341341063, + "grad_norm": 0.541671872138977, + "learning_rate": 9.410196446763034e-05, + "loss": 1.9025, + "step": 3055 + }, + { + "epoch": 0.17033610166657376, + "grad_norm": 0.54454106092453, + "learning_rate": 9.409776830711528e-05, + "loss": 1.7351, + "step": 3056 + }, + { + "epoch": 0.17039183991973691, + "grad_norm": 0.581135094165802, + "learning_rate": 9.409357074807612e-05, + "loss": 2.0981, + "step": 3057 + }, + { + "epoch": 0.17044757817290007, + "grad_norm": 0.5024539232254028, + "learning_rate": 9.4089371790646e-05, + "loss": 1.74, + "step": 3058 + }, + { + "epoch": 0.1705033164260632, + "grad_norm": 0.527542233467102, + "learning_rate": 9.408517143495806e-05, + "loss": 1.7409, + "step": 3059 + }, + { + "epoch": 0.17055905467922636, + "grad_norm": 0.5976712107658386, + "learning_rate": 9.40809696811455e-05, + "loss": 1.6624, + "step": 3060 + }, + { + "epoch": 0.1706147929323895, + "grad_norm": 0.5328633785247803, + "learning_rate": 9.40767665293416e-05, + "loss": 1.7723, + "step": 3061 + }, + { + "epoch": 0.17067053118555264, + "grad_norm": 0.5550236701965332, + "learning_rate": 9.407256197967965e-05, + "loss": 1.771, + "step": 3062 + }, + { + "epoch": 0.1707262694387158, + "grad_norm": 0.5482365489006042, + "learning_rate": 9.4068356032293e-05, + "loss": 1.5427, + "step": 3063 + }, + { + "epoch": 0.17078200769187893, + "grad_norm": 0.5379420518875122, + "learning_rate": 9.406414868731502e-05, + "loss": 1.7884, + "step": 3064 + }, + { + "epoch": 0.17083774594504209, + "grad_norm": 0.5322206020355225, + "learning_rate": 9.405993994487917e-05, + "loss": 1.7756, + "step": 3065 + }, + { + "epoch": 0.17089348419820521, + "grad_norm": 0.5303000807762146, + "learning_rate": 9.40557298051189e-05, + "loss": 1.7589, + "step": 3066 + }, + { + "epoch": 0.17094922245136837, + "grad_norm": 0.5660407543182373, + "learning_rate": 9.405151826816776e-05, + "loss": 1.7427, + "step": 3067 + }, + { + "epoch": 0.17100496070453153, + "grad_norm": 0.5341696739196777, + "learning_rate": 9.404730533415929e-05, + "loss": 1.8757, + "step": 3068 + }, + { + "epoch": 0.17106069895769466, + "grad_norm": 0.533214271068573, + "learning_rate": 9.40430910032271e-05, + "loss": 1.8219, + "step": 3069 + }, + { + "epoch": 0.1711164372108578, + "grad_norm": 0.6056374311447144, + "learning_rate": 9.403887527550486e-05, + "loss": 1.9808, + "step": 3070 + }, + { + "epoch": 0.17117217546402097, + "grad_norm": 0.5189699530601501, + "learning_rate": 9.403465815112626e-05, + "loss": 1.6841, + "step": 3071 + }, + { + "epoch": 0.1712279137171841, + "grad_norm": 0.5255261659622192, + "learning_rate": 9.403043963022505e-05, + "loss": 1.5559, + "step": 3072 + }, + { + "epoch": 0.17128365197034726, + "grad_norm": 0.8432055115699768, + "learning_rate": 9.4026219712935e-05, + "loss": 1.8316, + "step": 3073 + }, + { + "epoch": 0.17133939022351038, + "grad_norm": 0.5276064276695251, + "learning_rate": 9.402199839938996e-05, + "loss": 1.678, + "step": 3074 + }, + { + "epoch": 0.17139512847667354, + "grad_norm": 0.5075768232345581, + "learning_rate": 9.401777568972379e-05, + "loss": 1.5931, + "step": 3075 + }, + { + "epoch": 0.1714508667298367, + "grad_norm": 0.5471227169036865, + "learning_rate": 9.401355158407042e-05, + "loss": 1.8761, + "step": 3076 + }, + { + "epoch": 0.17150660498299983, + "grad_norm": 0.5062270760536194, + "learning_rate": 9.400932608256381e-05, + "loss": 1.6682, + "step": 3077 + }, + { + "epoch": 0.17156234323616298, + "grad_norm": 0.5492522716522217, + "learning_rate": 9.400509918533798e-05, + "loss": 1.6889, + "step": 3078 + }, + { + "epoch": 0.1716180814893261, + "grad_norm": 0.5703136324882507, + "learning_rate": 9.400087089252695e-05, + "loss": 1.6925, + "step": 3079 + }, + { + "epoch": 0.17167381974248927, + "grad_norm": 0.5027966499328613, + "learning_rate": 9.399664120426484e-05, + "loss": 1.4425, + "step": 3080 + }, + { + "epoch": 0.17172955799565243, + "grad_norm": 0.558413028717041, + "learning_rate": 9.39924101206858e-05, + "loss": 1.6485, + "step": 3081 + }, + { + "epoch": 0.17178529624881556, + "grad_norm": 0.6047654151916504, + "learning_rate": 9.3988177641924e-05, + "loss": 1.835, + "step": 3082 + }, + { + "epoch": 0.1718410345019787, + "grad_norm": 0.5760734677314758, + "learning_rate": 9.398394376811368e-05, + "loss": 1.7104, + "step": 3083 + }, + { + "epoch": 0.17189677275514184, + "grad_norm": 0.5076540112495422, + "learning_rate": 9.397970849938911e-05, + "loss": 1.5808, + "step": 3084 + }, + { + "epoch": 0.171952511008305, + "grad_norm": 0.5645167827606201, + "learning_rate": 9.39754718358846e-05, + "loss": 1.771, + "step": 3085 + }, + { + "epoch": 0.17200824926146815, + "grad_norm": 0.5443428158760071, + "learning_rate": 9.397123377773451e-05, + "loss": 1.8713, + "step": 3086 + }, + { + "epoch": 0.17206398751463128, + "grad_norm": 0.513888418674469, + "learning_rate": 9.396699432507325e-05, + "loss": 1.5279, + "step": 3087 + }, + { + "epoch": 0.17211972576779444, + "grad_norm": 0.5408303141593933, + "learning_rate": 9.396275347803529e-05, + "loss": 1.8924, + "step": 3088 + }, + { + "epoch": 0.1721754640209576, + "grad_norm": 0.5284982323646545, + "learning_rate": 9.395851123675512e-05, + "loss": 1.7562, + "step": 3089 + }, + { + "epoch": 0.17223120227412073, + "grad_norm": 0.5364746451377869, + "learning_rate": 9.395426760136726e-05, + "loss": 1.599, + "step": 3090 + }, + { + "epoch": 0.17228694052728388, + "grad_norm": 0.5527182817459106, + "learning_rate": 9.39500225720063e-05, + "loss": 1.7657, + "step": 3091 + }, + { + "epoch": 0.172342678780447, + "grad_norm": 0.5294612646102905, + "learning_rate": 9.394577614880687e-05, + "loss": 1.684, + "step": 3092 + }, + { + "epoch": 0.17239841703361017, + "grad_norm": 0.5614673495292664, + "learning_rate": 9.394152833190364e-05, + "loss": 1.8619, + "step": 3093 + }, + { + "epoch": 0.17245415528677333, + "grad_norm": 0.5280752182006836, + "learning_rate": 9.393727912143134e-05, + "loss": 1.6454, + "step": 3094 + }, + { + "epoch": 0.17250989353993645, + "grad_norm": 0.5236919522285461, + "learning_rate": 9.39330285175247e-05, + "loss": 1.6498, + "step": 3095 + }, + { + "epoch": 0.1725656317930996, + "grad_norm": 0.5192380547523499, + "learning_rate": 9.392877652031855e-05, + "loss": 1.8345, + "step": 3096 + }, + { + "epoch": 0.17262137004626274, + "grad_norm": 0.5223302841186523, + "learning_rate": 9.392452312994773e-05, + "loss": 1.5056, + "step": 3097 + }, + { + "epoch": 0.1726771082994259, + "grad_norm": 0.5231219530105591, + "learning_rate": 9.392026834654714e-05, + "loss": 1.5868, + "step": 3098 + }, + { + "epoch": 0.17273284655258905, + "grad_norm": 0.5619219541549683, + "learning_rate": 9.39160121702517e-05, + "loss": 1.8988, + "step": 3099 + }, + { + "epoch": 0.17278858480575218, + "grad_norm": 0.5591604709625244, + "learning_rate": 9.391175460119642e-05, + "loss": 1.7228, + "step": 3100 + }, + { + "epoch": 0.17284432305891534, + "grad_norm": 0.5290101766586304, + "learning_rate": 9.39074956395163e-05, + "loss": 1.6436, + "step": 3101 + }, + { + "epoch": 0.17290006131207847, + "grad_norm": 0.5596829056739807, + "learning_rate": 9.390323528534641e-05, + "loss": 1.498, + "step": 3102 + }, + { + "epoch": 0.17295579956524162, + "grad_norm": 0.5178213119506836, + "learning_rate": 9.389897353882188e-05, + "loss": 1.6834, + "step": 3103 + }, + { + "epoch": 0.17301153781840478, + "grad_norm": 0.46845757961273193, + "learning_rate": 9.389471040007784e-05, + "loss": 1.4012, + "step": 3104 + }, + { + "epoch": 0.1730672760715679, + "grad_norm": 0.5671401619911194, + "learning_rate": 9.389044586924953e-05, + "loss": 1.7005, + "step": 3105 + }, + { + "epoch": 0.17312301432473107, + "grad_norm": 0.5250539779663086, + "learning_rate": 9.388617994647218e-05, + "loss": 1.6934, + "step": 3106 + }, + { + "epoch": 0.1731787525778942, + "grad_norm": 0.5091891884803772, + "learning_rate": 9.388191263188107e-05, + "loss": 1.5041, + "step": 3107 + }, + { + "epoch": 0.17323449083105735, + "grad_norm": 0.5298328995704651, + "learning_rate": 9.387764392561153e-05, + "loss": 1.6184, + "step": 3108 + }, + { + "epoch": 0.1732902290842205, + "grad_norm": 0.5605019330978394, + "learning_rate": 9.387337382779894e-05, + "loss": 1.8302, + "step": 3109 + }, + { + "epoch": 0.17334596733738364, + "grad_norm": 0.554153561592102, + "learning_rate": 9.386910233857875e-05, + "loss": 1.6565, + "step": 3110 + }, + { + "epoch": 0.1734017055905468, + "grad_norm": 0.5952569246292114, + "learning_rate": 9.386482945808641e-05, + "loss": 1.5957, + "step": 3111 + }, + { + "epoch": 0.17345744384370995, + "grad_norm": 0.6842632293701172, + "learning_rate": 9.386055518645742e-05, + "loss": 1.7147, + "step": 3112 + }, + { + "epoch": 0.17351318209687308, + "grad_norm": 0.6011619567871094, + "learning_rate": 9.385627952382736e-05, + "loss": 2.0401, + "step": 3113 + }, + { + "epoch": 0.17356892035003624, + "grad_norm": 0.5976441502571106, + "learning_rate": 9.38520024703318e-05, + "loss": 1.9242, + "step": 3114 + }, + { + "epoch": 0.17362465860319937, + "grad_norm": 0.4991317689418793, + "learning_rate": 9.38477240261064e-05, + "loss": 1.689, + "step": 3115 + }, + { + "epoch": 0.17368039685636252, + "grad_norm": 0.5823774337768555, + "learning_rate": 9.384344419128684e-05, + "loss": 1.7896, + "step": 3116 + }, + { + "epoch": 0.17373613510952568, + "grad_norm": 0.584511399269104, + "learning_rate": 9.383916296600886e-05, + "loss": 1.7828, + "step": 3117 + }, + { + "epoch": 0.1737918733626888, + "grad_norm": 0.5839495062828064, + "learning_rate": 9.383488035040821e-05, + "loss": 1.9487, + "step": 3118 + }, + { + "epoch": 0.17384761161585197, + "grad_norm": 0.5381820201873779, + "learning_rate": 9.383059634462077e-05, + "loss": 1.6792, + "step": 3119 + }, + { + "epoch": 0.1739033498690151, + "grad_norm": 0.5147883892059326, + "learning_rate": 9.382631094878234e-05, + "loss": 1.6627, + "step": 3120 + }, + { + "epoch": 0.17395908812217825, + "grad_norm": 0.6467978358268738, + "learning_rate": 9.382202416302885e-05, + "loss": 1.7446, + "step": 3121 + }, + { + "epoch": 0.1740148263753414, + "grad_norm": 0.5035672187805176, + "learning_rate": 9.381773598749626e-05, + "loss": 1.6078, + "step": 3122 + }, + { + "epoch": 0.17407056462850454, + "grad_norm": 0.5837130546569824, + "learning_rate": 9.381344642232056e-05, + "loss": 1.792, + "step": 3123 + }, + { + "epoch": 0.1741263028816677, + "grad_norm": 0.5331088900566101, + "learning_rate": 9.380915546763778e-05, + "loss": 1.788, + "step": 3124 + }, + { + "epoch": 0.17418204113483082, + "grad_norm": 0.5427802801132202, + "learning_rate": 9.380486312358402e-05, + "loss": 1.8515, + "step": 3125 + }, + { + "epoch": 0.17423777938799398, + "grad_norm": 0.4916117489337921, + "learning_rate": 9.380056939029541e-05, + "loss": 1.5184, + "step": 3126 + }, + { + "epoch": 0.17429351764115714, + "grad_norm": 0.559158980846405, + "learning_rate": 9.379627426790812e-05, + "loss": 1.8659, + "step": 3127 + }, + { + "epoch": 0.17434925589432027, + "grad_norm": 0.5941457152366638, + "learning_rate": 9.379197775655833e-05, + "loss": 1.7891, + "step": 3128 + }, + { + "epoch": 0.17440499414748342, + "grad_norm": 0.4794413447380066, + "learning_rate": 9.378767985638235e-05, + "loss": 1.4975, + "step": 3129 + }, + { + "epoch": 0.17446073240064655, + "grad_norm": 0.5934321284294128, + "learning_rate": 9.378338056751647e-05, + "loss": 1.9019, + "step": 3130 + }, + { + "epoch": 0.1745164706538097, + "grad_norm": 0.5290476679801941, + "learning_rate": 9.377907989009702e-05, + "loss": 1.7563, + "step": 3131 + }, + { + "epoch": 0.17457220890697286, + "grad_norm": 0.5909081101417542, + "learning_rate": 9.37747778242604e-05, + "loss": 2.009, + "step": 3132 + }, + { + "epoch": 0.174627947160136, + "grad_norm": 0.5411567687988281, + "learning_rate": 9.377047437014308e-05, + "loss": 1.8264, + "step": 3133 + }, + { + "epoch": 0.17468368541329915, + "grad_norm": 0.5046765208244324, + "learning_rate": 9.376616952788149e-05, + "loss": 1.6131, + "step": 3134 + }, + { + "epoch": 0.1747394236664623, + "grad_norm": 0.528154194355011, + "learning_rate": 9.376186329761219e-05, + "loss": 1.7159, + "step": 3135 + }, + { + "epoch": 0.17479516191962544, + "grad_norm": 0.5536481142044067, + "learning_rate": 9.375755567947173e-05, + "loss": 1.5203, + "step": 3136 + }, + { + "epoch": 0.1748509001727886, + "grad_norm": 0.5683685541152954, + "learning_rate": 9.375324667359673e-05, + "loss": 1.7154, + "step": 3137 + }, + { + "epoch": 0.17490663842595172, + "grad_norm": 0.4969169497489929, + "learning_rate": 9.374893628012384e-05, + "loss": 1.7277, + "step": 3138 + }, + { + "epoch": 0.17496237667911488, + "grad_norm": 0.548058032989502, + "learning_rate": 9.374462449918976e-05, + "loss": 1.7931, + "step": 3139 + }, + { + "epoch": 0.17501811493227803, + "grad_norm": 0.5391299724578857, + "learning_rate": 9.374031133093124e-05, + "loss": 1.8076, + "step": 3140 + }, + { + "epoch": 0.17507385318544116, + "grad_norm": 0.5356679558753967, + "learning_rate": 9.373599677548508e-05, + "loss": 1.7212, + "step": 3141 + }, + { + "epoch": 0.17512959143860432, + "grad_norm": 0.5841724276542664, + "learning_rate": 9.373168083298809e-05, + "loss": 1.9175, + "step": 3142 + }, + { + "epoch": 0.17518532969176745, + "grad_norm": 0.5568740963935852, + "learning_rate": 9.372736350357717e-05, + "loss": 1.842, + "step": 3143 + }, + { + "epoch": 0.1752410679449306, + "grad_norm": 0.5539031028747559, + "learning_rate": 9.372304478738922e-05, + "loss": 1.8881, + "step": 3144 + }, + { + "epoch": 0.17529680619809376, + "grad_norm": 0.5519389510154724, + "learning_rate": 9.371872468456122e-05, + "loss": 1.7381, + "step": 3145 + }, + { + "epoch": 0.1753525444512569, + "grad_norm": 0.5324805378913879, + "learning_rate": 9.371440319523016e-05, + "loss": 1.745, + "step": 3146 + }, + { + "epoch": 0.17540828270442005, + "grad_norm": 0.5449910759925842, + "learning_rate": 9.37100803195331e-05, + "loss": 1.8071, + "step": 3147 + }, + { + "epoch": 0.17546402095758318, + "grad_norm": 0.5846375823020935, + "learning_rate": 9.370575605760716e-05, + "loss": 1.8659, + "step": 3148 + }, + { + "epoch": 0.17551975921074633, + "grad_norm": 0.4958127737045288, + "learning_rate": 9.370143040958943e-05, + "loss": 1.5791, + "step": 3149 + }, + { + "epoch": 0.1755754974639095, + "grad_norm": 0.5119603276252747, + "learning_rate": 9.369710337561714e-05, + "loss": 1.7657, + "step": 3150 + }, + { + "epoch": 0.17563123571707262, + "grad_norm": 0.5698620080947876, + "learning_rate": 9.36927749558275e-05, + "loss": 2.0541, + "step": 3151 + }, + { + "epoch": 0.17568697397023578, + "grad_norm": 0.5704925656318665, + "learning_rate": 9.368844515035779e-05, + "loss": 1.762, + "step": 3152 + }, + { + "epoch": 0.1757427122233989, + "grad_norm": 0.5676224231719971, + "learning_rate": 9.368411395934533e-05, + "loss": 1.5928, + "step": 3153 + }, + { + "epoch": 0.17579845047656206, + "grad_norm": 0.5878868699073792, + "learning_rate": 9.367978138292747e-05, + "loss": 1.9292, + "step": 3154 + }, + { + "epoch": 0.17585418872972522, + "grad_norm": 0.5323675274848938, + "learning_rate": 9.36754474212416e-05, + "loss": 1.8832, + "step": 3155 + }, + { + "epoch": 0.17590992698288835, + "grad_norm": 0.49846091866493225, + "learning_rate": 9.36711120744252e-05, + "loss": 1.4679, + "step": 3156 + }, + { + "epoch": 0.1759656652360515, + "grad_norm": 0.5483475923538208, + "learning_rate": 9.366677534261572e-05, + "loss": 1.7744, + "step": 3157 + }, + { + "epoch": 0.17602140348921466, + "grad_norm": 0.5628114938735962, + "learning_rate": 9.366243722595074e-05, + "loss": 1.8169, + "step": 3158 + }, + { + "epoch": 0.1760771417423778, + "grad_norm": 0.5500927567481995, + "learning_rate": 9.365809772456782e-05, + "loss": 1.8714, + "step": 3159 + }, + { + "epoch": 0.17613287999554095, + "grad_norm": 0.5269673466682434, + "learning_rate": 9.365375683860458e-05, + "loss": 1.797, + "step": 3160 + }, + { + "epoch": 0.17618861824870408, + "grad_norm": 0.5542075037956238, + "learning_rate": 9.36494145681987e-05, + "loss": 1.8027, + "step": 3161 + }, + { + "epoch": 0.17624435650186723, + "grad_norm": 0.5421326756477356, + "learning_rate": 9.364507091348788e-05, + "loss": 1.7254, + "step": 3162 + }, + { + "epoch": 0.1763000947550304, + "grad_norm": 0.4979914128780365, + "learning_rate": 9.364072587460988e-05, + "loss": 1.7505, + "step": 3163 + }, + { + "epoch": 0.17635583300819352, + "grad_norm": 0.5363655686378479, + "learning_rate": 9.363637945170249e-05, + "loss": 1.7651, + "step": 3164 + }, + { + "epoch": 0.17641157126135668, + "grad_norm": 0.5159875750541687, + "learning_rate": 9.363203164490356e-05, + "loss": 1.7096, + "step": 3165 + }, + { + "epoch": 0.1764673095145198, + "grad_norm": 0.590908408164978, + "learning_rate": 9.362768245435098e-05, + "loss": 2.0557, + "step": 3166 + }, + { + "epoch": 0.17652304776768296, + "grad_norm": 0.5476133823394775, + "learning_rate": 9.362333188018269e-05, + "loss": 1.6362, + "step": 3167 + }, + { + "epoch": 0.17657878602084612, + "grad_norm": 0.5187797546386719, + "learning_rate": 9.361897992253665e-05, + "loss": 1.6019, + "step": 3168 + }, + { + "epoch": 0.17663452427400925, + "grad_norm": 0.5152827501296997, + "learning_rate": 9.361462658155089e-05, + "loss": 1.7042, + "step": 3169 + }, + { + "epoch": 0.1766902625271724, + "grad_norm": 0.5961150527000427, + "learning_rate": 9.361027185736346e-05, + "loss": 1.7224, + "step": 3170 + }, + { + "epoch": 0.17674600078033553, + "grad_norm": 0.5234068632125854, + "learning_rate": 9.360591575011245e-05, + "loss": 1.6534, + "step": 3171 + }, + { + "epoch": 0.1768017390334987, + "grad_norm": 0.5417289137840271, + "learning_rate": 9.360155825993607e-05, + "loss": 1.8964, + "step": 3172 + }, + { + "epoch": 0.17685747728666185, + "grad_norm": 0.535892903804779, + "learning_rate": 9.359719938697246e-05, + "loss": 1.7262, + "step": 3173 + }, + { + "epoch": 0.17691321553982498, + "grad_norm": 0.5440612435340881, + "learning_rate": 9.359283913135988e-05, + "loss": 1.7775, + "step": 3174 + }, + { + "epoch": 0.17696895379298813, + "grad_norm": 0.6108183264732361, + "learning_rate": 9.358847749323659e-05, + "loss": 1.9756, + "step": 3175 + }, + { + "epoch": 0.17702469204615126, + "grad_norm": 0.5500672459602356, + "learning_rate": 9.358411447274094e-05, + "loss": 1.7427, + "step": 3176 + }, + { + "epoch": 0.17708043029931442, + "grad_norm": 0.5370178818702698, + "learning_rate": 9.357975007001129e-05, + "loss": 1.8076, + "step": 3177 + }, + { + "epoch": 0.17713616855247757, + "grad_norm": 0.5063850283622742, + "learning_rate": 9.357538428518607e-05, + "loss": 1.6826, + "step": 3178 + }, + { + "epoch": 0.1771919068056407, + "grad_norm": 0.5165611505508423, + "learning_rate": 9.357101711840372e-05, + "loss": 1.6545, + "step": 3179 + }, + { + "epoch": 0.17724764505880386, + "grad_norm": 0.521656334400177, + "learning_rate": 9.356664856980273e-05, + "loss": 1.4337, + "step": 3180 + }, + { + "epoch": 0.17730338331196702, + "grad_norm": 0.527341902256012, + "learning_rate": 9.356227863952168e-05, + "loss": 1.7241, + "step": 3181 + }, + { + "epoch": 0.17735912156513015, + "grad_norm": 0.494210422039032, + "learning_rate": 9.355790732769911e-05, + "loss": 1.5474, + "step": 3182 + }, + { + "epoch": 0.1774148598182933, + "grad_norm": 0.5171836614608765, + "learning_rate": 9.35535346344737e-05, + "loss": 1.5795, + "step": 3183 + }, + { + "epoch": 0.17747059807145643, + "grad_norm": 0.5571975111961365, + "learning_rate": 9.354916055998409e-05, + "loss": 1.8093, + "step": 3184 + }, + { + "epoch": 0.1775263363246196, + "grad_norm": 0.5996416807174683, + "learning_rate": 9.354478510436902e-05, + "loss": 2.0041, + "step": 3185 + }, + { + "epoch": 0.17758207457778274, + "grad_norm": 0.4972604811191559, + "learning_rate": 9.354040826776727e-05, + "loss": 1.7329, + "step": 3186 + }, + { + "epoch": 0.17763781283094587, + "grad_norm": 0.5599552392959595, + "learning_rate": 9.35360300503176e-05, + "loss": 1.7857, + "step": 3187 + }, + { + "epoch": 0.17769355108410903, + "grad_norm": 0.5476880669593811, + "learning_rate": 9.35316504521589e-05, + "loss": 1.7757, + "step": 3188 + }, + { + "epoch": 0.17774928933727216, + "grad_norm": 0.5362497568130493, + "learning_rate": 9.352726947343006e-05, + "loss": 1.7656, + "step": 3189 + }, + { + "epoch": 0.17780502759043532, + "grad_norm": 0.5269262194633484, + "learning_rate": 9.352288711427001e-05, + "loss": 1.7716, + "step": 3190 + }, + { + "epoch": 0.17786076584359847, + "grad_norm": 0.5733572244644165, + "learning_rate": 9.351850337481773e-05, + "loss": 1.7623, + "step": 3191 + }, + { + "epoch": 0.1779165040967616, + "grad_norm": 0.5491241812705994, + "learning_rate": 9.351411825521228e-05, + "loss": 1.7835, + "step": 3192 + }, + { + "epoch": 0.17797224234992476, + "grad_norm": 0.5553460121154785, + "learning_rate": 9.350973175559267e-05, + "loss": 1.9064, + "step": 3193 + }, + { + "epoch": 0.1780279806030879, + "grad_norm": 0.5257185101509094, + "learning_rate": 9.350534387609807e-05, + "loss": 1.7245, + "step": 3194 + }, + { + "epoch": 0.17808371885625104, + "grad_norm": 0.5201014876365662, + "learning_rate": 9.35009546168676e-05, + "loss": 1.6991, + "step": 3195 + }, + { + "epoch": 0.1781394571094142, + "grad_norm": 0.5365905165672302, + "learning_rate": 9.34965639780405e-05, + "loss": 1.747, + "step": 3196 + }, + { + "epoch": 0.17819519536257733, + "grad_norm": 0.5471792221069336, + "learning_rate": 9.349217195975598e-05, + "loss": 1.8114, + "step": 3197 + }, + { + "epoch": 0.1782509336157405, + "grad_norm": 0.5407313704490662, + "learning_rate": 9.348777856215334e-05, + "loss": 1.7719, + "step": 3198 + }, + { + "epoch": 0.17830667186890362, + "grad_norm": 0.5418484807014465, + "learning_rate": 9.348338378537192e-05, + "loss": 1.7989, + "step": 3199 + }, + { + "epoch": 0.17836241012206677, + "grad_norm": 0.5235376954078674, + "learning_rate": 9.347898762955109e-05, + "loss": 1.5998, + "step": 3200 + }, + { + "epoch": 0.17841814837522993, + "grad_norm": 0.5582895874977112, + "learning_rate": 9.347459009483028e-05, + "loss": 1.7352, + "step": 3201 + }, + { + "epoch": 0.17847388662839306, + "grad_norm": 0.5512102246284485, + "learning_rate": 9.347019118134893e-05, + "loss": 1.8595, + "step": 3202 + }, + { + "epoch": 0.17852962488155621, + "grad_norm": 0.5874474048614502, + "learning_rate": 9.346579088924658e-05, + "loss": 1.8312, + "step": 3203 + }, + { + "epoch": 0.17858536313471937, + "grad_norm": 0.5523637533187866, + "learning_rate": 9.346138921866276e-05, + "loss": 1.9124, + "step": 3204 + }, + { + "epoch": 0.1786411013878825, + "grad_norm": 0.5245184898376465, + "learning_rate": 9.345698616973707e-05, + "loss": 1.8279, + "step": 3205 + }, + { + "epoch": 0.17869683964104566, + "grad_norm": 0.5538264513015747, + "learning_rate": 9.345258174260915e-05, + "loss": 1.8218, + "step": 3206 + }, + { + "epoch": 0.1787525778942088, + "grad_norm": 0.5474498271942139, + "learning_rate": 9.344817593741868e-05, + "loss": 1.6772, + "step": 3207 + }, + { + "epoch": 0.17880831614737194, + "grad_norm": 0.5437337756156921, + "learning_rate": 9.344376875430539e-05, + "loss": 1.8402, + "step": 3208 + }, + { + "epoch": 0.1788640544005351, + "grad_norm": 0.6069798469543457, + "learning_rate": 9.343936019340906e-05, + "loss": 2.0245, + "step": 3209 + }, + { + "epoch": 0.17891979265369823, + "grad_norm": 0.5451731085777283, + "learning_rate": 9.343495025486948e-05, + "loss": 1.7243, + "step": 3210 + }, + { + "epoch": 0.17897553090686139, + "grad_norm": 0.5307853817939758, + "learning_rate": 9.343053893882654e-05, + "loss": 1.8062, + "step": 3211 + }, + { + "epoch": 0.17903126916002451, + "grad_norm": 0.5642760992050171, + "learning_rate": 9.34261262454201e-05, + "loss": 1.9111, + "step": 3212 + }, + { + "epoch": 0.17908700741318767, + "grad_norm": 0.5641029477119446, + "learning_rate": 9.342171217479014e-05, + "loss": 1.892, + "step": 3213 + }, + { + "epoch": 0.17914274566635083, + "grad_norm": 0.5118708610534668, + "learning_rate": 9.341729672707664e-05, + "loss": 1.7303, + "step": 3214 + }, + { + "epoch": 0.17919848391951396, + "grad_norm": 0.5048193335533142, + "learning_rate": 9.341287990241962e-05, + "loss": 1.5011, + "step": 3215 + }, + { + "epoch": 0.1792542221726771, + "grad_norm": 0.5508407950401306, + "learning_rate": 9.340846170095917e-05, + "loss": 1.8355, + "step": 3216 + }, + { + "epoch": 0.17930996042584024, + "grad_norm": 0.5779476165771484, + "learning_rate": 9.34040421228354e-05, + "loss": 1.8892, + "step": 3217 + }, + { + "epoch": 0.1793656986790034, + "grad_norm": 0.5211353898048401, + "learning_rate": 9.339962116818848e-05, + "loss": 1.6359, + "step": 3218 + }, + { + "epoch": 0.17942143693216656, + "grad_norm": 0.5479955077171326, + "learning_rate": 9.339519883715862e-05, + "loss": 1.7594, + "step": 3219 + }, + { + "epoch": 0.17947717518532968, + "grad_norm": 0.49651384353637695, + "learning_rate": 9.339077512988606e-05, + "loss": 1.5873, + "step": 3220 + }, + { + "epoch": 0.17953291343849284, + "grad_norm": 0.569810152053833, + "learning_rate": 9.338635004651108e-05, + "loss": 1.6675, + "step": 3221 + }, + { + "epoch": 0.17958865169165597, + "grad_norm": 0.5437332987785339, + "learning_rate": 9.338192358717406e-05, + "loss": 1.8268, + "step": 3222 + }, + { + "epoch": 0.17964438994481913, + "grad_norm": 0.5670780539512634, + "learning_rate": 9.337749575201535e-05, + "loss": 1.6647, + "step": 3223 + }, + { + "epoch": 0.17970012819798228, + "grad_norm": 0.5969633460044861, + "learning_rate": 9.337306654117538e-05, + "loss": 1.7202, + "step": 3224 + }, + { + "epoch": 0.1797558664511454, + "grad_norm": 0.48552221059799194, + "learning_rate": 9.336863595479462e-05, + "loss": 1.4645, + "step": 3225 + }, + { + "epoch": 0.17981160470430857, + "grad_norm": 0.5412662625312805, + "learning_rate": 9.33642039930136e-05, + "loss": 1.8443, + "step": 3226 + }, + { + "epoch": 0.17986734295747173, + "grad_norm": 0.5973519682884216, + "learning_rate": 9.335977065597285e-05, + "loss": 1.98, + "step": 3227 + }, + { + "epoch": 0.17992308121063486, + "grad_norm": 0.5288311243057251, + "learning_rate": 9.335533594381297e-05, + "loss": 1.5549, + "step": 3228 + }, + { + "epoch": 0.179978819463798, + "grad_norm": 0.5504105687141418, + "learning_rate": 9.335089985667463e-05, + "loss": 1.5479, + "step": 3229 + }, + { + "epoch": 0.18003455771696114, + "grad_norm": 0.4889037609100342, + "learning_rate": 9.334646239469848e-05, + "loss": 1.7899, + "step": 3230 + }, + { + "epoch": 0.1800902959701243, + "grad_norm": 0.5372660756111145, + "learning_rate": 9.334202355802528e-05, + "loss": 1.7351, + "step": 3231 + }, + { + "epoch": 0.18014603422328745, + "grad_norm": 0.5164480209350586, + "learning_rate": 9.333758334679581e-05, + "loss": 1.6461, + "step": 3232 + }, + { + "epoch": 0.18020177247645058, + "grad_norm": 0.539726972579956, + "learning_rate": 9.333314176115084e-05, + "loss": 1.6368, + "step": 3233 + }, + { + "epoch": 0.18025751072961374, + "grad_norm": 0.6785762310028076, + "learning_rate": 9.33286988012313e-05, + "loss": 2.0446, + "step": 3234 + }, + { + "epoch": 0.18031324898277687, + "grad_norm": 0.580847442150116, + "learning_rate": 9.332425446717803e-05, + "loss": 1.8455, + "step": 3235 + }, + { + "epoch": 0.18036898723594003, + "grad_norm": 0.5236613154411316, + "learning_rate": 9.331980875913202e-05, + "loss": 1.4925, + "step": 3236 + }, + { + "epoch": 0.18042472548910318, + "grad_norm": 0.5626049637794495, + "learning_rate": 9.331536167723423e-05, + "loss": 1.7695, + "step": 3237 + }, + { + "epoch": 0.1804804637422663, + "grad_norm": 0.5435861349105835, + "learning_rate": 9.331091322162573e-05, + "loss": 1.8594, + "step": 3238 + }, + { + "epoch": 0.18053620199542947, + "grad_norm": 0.5868507027626038, + "learning_rate": 9.330646339244759e-05, + "loss": 1.8194, + "step": 3239 + }, + { + "epoch": 0.1805919402485926, + "grad_norm": 0.5488845705986023, + "learning_rate": 9.330201218984092e-05, + "loss": 1.6584, + "step": 3240 + }, + { + "epoch": 0.18064767850175575, + "grad_norm": 0.5238907933235168, + "learning_rate": 9.329755961394688e-05, + "loss": 1.757, + "step": 3241 + }, + { + "epoch": 0.1807034167549189, + "grad_norm": 0.5120671987533569, + "learning_rate": 9.32931056649067e-05, + "loss": 1.6786, + "step": 3242 + }, + { + "epoch": 0.18075915500808204, + "grad_norm": 0.49454161524772644, + "learning_rate": 9.328865034286161e-05, + "loss": 1.457, + "step": 3243 + }, + { + "epoch": 0.1808148932612452, + "grad_norm": 0.5296444892883301, + "learning_rate": 9.328419364795295e-05, + "loss": 1.691, + "step": 3244 + }, + { + "epoch": 0.18087063151440833, + "grad_norm": 0.5104671120643616, + "learning_rate": 9.327973558032201e-05, + "loss": 1.6702, + "step": 3245 + }, + { + "epoch": 0.18092636976757148, + "grad_norm": 0.5683085322380066, + "learning_rate": 9.32752761401102e-05, + "loss": 1.6912, + "step": 3246 + }, + { + "epoch": 0.18098210802073464, + "grad_norm": 0.5360772609710693, + "learning_rate": 9.327081532745896e-05, + "loss": 1.7894, + "step": 3247 + }, + { + "epoch": 0.18103784627389777, + "grad_norm": 0.6272693872451782, + "learning_rate": 9.326635314250971e-05, + "loss": 2.0331, + "step": 3248 + }, + { + "epoch": 0.18109358452706092, + "grad_norm": 0.5494347810745239, + "learning_rate": 9.326188958540403e-05, + "loss": 1.8261, + "step": 3249 + }, + { + "epoch": 0.18114932278022408, + "grad_norm": 0.5473103523254395, + "learning_rate": 9.325742465628342e-05, + "loss": 1.5244, + "step": 3250 + }, + { + "epoch": 0.1812050610333872, + "grad_norm": 0.5626412034034729, + "learning_rate": 9.325295835528953e-05, + "loss": 1.8512, + "step": 3251 + }, + { + "epoch": 0.18126079928655037, + "grad_norm": 0.5165623426437378, + "learning_rate": 9.324849068256397e-05, + "loss": 1.8405, + "step": 3252 + }, + { + "epoch": 0.1813165375397135, + "grad_norm": 0.5183326601982117, + "learning_rate": 9.324402163824846e-05, + "loss": 1.7193, + "step": 3253 + }, + { + "epoch": 0.18137227579287665, + "grad_norm": 0.5188653469085693, + "learning_rate": 9.323955122248468e-05, + "loss": 1.6715, + "step": 3254 + }, + { + "epoch": 0.1814280140460398, + "grad_norm": 0.5316330194473267, + "learning_rate": 9.323507943541447e-05, + "loss": 1.5796, + "step": 3255 + }, + { + "epoch": 0.18148375229920294, + "grad_norm": 0.5456557869911194, + "learning_rate": 9.323060627717961e-05, + "loss": 1.7856, + "step": 3256 + }, + { + "epoch": 0.1815394905523661, + "grad_norm": 0.5671826004981995, + "learning_rate": 9.322613174792197e-05, + "loss": 1.7715, + "step": 3257 + }, + { + "epoch": 0.18159522880552922, + "grad_norm": 0.5530715584754944, + "learning_rate": 9.322165584778347e-05, + "loss": 1.9437, + "step": 3258 + }, + { + "epoch": 0.18165096705869238, + "grad_norm": 0.5097282528877258, + "learning_rate": 9.321717857690601e-05, + "loss": 1.5789, + "step": 3259 + }, + { + "epoch": 0.18170670531185554, + "grad_norm": 0.5106785297393799, + "learning_rate": 9.321269993543166e-05, + "loss": 1.7718, + "step": 3260 + }, + { + "epoch": 0.18176244356501867, + "grad_norm": 0.5174189209938049, + "learning_rate": 9.320821992350239e-05, + "loss": 1.6088, + "step": 3261 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 0.5284159779548645, + "learning_rate": 9.320373854126032e-05, + "loss": 1.6355, + "step": 3262 + }, + { + "epoch": 0.18187392007134495, + "grad_norm": 0.5431947708129883, + "learning_rate": 9.319925578884754e-05, + "loss": 1.8282, + "step": 3263 + }, + { + "epoch": 0.1819296583245081, + "grad_norm": 0.5244488716125488, + "learning_rate": 9.319477166640626e-05, + "loss": 1.8765, + "step": 3264 + }, + { + "epoch": 0.18198539657767127, + "grad_norm": 0.5338707566261292, + "learning_rate": 9.319028617407865e-05, + "loss": 1.7684, + "step": 3265 + }, + { + "epoch": 0.1820411348308344, + "grad_norm": 0.5929536819458008, + "learning_rate": 9.318579931200697e-05, + "loss": 1.9083, + "step": 3266 + }, + { + "epoch": 0.18209687308399755, + "grad_norm": 0.5214221477508545, + "learning_rate": 9.318131108033355e-05, + "loss": 1.6844, + "step": 3267 + }, + { + "epoch": 0.18215261133716068, + "grad_norm": 0.5370472073554993, + "learning_rate": 9.31768214792007e-05, + "loss": 1.9451, + "step": 3268 + }, + { + "epoch": 0.18220834959032384, + "grad_norm": 0.5181378722190857, + "learning_rate": 9.31723305087508e-05, + "loss": 1.7534, + "step": 3269 + }, + { + "epoch": 0.182264087843487, + "grad_norm": 0.5766522884368896, + "learning_rate": 9.316783816912629e-05, + "loss": 1.876, + "step": 3270 + }, + { + "epoch": 0.18231982609665012, + "grad_norm": 0.5224177241325378, + "learning_rate": 9.316334446046966e-05, + "loss": 1.7254, + "step": 3271 + }, + { + "epoch": 0.18237556434981328, + "grad_norm": 0.5871415138244629, + "learning_rate": 9.315884938292339e-05, + "loss": 1.6292, + "step": 3272 + }, + { + "epoch": 0.18243130260297644, + "grad_norm": 0.5917293429374695, + "learning_rate": 9.315435293663005e-05, + "loss": 2.0649, + "step": 3273 + }, + { + "epoch": 0.18248704085613957, + "grad_norm": 0.5843697190284729, + "learning_rate": 9.314985512173223e-05, + "loss": 1.8282, + "step": 3274 + }, + { + "epoch": 0.18254277910930272, + "grad_norm": 0.5423409938812256, + "learning_rate": 9.31453559383726e-05, + "loss": 1.7501, + "step": 3275 + }, + { + "epoch": 0.18259851736246585, + "grad_norm": 0.5610026717185974, + "learning_rate": 9.314085538669383e-05, + "loss": 1.7287, + "step": 3276 + }, + { + "epoch": 0.182654255615629, + "grad_norm": 0.5071337223052979, + "learning_rate": 9.313635346683865e-05, + "loss": 1.6779, + "step": 3277 + }, + { + "epoch": 0.18270999386879216, + "grad_norm": 0.5492652058601379, + "learning_rate": 9.313185017894985e-05, + "loss": 1.7884, + "step": 3278 + }, + { + "epoch": 0.1827657321219553, + "grad_norm": 0.4901118874549866, + "learning_rate": 9.312734552317023e-05, + "loss": 1.5747, + "step": 3279 + }, + { + "epoch": 0.18282147037511845, + "grad_norm": 0.515848696231842, + "learning_rate": 9.312283949964267e-05, + "loss": 1.4992, + "step": 3280 + }, + { + "epoch": 0.18287720862828158, + "grad_norm": 0.497324675321579, + "learning_rate": 9.311833210851007e-05, + "loss": 1.5226, + "step": 3281 + }, + { + "epoch": 0.18293294688144474, + "grad_norm": 0.5232150554656982, + "learning_rate": 9.311382334991536e-05, + "loss": 1.6106, + "step": 3282 + }, + { + "epoch": 0.1829886851346079, + "grad_norm": 0.6029054522514343, + "learning_rate": 9.310931322400156e-05, + "loss": 1.9531, + "step": 3283 + }, + { + "epoch": 0.18304442338777102, + "grad_norm": 0.70119708776474, + "learning_rate": 9.310480173091168e-05, + "loss": 1.9566, + "step": 3284 + }, + { + "epoch": 0.18310016164093418, + "grad_norm": 0.5252953767776489, + "learning_rate": 9.31002888707888e-05, + "loss": 1.8004, + "step": 3285 + }, + { + "epoch": 0.1831558998940973, + "grad_norm": 0.5744017958641052, + "learning_rate": 9.309577464377606e-05, + "loss": 1.8203, + "step": 3286 + }, + { + "epoch": 0.18321163814726046, + "grad_norm": 0.5286086797714233, + "learning_rate": 9.309125905001659e-05, + "loss": 1.8127, + "step": 3287 + }, + { + "epoch": 0.18326737640042362, + "grad_norm": 0.5180408954620361, + "learning_rate": 9.308674208965364e-05, + "loss": 1.5432, + "step": 3288 + }, + { + "epoch": 0.18332311465358675, + "grad_norm": 0.568420946598053, + "learning_rate": 9.308222376283045e-05, + "loss": 1.853, + "step": 3289 + }, + { + "epoch": 0.1833788529067499, + "grad_norm": 0.9352191090583801, + "learning_rate": 9.30777040696903e-05, + "loss": 1.531, + "step": 3290 + }, + { + "epoch": 0.18343459115991304, + "grad_norm": 0.5612093210220337, + "learning_rate": 9.307318301037656e-05, + "loss": 2.0149, + "step": 3291 + }, + { + "epoch": 0.1834903294130762, + "grad_norm": 0.5616469979286194, + "learning_rate": 9.306866058503257e-05, + "loss": 1.6388, + "step": 3292 + }, + { + "epoch": 0.18354606766623935, + "grad_norm": 0.5579656958580017, + "learning_rate": 9.306413679380177e-05, + "loss": 1.8719, + "step": 3293 + }, + { + "epoch": 0.18360180591940248, + "grad_norm": 0.5343957543373108, + "learning_rate": 9.305961163682764e-05, + "loss": 1.7592, + "step": 3294 + }, + { + "epoch": 0.18365754417256563, + "grad_norm": 0.5974972248077393, + "learning_rate": 9.305508511425367e-05, + "loss": 1.834, + "step": 3295 + }, + { + "epoch": 0.1837132824257288, + "grad_norm": 0.5827033519744873, + "learning_rate": 9.305055722622344e-05, + "loss": 1.8606, + "step": 3296 + }, + { + "epoch": 0.18376902067889192, + "grad_norm": 0.5568636059761047, + "learning_rate": 9.304602797288054e-05, + "loss": 1.8952, + "step": 3297 + }, + { + "epoch": 0.18382475893205508, + "grad_norm": 0.6066376566886902, + "learning_rate": 9.30414973543686e-05, + "loss": 1.9215, + "step": 3298 + }, + { + "epoch": 0.1838804971852182, + "grad_norm": 0.5111042261123657, + "learning_rate": 9.303696537083132e-05, + "loss": 1.5506, + "step": 3299 + }, + { + "epoch": 0.18393623543838136, + "grad_norm": 0.501711905002594, + "learning_rate": 9.303243202241242e-05, + "loss": 1.5003, + "step": 3300 + }, + { + "epoch": 0.18399197369154452, + "grad_norm": 0.543425977230072, + "learning_rate": 9.302789730925567e-05, + "loss": 1.5837, + "step": 3301 + }, + { + "epoch": 0.18404771194470765, + "grad_norm": 0.5619440674781799, + "learning_rate": 9.30233612315049e-05, + "loss": 1.8285, + "step": 3302 + }, + { + "epoch": 0.1841034501978708, + "grad_norm": 0.5294018387794495, + "learning_rate": 9.301882378930394e-05, + "loss": 1.6032, + "step": 3303 + }, + { + "epoch": 0.18415918845103393, + "grad_norm": 0.6101817488670349, + "learning_rate": 9.301428498279671e-05, + "loss": 1.9998, + "step": 3304 + }, + { + "epoch": 0.1842149267041971, + "grad_norm": 0.5133767127990723, + "learning_rate": 9.300974481212715e-05, + "loss": 1.6816, + "step": 3305 + }, + { + "epoch": 0.18427066495736025, + "grad_norm": 0.5289322137832642, + "learning_rate": 9.300520327743924e-05, + "loss": 1.4649, + "step": 3306 + }, + { + "epoch": 0.18432640321052338, + "grad_norm": 0.5560780763626099, + "learning_rate": 9.300066037887704e-05, + "loss": 1.6704, + "step": 3307 + }, + { + "epoch": 0.18438214146368653, + "grad_norm": 0.5855201482772827, + "learning_rate": 9.29961161165846e-05, + "loss": 1.9368, + "step": 3308 + }, + { + "epoch": 0.18443787971684966, + "grad_norm": 0.5227165818214417, + "learning_rate": 9.299157049070603e-05, + "loss": 1.663, + "step": 3309 + }, + { + "epoch": 0.18449361797001282, + "grad_norm": 0.555633008480072, + "learning_rate": 9.298702350138551e-05, + "loss": 1.6634, + "step": 3310 + }, + { + "epoch": 0.18454935622317598, + "grad_norm": 0.5284892916679382, + "learning_rate": 9.298247514876724e-05, + "loss": 1.7772, + "step": 3311 + }, + { + "epoch": 0.1846050944763391, + "grad_norm": 0.5455605983734131, + "learning_rate": 9.297792543299545e-05, + "loss": 1.7826, + "step": 3312 + }, + { + "epoch": 0.18466083272950226, + "grad_norm": 0.6630359292030334, + "learning_rate": 9.297337435421446e-05, + "loss": 2.0859, + "step": 3313 + }, + { + "epoch": 0.1847165709826654, + "grad_norm": 0.4958614408969879, + "learning_rate": 9.296882191256857e-05, + "loss": 1.6861, + "step": 3314 + }, + { + "epoch": 0.18477230923582855, + "grad_norm": 0.506952702999115, + "learning_rate": 9.29642681082022e-05, + "loss": 1.6616, + "step": 3315 + }, + { + "epoch": 0.1848280474889917, + "grad_norm": 0.5598859190940857, + "learning_rate": 9.295971294125973e-05, + "loss": 1.8831, + "step": 3316 + }, + { + "epoch": 0.18488378574215483, + "grad_norm": 0.5533158183097839, + "learning_rate": 9.295515641188563e-05, + "loss": 1.6373, + "step": 3317 + }, + { + "epoch": 0.184939523995318, + "grad_norm": 0.5264914035797119, + "learning_rate": 9.295059852022443e-05, + "loss": 1.6668, + "step": 3318 + }, + { + "epoch": 0.18499526224848115, + "grad_norm": 0.542248010635376, + "learning_rate": 9.294603926642064e-05, + "loss": 1.5566, + "step": 3319 + }, + { + "epoch": 0.18505100050164427, + "grad_norm": 0.5599246621131897, + "learning_rate": 9.294147865061891e-05, + "loss": 1.8183, + "step": 3320 + }, + { + "epoch": 0.18510673875480743, + "grad_norm": 0.48394709825515747, + "learning_rate": 9.293691667296382e-05, + "loss": 1.4792, + "step": 3321 + }, + { + "epoch": 0.18516247700797056, + "grad_norm": 0.5670637488365173, + "learning_rate": 9.293235333360009e-05, + "loss": 1.8202, + "step": 3322 + }, + { + "epoch": 0.18521821526113372, + "grad_norm": 0.5079344511032104, + "learning_rate": 9.29277886326724e-05, + "loss": 1.698, + "step": 3323 + }, + { + "epoch": 0.18527395351429687, + "grad_norm": 0.6303577423095703, + "learning_rate": 9.292322257032555e-05, + "loss": 1.8882, + "step": 3324 + }, + { + "epoch": 0.18532969176746, + "grad_norm": 0.5548877716064453, + "learning_rate": 9.291865514670435e-05, + "loss": 1.8684, + "step": 3325 + }, + { + "epoch": 0.18538543002062316, + "grad_norm": 0.5407868027687073, + "learning_rate": 9.291408636195364e-05, + "loss": 1.7726, + "step": 3326 + }, + { + "epoch": 0.1854411682737863, + "grad_norm": 0.5434556007385254, + "learning_rate": 9.29095162162183e-05, + "loss": 1.8152, + "step": 3327 + }, + { + "epoch": 0.18549690652694945, + "grad_norm": 0.5405827164649963, + "learning_rate": 9.290494470964332e-05, + "loss": 1.7364, + "step": 3328 + }, + { + "epoch": 0.1855526447801126, + "grad_norm": 0.4682316184043884, + "learning_rate": 9.290037184237362e-05, + "loss": 1.6331, + "step": 3329 + }, + { + "epoch": 0.18560838303327573, + "grad_norm": 0.5418784618377686, + "learning_rate": 9.289579761455426e-05, + "loss": 1.9186, + "step": 3330 + }, + { + "epoch": 0.1856641212864389, + "grad_norm": 0.6001595854759216, + "learning_rate": 9.289122202633029e-05, + "loss": 1.8436, + "step": 3331 + }, + { + "epoch": 0.18571985953960202, + "grad_norm": 0.5514225363731384, + "learning_rate": 9.288664507784686e-05, + "loss": 1.8193, + "step": 3332 + }, + { + "epoch": 0.18577559779276517, + "grad_norm": 0.5329412817955017, + "learning_rate": 9.288206676924906e-05, + "loss": 1.5945, + "step": 3333 + }, + { + "epoch": 0.18583133604592833, + "grad_norm": 0.5613374710083008, + "learning_rate": 9.287748710068214e-05, + "loss": 1.8746, + "step": 3334 + }, + { + "epoch": 0.18588707429909146, + "grad_norm": 0.5720524191856384, + "learning_rate": 9.287290607229131e-05, + "loss": 1.6635, + "step": 3335 + }, + { + "epoch": 0.18594281255225462, + "grad_norm": 0.5446194410324097, + "learning_rate": 9.286832368422187e-05, + "loss": 1.6587, + "step": 3336 + }, + { + "epoch": 0.18599855080541774, + "grad_norm": 0.5358483791351318, + "learning_rate": 9.286373993661916e-05, + "loss": 1.8244, + "step": 3337 + }, + { + "epoch": 0.1860542890585809, + "grad_norm": 0.5477625727653503, + "learning_rate": 9.28591548296285e-05, + "loss": 1.8085, + "step": 3338 + }, + { + "epoch": 0.18611002731174406, + "grad_norm": 0.528417706489563, + "learning_rate": 9.285456836339537e-05, + "loss": 1.7652, + "step": 3339 + }, + { + "epoch": 0.1861657655649072, + "grad_norm": 0.5157662630081177, + "learning_rate": 9.284998053806516e-05, + "loss": 1.7365, + "step": 3340 + }, + { + "epoch": 0.18622150381807034, + "grad_norm": 0.5836164951324463, + "learning_rate": 9.284539135378341e-05, + "loss": 1.8217, + "step": 3341 + }, + { + "epoch": 0.1862772420712335, + "grad_norm": 0.5283136963844299, + "learning_rate": 9.284080081069565e-05, + "loss": 1.7073, + "step": 3342 + }, + { + "epoch": 0.18633298032439663, + "grad_norm": 0.5611073970794678, + "learning_rate": 9.283620890894749e-05, + "loss": 1.6885, + "step": 3343 + }, + { + "epoch": 0.1863887185775598, + "grad_norm": 0.5854252576828003, + "learning_rate": 9.283161564868452e-05, + "loss": 1.8512, + "step": 3344 + }, + { + "epoch": 0.18644445683072292, + "grad_norm": 0.5314401984214783, + "learning_rate": 9.282702103005243e-05, + "loss": 1.8003, + "step": 3345 + }, + { + "epoch": 0.18650019508388607, + "grad_norm": 0.5689622759819031, + "learning_rate": 9.282242505319693e-05, + "loss": 1.7775, + "step": 3346 + }, + { + "epoch": 0.18655593333704923, + "grad_norm": 0.5099941492080688, + "learning_rate": 9.281782771826378e-05, + "loss": 1.4253, + "step": 3347 + }, + { + "epoch": 0.18661167159021236, + "grad_norm": 0.557032585144043, + "learning_rate": 9.281322902539878e-05, + "loss": 1.7682, + "step": 3348 + }, + { + "epoch": 0.18666740984337551, + "grad_norm": 0.5229087471961975, + "learning_rate": 9.280862897474776e-05, + "loss": 1.5904, + "step": 3349 + }, + { + "epoch": 0.18672314809653864, + "grad_norm": 0.5913739800453186, + "learning_rate": 9.280402756645663e-05, + "loss": 1.9147, + "step": 3350 + }, + { + "epoch": 0.1867788863497018, + "grad_norm": 0.5528784990310669, + "learning_rate": 9.279942480067131e-05, + "loss": 1.7212, + "step": 3351 + }, + { + "epoch": 0.18683462460286496, + "grad_norm": 0.5475696921348572, + "learning_rate": 9.279482067753777e-05, + "loss": 1.8177, + "step": 3352 + }, + { + "epoch": 0.18689036285602809, + "grad_norm": 0.5523363947868347, + "learning_rate": 9.279021519720203e-05, + "loss": 1.7726, + "step": 3353 + }, + { + "epoch": 0.18694610110919124, + "grad_norm": 0.4846109449863434, + "learning_rate": 9.278560835981016e-05, + "loss": 1.7335, + "step": 3354 + }, + { + "epoch": 0.18700183936235437, + "grad_norm": 0.5322748422622681, + "learning_rate": 9.278100016550825e-05, + "loss": 1.8071, + "step": 3355 + }, + { + "epoch": 0.18705757761551753, + "grad_norm": 0.5510337352752686, + "learning_rate": 9.277639061444244e-05, + "loss": 1.7673, + "step": 3356 + }, + { + "epoch": 0.18711331586868069, + "grad_norm": 0.5218777060508728, + "learning_rate": 9.277177970675893e-05, + "loss": 1.686, + "step": 3357 + }, + { + "epoch": 0.18716905412184381, + "grad_norm": 0.5483778715133667, + "learning_rate": 9.276716744260392e-05, + "loss": 1.8069, + "step": 3358 + }, + { + "epoch": 0.18722479237500697, + "grad_norm": 0.5690082907676697, + "learning_rate": 9.276255382212373e-05, + "loss": 1.7838, + "step": 3359 + }, + { + "epoch": 0.1872805306281701, + "grad_norm": 0.5564740896224976, + "learning_rate": 9.275793884546465e-05, + "loss": 1.6363, + "step": 3360 + }, + { + "epoch": 0.18733626888133326, + "grad_norm": 0.5689534544944763, + "learning_rate": 9.275332251277305e-05, + "loss": 1.7624, + "step": 3361 + }, + { + "epoch": 0.1873920071344964, + "grad_norm": 0.5340893864631653, + "learning_rate": 9.274870482419533e-05, + "loss": 1.785, + "step": 3362 + }, + { + "epoch": 0.18744774538765954, + "grad_norm": 0.556954562664032, + "learning_rate": 9.274408577987792e-05, + "loss": 1.7629, + "step": 3363 + }, + { + "epoch": 0.1875034836408227, + "grad_norm": 0.5275453329086304, + "learning_rate": 9.273946537996734e-05, + "loss": 1.6675, + "step": 3364 + }, + { + "epoch": 0.18755922189398586, + "grad_norm": 0.5510149598121643, + "learning_rate": 9.273484362461011e-05, + "loss": 1.8703, + "step": 3365 + }, + { + "epoch": 0.18761496014714898, + "grad_norm": 0.5040173530578613, + "learning_rate": 9.273022051395278e-05, + "loss": 1.646, + "step": 3366 + }, + { + "epoch": 0.18767069840031214, + "grad_norm": 0.5532334446907043, + "learning_rate": 9.272559604814201e-05, + "loss": 1.7221, + "step": 3367 + }, + { + "epoch": 0.18772643665347527, + "grad_norm": 0.5305314064025879, + "learning_rate": 9.272097022732443e-05, + "loss": 1.5933, + "step": 3368 + }, + { + "epoch": 0.18778217490663843, + "grad_norm": 0.5466606020927429, + "learning_rate": 9.271634305164675e-05, + "loss": 1.7312, + "step": 3369 + }, + { + "epoch": 0.18783791315980158, + "grad_norm": 0.5373468995094299, + "learning_rate": 9.271171452125575e-05, + "loss": 1.7442, + "step": 3370 + }, + { + "epoch": 0.1878936514129647, + "grad_norm": 0.5270282626152039, + "learning_rate": 9.270708463629815e-05, + "loss": 1.7939, + "step": 3371 + }, + { + "epoch": 0.18794938966612787, + "grad_norm": 0.5657024383544922, + "learning_rate": 9.270245339692086e-05, + "loss": 1.8941, + "step": 3372 + }, + { + "epoch": 0.188005127919291, + "grad_norm": 0.5092267990112305, + "learning_rate": 9.269782080327071e-05, + "loss": 1.6895, + "step": 3373 + }, + { + "epoch": 0.18806086617245416, + "grad_norm": 0.5645020008087158, + "learning_rate": 9.269318685549463e-05, + "loss": 1.6734, + "step": 3374 + }, + { + "epoch": 0.1881166044256173, + "grad_norm": 0.5031103491783142, + "learning_rate": 9.268855155373957e-05, + "loss": 1.848, + "step": 3375 + }, + { + "epoch": 0.18817234267878044, + "grad_norm": 0.5133728981018066, + "learning_rate": 9.268391489815257e-05, + "loss": 1.4297, + "step": 3376 + }, + { + "epoch": 0.1882280809319436, + "grad_norm": 0.5471519231796265, + "learning_rate": 9.267927688888062e-05, + "loss": 1.8073, + "step": 3377 + }, + { + "epoch": 0.18828381918510673, + "grad_norm": 0.545860230922699, + "learning_rate": 9.267463752607089e-05, + "loss": 1.751, + "step": 3378 + }, + { + "epoch": 0.18833955743826988, + "grad_norm": 0.4829151928424835, + "learning_rate": 9.266999680987043e-05, + "loss": 1.498, + "step": 3379 + }, + { + "epoch": 0.18839529569143304, + "grad_norm": 0.5440730452537537, + "learning_rate": 9.266535474042647e-05, + "loss": 1.4733, + "step": 3380 + }, + { + "epoch": 0.18845103394459617, + "grad_norm": 0.7026723623275757, + "learning_rate": 9.266071131788621e-05, + "loss": 1.904, + "step": 3381 + }, + { + "epoch": 0.18850677219775933, + "grad_norm": 0.49864065647125244, + "learning_rate": 9.26560665423969e-05, + "loss": 1.8644, + "step": 3382 + }, + { + "epoch": 0.18856251045092245, + "grad_norm": 0.5199279189109802, + "learning_rate": 9.265142041410589e-05, + "loss": 1.6917, + "step": 3383 + }, + { + "epoch": 0.1886182487040856, + "grad_norm": 0.5546734929084778, + "learning_rate": 9.26467729331605e-05, + "loss": 1.7944, + "step": 3384 + }, + { + "epoch": 0.18867398695724877, + "grad_norm": 0.5777541399002075, + "learning_rate": 9.26421240997081e-05, + "loss": 1.9372, + "step": 3385 + }, + { + "epoch": 0.1887297252104119, + "grad_norm": 0.6016680598258972, + "learning_rate": 9.263747391389615e-05, + "loss": 1.9591, + "step": 3386 + }, + { + "epoch": 0.18878546346357505, + "grad_norm": 0.5046743154525757, + "learning_rate": 9.263282237587213e-05, + "loss": 1.5718, + "step": 3387 + }, + { + "epoch": 0.1888412017167382, + "grad_norm": 0.5458966493606567, + "learning_rate": 9.262816948578354e-05, + "loss": 1.7829, + "step": 3388 + }, + { + "epoch": 0.18889693996990134, + "grad_norm": 0.5983991026878357, + "learning_rate": 9.262351524377797e-05, + "loss": 1.8848, + "step": 3389 + }, + { + "epoch": 0.1889526782230645, + "grad_norm": 0.5047475099563599, + "learning_rate": 9.261885965000298e-05, + "loss": 1.3356, + "step": 3390 + }, + { + "epoch": 0.18900841647622763, + "grad_norm": 0.5353848338127136, + "learning_rate": 9.261420270460628e-05, + "loss": 1.7632, + "step": 3391 + }, + { + "epoch": 0.18906415472939078, + "grad_norm": 0.5097886919975281, + "learning_rate": 9.26095444077355e-05, + "loss": 1.6608, + "step": 3392 + }, + { + "epoch": 0.18911989298255394, + "grad_norm": 0.5497481226921082, + "learning_rate": 9.260488475953842e-05, + "loss": 1.8704, + "step": 3393 + }, + { + "epoch": 0.18917563123571707, + "grad_norm": 0.5084047317504883, + "learning_rate": 9.26002237601628e-05, + "loss": 1.515, + "step": 3394 + }, + { + "epoch": 0.18923136948888022, + "grad_norm": 0.5252576470375061, + "learning_rate": 9.259556140975644e-05, + "loss": 1.448, + "step": 3395 + }, + { + "epoch": 0.18928710774204335, + "grad_norm": 0.5760124325752258, + "learning_rate": 9.259089770846723e-05, + "loss": 1.7052, + "step": 3396 + }, + { + "epoch": 0.1893428459952065, + "grad_norm": 0.5604876279830933, + "learning_rate": 9.258623265644309e-05, + "loss": 1.8782, + "step": 3397 + }, + { + "epoch": 0.18939858424836967, + "grad_norm": 0.5331717133522034, + "learning_rate": 9.258156625383192e-05, + "loss": 1.6754, + "step": 3398 + }, + { + "epoch": 0.1894543225015328, + "grad_norm": 0.5478466153144836, + "learning_rate": 9.257689850078174e-05, + "loss": 1.7709, + "step": 3399 + }, + { + "epoch": 0.18951006075469595, + "grad_norm": 0.5751819014549255, + "learning_rate": 9.257222939744059e-05, + "loss": 1.6806, + "step": 3400 + }, + { + "epoch": 0.18956579900785908, + "grad_norm": 0.557999849319458, + "learning_rate": 9.256755894395652e-05, + "loss": 1.6614, + "step": 3401 + }, + { + "epoch": 0.18962153726102224, + "grad_norm": 0.6242285370826721, + "learning_rate": 9.256288714047767e-05, + "loss": 1.9115, + "step": 3402 + }, + { + "epoch": 0.1896772755141854, + "grad_norm": 0.5403860807418823, + "learning_rate": 9.255821398715221e-05, + "loss": 1.6686, + "step": 3403 + }, + { + "epoch": 0.18973301376734852, + "grad_norm": 0.5129532814025879, + "learning_rate": 9.255353948412833e-05, + "loss": 1.5406, + "step": 3404 + }, + { + "epoch": 0.18978875202051168, + "grad_norm": 0.5771991014480591, + "learning_rate": 9.254886363155429e-05, + "loss": 1.8979, + "step": 3405 + }, + { + "epoch": 0.1898444902736748, + "grad_norm": 0.5569978356361389, + "learning_rate": 9.254418642957835e-05, + "loss": 1.7284, + "step": 3406 + }, + { + "epoch": 0.18990022852683797, + "grad_norm": 0.5016009211540222, + "learning_rate": 9.253950787834889e-05, + "loss": 1.7517, + "step": 3407 + }, + { + "epoch": 0.18995596678000112, + "grad_norm": 0.47752997279167175, + "learning_rate": 9.253482797801425e-05, + "loss": 1.7069, + "step": 3408 + }, + { + "epoch": 0.19001170503316425, + "grad_norm": 0.4722379446029663, + "learning_rate": 9.253014672872285e-05, + "loss": 1.4309, + "step": 3409 + }, + { + "epoch": 0.1900674432863274, + "grad_norm": 0.516113817691803, + "learning_rate": 9.252546413062319e-05, + "loss": 1.6337, + "step": 3410 + }, + { + "epoch": 0.19012318153949057, + "grad_norm": 0.4841940402984619, + "learning_rate": 9.252078018386374e-05, + "loss": 1.4486, + "step": 3411 + }, + { + "epoch": 0.1901789197926537, + "grad_norm": 0.566828191280365, + "learning_rate": 9.251609488859304e-05, + "loss": 1.524, + "step": 3412 + }, + { + "epoch": 0.19023465804581685, + "grad_norm": 0.5277671813964844, + "learning_rate": 9.251140824495972e-05, + "loss": 1.6331, + "step": 3413 + }, + { + "epoch": 0.19029039629897998, + "grad_norm": 0.5212645530700684, + "learning_rate": 9.250672025311237e-05, + "loss": 1.6409, + "step": 3414 + }, + { + "epoch": 0.19034613455214314, + "grad_norm": 0.5892760753631592, + "learning_rate": 9.250203091319968e-05, + "loss": 1.7712, + "step": 3415 + }, + { + "epoch": 0.1904018728053063, + "grad_norm": 0.5454036593437195, + "learning_rate": 9.24973402253704e-05, + "loss": 1.888, + "step": 3416 + }, + { + "epoch": 0.19045761105846942, + "grad_norm": 0.5001441836357117, + "learning_rate": 9.249264818977324e-05, + "loss": 1.6808, + "step": 3417 + }, + { + "epoch": 0.19051334931163258, + "grad_norm": 0.5732707977294922, + "learning_rate": 9.248795480655704e-05, + "loss": 1.8398, + "step": 3418 + }, + { + "epoch": 0.1905690875647957, + "grad_norm": 0.5356916785240173, + "learning_rate": 9.248326007587063e-05, + "loss": 1.7295, + "step": 3419 + }, + { + "epoch": 0.19062482581795887, + "grad_norm": 0.5317162275314331, + "learning_rate": 9.247856399786292e-05, + "loss": 1.7717, + "step": 3420 + }, + { + "epoch": 0.19068056407112202, + "grad_norm": 0.5117460489273071, + "learning_rate": 9.247386657268283e-05, + "loss": 1.5417, + "step": 3421 + }, + { + "epoch": 0.19073630232428515, + "grad_norm": 0.5263468623161316, + "learning_rate": 9.24691678004793e-05, + "loss": 1.8882, + "step": 3422 + }, + { + "epoch": 0.1907920405774483, + "grad_norm": 0.5721904635429382, + "learning_rate": 9.24644676814014e-05, + "loss": 1.8083, + "step": 3423 + }, + { + "epoch": 0.19084777883061144, + "grad_norm": 0.5673632025718689, + "learning_rate": 9.245976621559817e-05, + "loss": 1.8532, + "step": 3424 + }, + { + "epoch": 0.1909035170837746, + "grad_norm": 0.5096221566200256, + "learning_rate": 9.24550634032187e-05, + "loss": 1.5365, + "step": 3425 + }, + { + "epoch": 0.19095925533693775, + "grad_norm": 0.545087456703186, + "learning_rate": 9.245035924441217e-05, + "loss": 1.854, + "step": 3426 + }, + { + "epoch": 0.19101499359010088, + "grad_norm": 0.5424298644065857, + "learning_rate": 9.244565373932774e-05, + "loss": 1.7373, + "step": 3427 + }, + { + "epoch": 0.19107073184326404, + "grad_norm": 0.5558550357818604, + "learning_rate": 9.244094688811465e-05, + "loss": 1.746, + "step": 3428 + }, + { + "epoch": 0.19112647009642716, + "grad_norm": 0.49283209443092346, + "learning_rate": 9.243623869092218e-05, + "loss": 1.3836, + "step": 3429 + }, + { + "epoch": 0.19118220834959032, + "grad_norm": 0.5955911874771118, + "learning_rate": 9.24315291478996e-05, + "loss": 1.8499, + "step": 3430 + }, + { + "epoch": 0.19123794660275348, + "grad_norm": 0.5249252319335938, + "learning_rate": 9.242681825919635e-05, + "loss": 1.6767, + "step": 3431 + }, + { + "epoch": 0.1912936848559166, + "grad_norm": 0.5496412515640259, + "learning_rate": 9.242210602496178e-05, + "loss": 1.7963, + "step": 3432 + }, + { + "epoch": 0.19134942310907976, + "grad_norm": 0.5590277910232544, + "learning_rate": 9.241739244534534e-05, + "loss": 1.7885, + "step": 3433 + }, + { + "epoch": 0.19140516136224292, + "grad_norm": 0.5826262831687927, + "learning_rate": 9.241267752049653e-05, + "loss": 1.7971, + "step": 3434 + }, + { + "epoch": 0.19146089961540605, + "grad_norm": 0.5477822422981262, + "learning_rate": 9.240796125056486e-05, + "loss": 1.7376, + "step": 3435 + }, + { + "epoch": 0.1915166378685692, + "grad_norm": 0.5088443756103516, + "learning_rate": 9.240324363569992e-05, + "loss": 1.6705, + "step": 3436 + }, + { + "epoch": 0.19157237612173234, + "grad_norm": 0.5802351832389832, + "learning_rate": 9.239852467605132e-05, + "loss": 1.8505, + "step": 3437 + }, + { + "epoch": 0.1916281143748955, + "grad_norm": 0.5589656829833984, + "learning_rate": 9.239380437176872e-05, + "loss": 1.7993, + "step": 3438 + }, + { + "epoch": 0.19168385262805865, + "grad_norm": 0.5384811162948608, + "learning_rate": 9.238908272300181e-05, + "loss": 1.803, + "step": 3439 + }, + { + "epoch": 0.19173959088122178, + "grad_norm": 0.5251903533935547, + "learning_rate": 9.238435972990036e-05, + "loss": 1.6364, + "step": 3440 + }, + { + "epoch": 0.19179532913438493, + "grad_norm": 0.5536593794822693, + "learning_rate": 9.237963539261412e-05, + "loss": 1.8069, + "step": 3441 + }, + { + "epoch": 0.19185106738754806, + "grad_norm": 0.49031203985214233, + "learning_rate": 9.237490971129294e-05, + "loss": 1.6969, + "step": 3442 + }, + { + "epoch": 0.19190680564071122, + "grad_norm": 0.5111910700798035, + "learning_rate": 9.23701826860867e-05, + "loss": 1.7135, + "step": 3443 + }, + { + "epoch": 0.19196254389387438, + "grad_norm": 0.5502627491950989, + "learning_rate": 9.236545431714529e-05, + "loss": 1.8724, + "step": 3444 + }, + { + "epoch": 0.1920182821470375, + "grad_norm": 0.5772512555122375, + "learning_rate": 9.236072460461867e-05, + "loss": 1.7944, + "step": 3445 + }, + { + "epoch": 0.19207402040020066, + "grad_norm": 0.6393208503723145, + "learning_rate": 9.235599354865686e-05, + "loss": 1.5557, + "step": 3446 + }, + { + "epoch": 0.1921297586533638, + "grad_norm": 0.5822187066078186, + "learning_rate": 9.235126114940989e-05, + "loss": 1.8263, + "step": 3447 + }, + { + "epoch": 0.19218549690652695, + "grad_norm": 0.5391358733177185, + "learning_rate": 9.234652740702781e-05, + "loss": 1.7186, + "step": 3448 + }, + { + "epoch": 0.1922412351596901, + "grad_norm": 0.4919295907020569, + "learning_rate": 9.23417923216608e-05, + "loss": 1.5176, + "step": 3449 + }, + { + "epoch": 0.19229697341285323, + "grad_norm": 0.547146737575531, + "learning_rate": 9.233705589345902e-05, + "loss": 1.8129, + "step": 3450 + }, + { + "epoch": 0.1923527116660164, + "grad_norm": 0.4958893358707428, + "learning_rate": 9.233231812257265e-05, + "loss": 1.5314, + "step": 3451 + }, + { + "epoch": 0.19240844991917952, + "grad_norm": 0.4873281419277191, + "learning_rate": 9.232757900915197e-05, + "loss": 1.6043, + "step": 3452 + }, + { + "epoch": 0.19246418817234268, + "grad_norm": 0.5672634840011597, + "learning_rate": 9.232283855334727e-05, + "loss": 1.8168, + "step": 3453 + }, + { + "epoch": 0.19251992642550583, + "grad_norm": 0.514673113822937, + "learning_rate": 9.231809675530888e-05, + "loss": 1.7076, + "step": 3454 + }, + { + "epoch": 0.19257566467866896, + "grad_norm": 0.5566558241844177, + "learning_rate": 9.23133536151872e-05, + "loss": 1.8021, + "step": 3455 + }, + { + "epoch": 0.19263140293183212, + "grad_norm": 0.5627939701080322, + "learning_rate": 9.230860913313266e-05, + "loss": 1.659, + "step": 3456 + }, + { + "epoch": 0.19268714118499528, + "grad_norm": 0.5632688403129578, + "learning_rate": 9.23038633092957e-05, + "loss": 1.8172, + "step": 3457 + }, + { + "epoch": 0.1927428794381584, + "grad_norm": 0.5149570107460022, + "learning_rate": 9.229911614382685e-05, + "loss": 1.6086, + "step": 3458 + }, + { + "epoch": 0.19279861769132156, + "grad_norm": 0.5687461495399475, + "learning_rate": 9.229436763687665e-05, + "loss": 1.7102, + "step": 3459 + }, + { + "epoch": 0.1928543559444847, + "grad_norm": 0.527733325958252, + "learning_rate": 9.228961778859572e-05, + "loss": 1.6291, + "step": 3460 + }, + { + "epoch": 0.19291009419764785, + "grad_norm": 0.5713732242584229, + "learning_rate": 9.228486659913467e-05, + "loss": 1.7628, + "step": 3461 + }, + { + "epoch": 0.192965832450811, + "grad_norm": 0.5368852019309998, + "learning_rate": 9.228011406864417e-05, + "loss": 1.6604, + "step": 3462 + }, + { + "epoch": 0.19302157070397413, + "grad_norm": 0.5099670886993408, + "learning_rate": 9.227536019727498e-05, + "loss": 1.6571, + "step": 3463 + }, + { + "epoch": 0.1930773089571373, + "grad_norm": 0.5792325735092163, + "learning_rate": 9.227060498517785e-05, + "loss": 1.6586, + "step": 3464 + }, + { + "epoch": 0.19313304721030042, + "grad_norm": 0.5870433449745178, + "learning_rate": 9.226584843250357e-05, + "loss": 1.6693, + "step": 3465 + }, + { + "epoch": 0.19318878546346357, + "grad_norm": 0.5723249316215515, + "learning_rate": 9.226109053940302e-05, + "loss": 1.8516, + "step": 3466 + }, + { + "epoch": 0.19324452371662673, + "grad_norm": 0.5492411255836487, + "learning_rate": 9.225633130602707e-05, + "loss": 1.8369, + "step": 3467 + }, + { + "epoch": 0.19330026196978986, + "grad_norm": 0.5040132403373718, + "learning_rate": 9.225157073252666e-05, + "loss": 1.7936, + "step": 3468 + }, + { + "epoch": 0.19335600022295302, + "grad_norm": 0.5484923124313354, + "learning_rate": 9.224680881905279e-05, + "loss": 1.8398, + "step": 3469 + }, + { + "epoch": 0.19341173847611615, + "grad_norm": 0.6042559146881104, + "learning_rate": 9.224204556575644e-05, + "loss": 1.8699, + "step": 3470 + }, + { + "epoch": 0.1934674767292793, + "grad_norm": 0.5580307841300964, + "learning_rate": 9.22372809727887e-05, + "loss": 1.6961, + "step": 3471 + }, + { + "epoch": 0.19352321498244246, + "grad_norm": 0.5399236679077148, + "learning_rate": 9.223251504030066e-05, + "loss": 1.6302, + "step": 3472 + }, + { + "epoch": 0.1935789532356056, + "grad_norm": 0.5522669553756714, + "learning_rate": 9.222774776844349e-05, + "loss": 1.765, + "step": 3473 + }, + { + "epoch": 0.19363469148876875, + "grad_norm": 0.5530064105987549, + "learning_rate": 9.222297915736834e-05, + "loss": 1.783, + "step": 3474 + }, + { + "epoch": 0.19369042974193187, + "grad_norm": 0.5082196593284607, + "learning_rate": 9.22182092072265e-05, + "loss": 1.6188, + "step": 3475 + }, + { + "epoch": 0.19374616799509503, + "grad_norm": 0.5311219692230225, + "learning_rate": 9.221343791816918e-05, + "loss": 1.7017, + "step": 3476 + }, + { + "epoch": 0.1938019062482582, + "grad_norm": 0.542589545249939, + "learning_rate": 9.220866529034776e-05, + "loss": 1.7064, + "step": 3477 + }, + { + "epoch": 0.19385764450142132, + "grad_norm": 0.5327942967414856, + "learning_rate": 9.220389132391356e-05, + "loss": 1.7807, + "step": 3478 + }, + { + "epoch": 0.19391338275458447, + "grad_norm": 0.523639976978302, + "learning_rate": 9.219911601901799e-05, + "loss": 1.5785, + "step": 3479 + }, + { + "epoch": 0.19396912100774763, + "grad_norm": 0.5756027102470398, + "learning_rate": 9.21943393758125e-05, + "loss": 2.0297, + "step": 3480 + }, + { + "epoch": 0.19402485926091076, + "grad_norm": 0.5392191410064697, + "learning_rate": 9.218956139444858e-05, + "loss": 1.6824, + "step": 3481 + }, + { + "epoch": 0.19408059751407392, + "grad_norm": 0.536055326461792, + "learning_rate": 9.218478207507775e-05, + "loss": 1.7264, + "step": 3482 + }, + { + "epoch": 0.19413633576723704, + "grad_norm": 0.5701099634170532, + "learning_rate": 9.218000141785158e-05, + "loss": 1.7967, + "step": 3483 + }, + { + "epoch": 0.1941920740204002, + "grad_norm": 0.586493194103241, + "learning_rate": 9.21752194229217e-05, + "loss": 2.0026, + "step": 3484 + }, + { + "epoch": 0.19424781227356336, + "grad_norm": 0.5607553124427795, + "learning_rate": 9.217043609043975e-05, + "loss": 1.8374, + "step": 3485 + }, + { + "epoch": 0.1943035505267265, + "grad_norm": 0.5268848538398743, + "learning_rate": 9.216565142055745e-05, + "loss": 1.6248, + "step": 3486 + }, + { + "epoch": 0.19435928877988964, + "grad_norm": 0.563528299331665, + "learning_rate": 9.216086541342652e-05, + "loss": 1.8659, + "step": 3487 + }, + { + "epoch": 0.19441502703305277, + "grad_norm": 0.5309708714485168, + "learning_rate": 9.215607806919877e-05, + "loss": 1.7026, + "step": 3488 + }, + { + "epoch": 0.19447076528621593, + "grad_norm": 0.5582777857780457, + "learning_rate": 9.2151289388026e-05, + "loss": 1.8766, + "step": 3489 + }, + { + "epoch": 0.1945265035393791, + "grad_norm": 0.5012943744659424, + "learning_rate": 9.214649937006008e-05, + "loss": 1.372, + "step": 3490 + }, + { + "epoch": 0.19458224179254222, + "grad_norm": 0.5534226298332214, + "learning_rate": 9.214170801545294e-05, + "loss": 1.8491, + "step": 3491 + }, + { + "epoch": 0.19463798004570537, + "grad_norm": 0.5312340259552002, + "learning_rate": 9.213691532435654e-05, + "loss": 1.4738, + "step": 3492 + }, + { + "epoch": 0.1946937182988685, + "grad_norm": 0.5233004093170166, + "learning_rate": 9.213212129692284e-05, + "loss": 1.5871, + "step": 3493 + }, + { + "epoch": 0.19474945655203166, + "grad_norm": 0.5227805972099304, + "learning_rate": 9.212732593330389e-05, + "loss": 1.6355, + "step": 3494 + }, + { + "epoch": 0.19480519480519481, + "grad_norm": 0.5237340927124023, + "learning_rate": 9.21225292336518e-05, + "loss": 1.8903, + "step": 3495 + }, + { + "epoch": 0.19486093305835794, + "grad_norm": 0.5420545935630798, + "learning_rate": 9.211773119811867e-05, + "loss": 1.9006, + "step": 3496 + }, + { + "epoch": 0.1949166713115211, + "grad_norm": 0.534702718257904, + "learning_rate": 9.211293182685667e-05, + "loss": 1.5601, + "step": 3497 + }, + { + "epoch": 0.19497240956468423, + "grad_norm": 0.5968030095100403, + "learning_rate": 9.210813112001802e-05, + "loss": 1.7871, + "step": 3498 + }, + { + "epoch": 0.19502814781784739, + "grad_norm": 0.5270916223526001, + "learning_rate": 9.210332907775494e-05, + "loss": 1.69, + "step": 3499 + }, + { + "epoch": 0.19508388607101054, + "grad_norm": 0.5496137142181396, + "learning_rate": 9.209852570021975e-05, + "loss": 1.916, + "step": 3500 + }, + { + "epoch": 0.19513962432417367, + "grad_norm": 0.5198974013328552, + "learning_rate": 9.209372098756476e-05, + "loss": 1.6651, + "step": 3501 + }, + { + "epoch": 0.19519536257733683, + "grad_norm": 0.5615696907043457, + "learning_rate": 9.208891493994239e-05, + "loss": 1.7589, + "step": 3502 + }, + { + "epoch": 0.19525110083049999, + "grad_norm": 0.5367715954780579, + "learning_rate": 9.208410755750501e-05, + "loss": 1.5889, + "step": 3503 + }, + { + "epoch": 0.19530683908366311, + "grad_norm": 0.6133012771606445, + "learning_rate": 9.207929884040511e-05, + "loss": 1.8472, + "step": 3504 + }, + { + "epoch": 0.19536257733682627, + "grad_norm": 0.6582043170928955, + "learning_rate": 9.20744887887952e-05, + "loss": 1.6471, + "step": 3505 + }, + { + "epoch": 0.1954183155899894, + "grad_norm": 0.5180196762084961, + "learning_rate": 9.206967740282783e-05, + "loss": 1.5727, + "step": 3506 + }, + { + "epoch": 0.19547405384315256, + "grad_norm": 0.5526701807975769, + "learning_rate": 9.206486468265555e-05, + "loss": 1.635, + "step": 3507 + }, + { + "epoch": 0.1955297920963157, + "grad_norm": 0.6198756694793701, + "learning_rate": 9.206005062843102e-05, + "loss": 1.7088, + "step": 3508 + }, + { + "epoch": 0.19558553034947884, + "grad_norm": 0.5373274683952332, + "learning_rate": 9.205523524030693e-05, + "loss": 1.7032, + "step": 3509 + }, + { + "epoch": 0.195641268602642, + "grad_norm": 0.5724993944168091, + "learning_rate": 9.205041851843596e-05, + "loss": 1.8822, + "step": 3510 + }, + { + "epoch": 0.19569700685580513, + "grad_norm": 0.5542033314704895, + "learning_rate": 9.20456004629709e-05, + "loss": 1.333, + "step": 3511 + }, + { + "epoch": 0.19575274510896828, + "grad_norm": 0.5784552693367004, + "learning_rate": 9.204078107406454e-05, + "loss": 1.8277, + "step": 3512 + }, + { + "epoch": 0.19580848336213144, + "grad_norm": 0.5339728593826294, + "learning_rate": 9.203596035186969e-05, + "loss": 1.5545, + "step": 3513 + }, + { + "epoch": 0.19586422161529457, + "grad_norm": 0.5574887990951538, + "learning_rate": 9.203113829653927e-05, + "loss": 1.6811, + "step": 3514 + }, + { + "epoch": 0.19591995986845773, + "grad_norm": 0.48576298356056213, + "learning_rate": 9.202631490822622e-05, + "loss": 1.548, + "step": 3515 + }, + { + "epoch": 0.19597569812162086, + "grad_norm": 0.516997218132019, + "learning_rate": 9.202149018708347e-05, + "loss": 1.6624, + "step": 3516 + }, + { + "epoch": 0.196031436374784, + "grad_norm": 0.5537340641021729, + "learning_rate": 9.201666413326408e-05, + "loss": 1.8557, + "step": 3517 + }, + { + "epoch": 0.19608717462794717, + "grad_norm": 0.5295738577842712, + "learning_rate": 9.201183674692107e-05, + "loss": 1.5435, + "step": 3518 + }, + { + "epoch": 0.1961429128811103, + "grad_norm": 0.47536125779151917, + "learning_rate": 9.200700802820754e-05, + "loss": 1.4683, + "step": 3519 + }, + { + "epoch": 0.19619865113427346, + "grad_norm": 0.546451985836029, + "learning_rate": 9.200217797727662e-05, + "loss": 1.8706, + "step": 3520 + }, + { + "epoch": 0.19625438938743658, + "grad_norm": 0.5166674256324768, + "learning_rate": 9.199734659428152e-05, + "loss": 1.5608, + "step": 3521 + }, + { + "epoch": 0.19631012764059974, + "grad_norm": 0.5700700879096985, + "learning_rate": 9.199251387937545e-05, + "loss": 1.7221, + "step": 3522 + }, + { + "epoch": 0.1963658658937629, + "grad_norm": 0.6089435815811157, + "learning_rate": 9.198767983271166e-05, + "loss": 1.7989, + "step": 3523 + }, + { + "epoch": 0.19642160414692603, + "grad_norm": 0.6160342693328857, + "learning_rate": 9.198284445444348e-05, + "loss": 2.0033, + "step": 3524 + }, + { + "epoch": 0.19647734240008918, + "grad_norm": 0.6272563338279724, + "learning_rate": 9.197800774472426e-05, + "loss": 1.9705, + "step": 3525 + }, + { + "epoch": 0.19653308065325234, + "grad_norm": 0.4671235680580139, + "learning_rate": 9.197316970370737e-05, + "loss": 1.0644, + "step": 3526 + }, + { + "epoch": 0.19658881890641547, + "grad_norm": 0.5911363959312439, + "learning_rate": 9.196833033154625e-05, + "loss": 1.662, + "step": 3527 + }, + { + "epoch": 0.19664455715957863, + "grad_norm": 0.552719235420227, + "learning_rate": 9.19634896283944e-05, + "loss": 1.7743, + "step": 3528 + }, + { + "epoch": 0.19670029541274175, + "grad_norm": 0.5252164006233215, + "learning_rate": 9.195864759440531e-05, + "loss": 1.7682, + "step": 3529 + }, + { + "epoch": 0.1967560336659049, + "grad_norm": 0.53560471534729, + "learning_rate": 9.195380422973257e-05, + "loss": 1.6731, + "step": 3530 + }, + { + "epoch": 0.19681177191906807, + "grad_norm": 0.5091952085494995, + "learning_rate": 9.194895953452976e-05, + "loss": 1.4618, + "step": 3531 + }, + { + "epoch": 0.1968675101722312, + "grad_norm": 0.5449403524398804, + "learning_rate": 9.194411350895053e-05, + "loss": 1.7007, + "step": 3532 + }, + { + "epoch": 0.19692324842539435, + "grad_norm": 0.5258320569992065, + "learning_rate": 9.193926615314857e-05, + "loss": 1.8571, + "step": 3533 + }, + { + "epoch": 0.19697898667855748, + "grad_norm": 0.5018019080162048, + "learning_rate": 9.193441746727762e-05, + "loss": 1.4968, + "step": 3534 + }, + { + "epoch": 0.19703472493172064, + "grad_norm": 0.570955753326416, + "learning_rate": 9.192956745149144e-05, + "loss": 1.8938, + "step": 3535 + }, + { + "epoch": 0.1970904631848838, + "grad_norm": 0.595371663570404, + "learning_rate": 9.192471610594384e-05, + "loss": 2.0865, + "step": 3536 + }, + { + "epoch": 0.19714620143804693, + "grad_norm": 0.5452008247375488, + "learning_rate": 9.191986343078868e-05, + "loss": 1.7354, + "step": 3537 + }, + { + "epoch": 0.19720193969121008, + "grad_norm": 0.5002971887588501, + "learning_rate": 9.191500942617988e-05, + "loss": 1.5218, + "step": 3538 + }, + { + "epoch": 0.1972576779443732, + "grad_norm": 0.5388283133506775, + "learning_rate": 9.191015409227134e-05, + "loss": 1.6676, + "step": 3539 + }, + { + "epoch": 0.19731341619753637, + "grad_norm": 0.5798291563987732, + "learning_rate": 9.190529742921707e-05, + "loss": 1.8602, + "step": 3540 + }, + { + "epoch": 0.19736915445069952, + "grad_norm": 0.5622314214706421, + "learning_rate": 9.190043943717111e-05, + "loss": 1.7324, + "step": 3541 + }, + { + "epoch": 0.19742489270386265, + "grad_norm": 0.5845619440078735, + "learning_rate": 9.189558011628749e-05, + "loss": 1.8098, + "step": 3542 + }, + { + "epoch": 0.1974806309570258, + "grad_norm": 0.5707986354827881, + "learning_rate": 9.189071946672033e-05, + "loss": 1.9953, + "step": 3543 + }, + { + "epoch": 0.19753636921018894, + "grad_norm": 0.5030776858329773, + "learning_rate": 9.18858574886238e-05, + "loss": 1.6697, + "step": 3544 + }, + { + "epoch": 0.1975921074633521, + "grad_norm": 0.5452118515968323, + "learning_rate": 9.188099418215208e-05, + "loss": 1.4443, + "step": 3545 + }, + { + "epoch": 0.19764784571651525, + "grad_norm": 0.5277875065803528, + "learning_rate": 9.187612954745942e-05, + "loss": 1.738, + "step": 3546 + }, + { + "epoch": 0.19770358396967838, + "grad_norm": 0.563870906829834, + "learning_rate": 9.187126358470006e-05, + "loss": 1.7746, + "step": 3547 + }, + { + "epoch": 0.19775932222284154, + "grad_norm": 0.5097183585166931, + "learning_rate": 9.186639629402836e-05, + "loss": 1.5869, + "step": 3548 + }, + { + "epoch": 0.1978150604760047, + "grad_norm": 0.5304349660873413, + "learning_rate": 9.186152767559866e-05, + "loss": 1.4967, + "step": 3549 + }, + { + "epoch": 0.19787079872916782, + "grad_norm": 0.5379878878593445, + "learning_rate": 9.185665772956539e-05, + "loss": 1.7457, + "step": 3550 + }, + { + "epoch": 0.19792653698233098, + "grad_norm": 0.5299242734909058, + "learning_rate": 9.185178645608297e-05, + "loss": 1.6194, + "step": 3551 + }, + { + "epoch": 0.1979822752354941, + "grad_norm": 0.5131285190582275, + "learning_rate": 9.184691385530588e-05, + "loss": 1.8616, + "step": 3552 + }, + { + "epoch": 0.19803801348865727, + "grad_norm": 0.5294276475906372, + "learning_rate": 9.184203992738869e-05, + "loss": 1.5835, + "step": 3553 + }, + { + "epoch": 0.19809375174182042, + "grad_norm": 0.544457197189331, + "learning_rate": 9.183716467248593e-05, + "loss": 1.6874, + "step": 3554 + }, + { + "epoch": 0.19814948999498355, + "grad_norm": 0.5258937478065491, + "learning_rate": 9.183228809075223e-05, + "loss": 1.7946, + "step": 3555 + }, + { + "epoch": 0.1982052282481467, + "grad_norm": 0.5388005971908569, + "learning_rate": 9.182741018234228e-05, + "loss": 1.6509, + "step": 3556 + }, + { + "epoch": 0.19826096650130984, + "grad_norm": 0.5726017951965332, + "learning_rate": 9.182253094741073e-05, + "loss": 1.6885, + "step": 3557 + }, + { + "epoch": 0.198316704754473, + "grad_norm": 0.5634879469871521, + "learning_rate": 9.181765038611234e-05, + "loss": 1.7431, + "step": 3558 + }, + { + "epoch": 0.19837244300763615, + "grad_norm": 0.5139129161834717, + "learning_rate": 9.18127684986019e-05, + "loss": 1.763, + "step": 3559 + }, + { + "epoch": 0.19842818126079928, + "grad_norm": 0.5589642524719238, + "learning_rate": 9.180788528503423e-05, + "loss": 1.9388, + "step": 3560 + }, + { + "epoch": 0.19848391951396244, + "grad_norm": 0.538282036781311, + "learning_rate": 9.18030007455642e-05, + "loss": 1.8491, + "step": 3561 + }, + { + "epoch": 0.19853965776712557, + "grad_norm": 0.5197616219520569, + "learning_rate": 9.179811488034671e-05, + "loss": 1.657, + "step": 3562 + }, + { + "epoch": 0.19859539602028872, + "grad_norm": 0.569980800151825, + "learning_rate": 9.17932276895367e-05, + "loss": 1.8632, + "step": 3563 + }, + { + "epoch": 0.19865113427345188, + "grad_norm": 0.6533870100975037, + "learning_rate": 9.17883391732892e-05, + "loss": 2.2768, + "step": 3564 + }, + { + "epoch": 0.198706872526615, + "grad_norm": 0.5272773504257202, + "learning_rate": 9.178344933175922e-05, + "loss": 1.7145, + "step": 3565 + }, + { + "epoch": 0.19876261077977816, + "grad_norm": 0.5350964069366455, + "learning_rate": 9.177855816510184e-05, + "loss": 1.6678, + "step": 3566 + }, + { + "epoch": 0.1988183490329413, + "grad_norm": 0.5308762788772583, + "learning_rate": 9.177366567347216e-05, + "loss": 1.6745, + "step": 3567 + }, + { + "epoch": 0.19887408728610445, + "grad_norm": 0.552905261516571, + "learning_rate": 9.176877185702539e-05, + "loss": 1.7337, + "step": 3568 + }, + { + "epoch": 0.1989298255392676, + "grad_norm": 0.5350809693336487, + "learning_rate": 9.17638767159167e-05, + "loss": 1.754, + "step": 3569 + }, + { + "epoch": 0.19898556379243074, + "grad_norm": 0.5393645167350769, + "learning_rate": 9.175898025030134e-05, + "loss": 1.6508, + "step": 3570 + }, + { + "epoch": 0.1990413020455939, + "grad_norm": 0.5781660079956055, + "learning_rate": 9.175408246033458e-05, + "loss": 1.7258, + "step": 3571 + }, + { + "epoch": 0.19909704029875705, + "grad_norm": 0.5230069160461426, + "learning_rate": 9.17491833461718e-05, + "loss": 1.5918, + "step": 3572 + }, + { + "epoch": 0.19915277855192018, + "grad_norm": 0.54449063539505, + "learning_rate": 9.174428290796833e-05, + "loss": 1.4328, + "step": 3573 + }, + { + "epoch": 0.19920851680508334, + "grad_norm": 0.5652233958244324, + "learning_rate": 9.173938114587957e-05, + "loss": 1.6627, + "step": 3574 + }, + { + "epoch": 0.19926425505824646, + "grad_norm": 0.5487927198410034, + "learning_rate": 9.173447806006102e-05, + "loss": 1.6238, + "step": 3575 + }, + { + "epoch": 0.19931999331140962, + "grad_norm": 0.5450085401535034, + "learning_rate": 9.172957365066815e-05, + "loss": 1.8033, + "step": 3576 + }, + { + "epoch": 0.19937573156457278, + "grad_norm": 0.5951147079467773, + "learning_rate": 9.17246679178565e-05, + "loss": 2.0117, + "step": 3577 + }, + { + "epoch": 0.1994314698177359, + "grad_norm": 0.5555893778800964, + "learning_rate": 9.171976086178164e-05, + "loss": 1.6994, + "step": 3578 + }, + { + "epoch": 0.19948720807089906, + "grad_norm": 0.4888277053833008, + "learning_rate": 9.171485248259924e-05, + "loss": 1.555, + "step": 3579 + }, + { + "epoch": 0.1995429463240622, + "grad_norm": 0.5293035507202148, + "learning_rate": 9.170994278046492e-05, + "loss": 1.7463, + "step": 3580 + }, + { + "epoch": 0.19959868457722535, + "grad_norm": 0.544032096862793, + "learning_rate": 9.17050317555344e-05, + "loss": 1.8112, + "step": 3581 + }, + { + "epoch": 0.1996544228303885, + "grad_norm": 0.5483592748641968, + "learning_rate": 9.170011940796341e-05, + "loss": 1.7906, + "step": 3582 + }, + { + "epoch": 0.19971016108355163, + "grad_norm": 0.6069881319999695, + "learning_rate": 9.16952057379078e-05, + "loss": 2.0624, + "step": 3583 + }, + { + "epoch": 0.1997658993367148, + "grad_norm": 0.5667694211006165, + "learning_rate": 9.169029074552333e-05, + "loss": 1.8233, + "step": 3584 + }, + { + "epoch": 0.19982163758987792, + "grad_norm": 0.5053529739379883, + "learning_rate": 9.168537443096592e-05, + "loss": 1.6512, + "step": 3585 + }, + { + "epoch": 0.19987737584304108, + "grad_norm": 0.5334288477897644, + "learning_rate": 9.168045679439149e-05, + "loss": 1.5675, + "step": 3586 + }, + { + "epoch": 0.19993311409620423, + "grad_norm": 0.61188805103302, + "learning_rate": 9.167553783595597e-05, + "loss": 1.8834, + "step": 3587 + }, + { + "epoch": 0.19998885234936736, + "grad_norm": 0.5691487193107605, + "learning_rate": 9.167061755581539e-05, + "loss": 1.7663, + "step": 3588 + }, + { + "epoch": 0.20004459060253052, + "grad_norm": 0.5586220026016235, + "learning_rate": 9.166569595412575e-05, + "loss": 1.8832, + "step": 3589 + }, + { + "epoch": 0.20010032885569365, + "grad_norm": 0.4987550675868988, + "learning_rate": 9.166077303104319e-05, + "loss": 1.661, + "step": 3590 + }, + { + "epoch": 0.2001560671088568, + "grad_norm": 0.5463746190071106, + "learning_rate": 9.165584878672378e-05, + "loss": 1.6764, + "step": 3591 + }, + { + "epoch": 0.20021180536201996, + "grad_norm": 0.5752919316291809, + "learning_rate": 9.165092322132374e-05, + "loss": 1.6847, + "step": 3592 + }, + { + "epoch": 0.2002675436151831, + "grad_norm": 0.5271925330162048, + "learning_rate": 9.164599633499925e-05, + "loss": 1.7428, + "step": 3593 + }, + { + "epoch": 0.20032328186834625, + "grad_norm": 0.4875536262989044, + "learning_rate": 9.164106812790657e-05, + "loss": 1.3011, + "step": 3594 + }, + { + "epoch": 0.2003790201215094, + "grad_norm": 0.5993346571922302, + "learning_rate": 9.1636138600202e-05, + "loss": 1.8065, + "step": 3595 + }, + { + "epoch": 0.20043475837467253, + "grad_norm": 0.5418604612350464, + "learning_rate": 9.163120775204187e-05, + "loss": 1.6812, + "step": 3596 + }, + { + "epoch": 0.2004904966278357, + "grad_norm": 0.5411487817764282, + "learning_rate": 9.162627558358255e-05, + "loss": 1.8109, + "step": 3597 + }, + { + "epoch": 0.20054623488099882, + "grad_norm": 0.5583702325820923, + "learning_rate": 9.162134209498046e-05, + "loss": 1.8183, + "step": 3598 + }, + { + "epoch": 0.20060197313416198, + "grad_norm": 0.6028481721878052, + "learning_rate": 9.161640728639207e-05, + "loss": 1.8642, + "step": 3599 + }, + { + "epoch": 0.20065771138732513, + "grad_norm": 0.5424187183380127, + "learning_rate": 9.161147115797388e-05, + "loss": 1.8178, + "step": 3600 + }, + { + "epoch": 0.20071344964048826, + "grad_norm": 0.6147588491439819, + "learning_rate": 9.160653370988243e-05, + "loss": 1.7343, + "step": 3601 + }, + { + "epoch": 0.20076918789365142, + "grad_norm": 0.5581020712852478, + "learning_rate": 9.160159494227434e-05, + "loss": 1.713, + "step": 3602 + }, + { + "epoch": 0.20082492614681455, + "grad_norm": 0.5363709330558777, + "learning_rate": 9.15966548553062e-05, + "loss": 1.8839, + "step": 3603 + }, + { + "epoch": 0.2008806643999777, + "grad_norm": 0.5731095671653748, + "learning_rate": 9.159171344913469e-05, + "loss": 1.8919, + "step": 3604 + }, + { + "epoch": 0.20093640265314086, + "grad_norm": 0.5256056785583496, + "learning_rate": 9.158677072391653e-05, + "loss": 1.7236, + "step": 3605 + }, + { + "epoch": 0.200992140906304, + "grad_norm": 0.5467107892036438, + "learning_rate": 9.158182667980846e-05, + "loss": 1.6551, + "step": 3606 + }, + { + "epoch": 0.20104787915946715, + "grad_norm": 0.5082773566246033, + "learning_rate": 9.157688131696729e-05, + "loss": 1.6537, + "step": 3607 + }, + { + "epoch": 0.20110361741263028, + "grad_norm": 0.5320789813995361, + "learning_rate": 9.157193463554986e-05, + "loss": 1.6112, + "step": 3608 + }, + { + "epoch": 0.20115935566579343, + "grad_norm": 0.5658825635910034, + "learning_rate": 9.156698663571305e-05, + "loss": 1.6377, + "step": 3609 + }, + { + "epoch": 0.2012150939189566, + "grad_norm": 0.593096137046814, + "learning_rate": 9.156203731761376e-05, + "loss": 1.7296, + "step": 3610 + }, + { + "epoch": 0.20127083217211972, + "grad_norm": 0.5300352573394775, + "learning_rate": 9.155708668140899e-05, + "loss": 1.4073, + "step": 3611 + }, + { + "epoch": 0.20132657042528287, + "grad_norm": 0.5179193019866943, + "learning_rate": 9.155213472725571e-05, + "loss": 1.5432, + "step": 3612 + }, + { + "epoch": 0.201382308678446, + "grad_norm": 0.5618082880973816, + "learning_rate": 9.154718145531098e-05, + "loss": 1.79, + "step": 3613 + }, + { + "epoch": 0.20143804693160916, + "grad_norm": 0.47643256187438965, + "learning_rate": 9.15422268657319e-05, + "loss": 1.4084, + "step": 3614 + }, + { + "epoch": 0.20149378518477232, + "grad_norm": 0.6415194272994995, + "learning_rate": 9.15372709586756e-05, + "loss": 1.9196, + "step": 3615 + }, + { + "epoch": 0.20154952343793545, + "grad_norm": 0.5599740147590637, + "learning_rate": 9.153231373429922e-05, + "loss": 1.8508, + "step": 3616 + }, + { + "epoch": 0.2016052616910986, + "grad_norm": 0.5777899622917175, + "learning_rate": 9.152735519276002e-05, + "loss": 1.8367, + "step": 3617 + }, + { + "epoch": 0.20166099994426176, + "grad_norm": 0.5653935670852661, + "learning_rate": 9.152239533421523e-05, + "loss": 1.5819, + "step": 3618 + }, + { + "epoch": 0.2017167381974249, + "grad_norm": 0.5558584928512573, + "learning_rate": 9.151743415882215e-05, + "loss": 1.8245, + "step": 3619 + }, + { + "epoch": 0.20177247645058805, + "grad_norm": 0.5481976866722107, + "learning_rate": 9.151247166673811e-05, + "loss": 1.6422, + "step": 3620 + }, + { + "epoch": 0.20182821470375117, + "grad_norm": 0.49504461884498596, + "learning_rate": 9.150750785812052e-05, + "loss": 1.5992, + "step": 3621 + }, + { + "epoch": 0.20188395295691433, + "grad_norm": 0.6056009531021118, + "learning_rate": 9.150254273312677e-05, + "loss": 1.7729, + "step": 3622 + }, + { + "epoch": 0.2019396912100775, + "grad_norm": 0.5418253540992737, + "learning_rate": 9.149757629191436e-05, + "loss": 1.8279, + "step": 3623 + }, + { + "epoch": 0.20199542946324062, + "grad_norm": 0.5427140593528748, + "learning_rate": 9.149260853464077e-05, + "loss": 1.6135, + "step": 3624 + }, + { + "epoch": 0.20205116771640377, + "grad_norm": 0.5552391409873962, + "learning_rate": 9.148763946146354e-05, + "loss": 1.6617, + "step": 3625 + }, + { + "epoch": 0.2021069059695669, + "grad_norm": 0.5886726975440979, + "learning_rate": 9.148266907254031e-05, + "loss": 1.9072, + "step": 3626 + }, + { + "epoch": 0.20216264422273006, + "grad_norm": 0.587967038154602, + "learning_rate": 9.147769736802864e-05, + "loss": 1.7807, + "step": 3627 + }, + { + "epoch": 0.20221838247589322, + "grad_norm": 0.5265384912490845, + "learning_rate": 9.147272434808627e-05, + "loss": 1.5633, + "step": 3628 + }, + { + "epoch": 0.20227412072905634, + "grad_norm": 0.5282620191574097, + "learning_rate": 9.146775001287088e-05, + "loss": 1.579, + "step": 3629 + }, + { + "epoch": 0.2023298589822195, + "grad_norm": 0.5758345723152161, + "learning_rate": 9.146277436254022e-05, + "loss": 1.8881, + "step": 3630 + }, + { + "epoch": 0.20238559723538263, + "grad_norm": 0.5375788807868958, + "learning_rate": 9.145779739725213e-05, + "loss": 1.7915, + "step": 3631 + }, + { + "epoch": 0.2024413354885458, + "grad_norm": 0.5047256350517273, + "learning_rate": 9.14528191171644e-05, + "loss": 1.8006, + "step": 3632 + }, + { + "epoch": 0.20249707374170894, + "grad_norm": 0.5424186587333679, + "learning_rate": 9.144783952243493e-05, + "loss": 1.5753, + "step": 3633 + }, + { + "epoch": 0.20255281199487207, + "grad_norm": 0.5288758277893066, + "learning_rate": 9.144285861322166e-05, + "loss": 1.7276, + "step": 3634 + }, + { + "epoch": 0.20260855024803523, + "grad_norm": 0.638491690158844, + "learning_rate": 9.143787638968254e-05, + "loss": 1.8898, + "step": 3635 + }, + { + "epoch": 0.20266428850119836, + "grad_norm": 0.5804757475852966, + "learning_rate": 9.143289285197558e-05, + "loss": 1.9973, + "step": 3636 + }, + { + "epoch": 0.20272002675436152, + "grad_norm": 0.6197081804275513, + "learning_rate": 9.142790800025883e-05, + "loss": 1.7459, + "step": 3637 + }, + { + "epoch": 0.20277576500752467, + "grad_norm": 0.6034955382347107, + "learning_rate": 9.142292183469039e-05, + "loss": 1.9412, + "step": 3638 + }, + { + "epoch": 0.2028315032606878, + "grad_norm": 0.5404736995697021, + "learning_rate": 9.141793435542836e-05, + "loss": 1.6073, + "step": 3639 + }, + { + "epoch": 0.20288724151385096, + "grad_norm": 0.48670318722724915, + "learning_rate": 9.141294556263096e-05, + "loss": 1.5109, + "step": 3640 + }, + { + "epoch": 0.20294297976701411, + "grad_norm": 0.5840024948120117, + "learning_rate": 9.140795545645636e-05, + "loss": 1.7593, + "step": 3641 + }, + { + "epoch": 0.20299871802017724, + "grad_norm": 0.5371603965759277, + "learning_rate": 9.140296403706284e-05, + "loss": 1.6055, + "step": 3642 + }, + { + "epoch": 0.2030544562733404, + "grad_norm": 1.0509992837905884, + "learning_rate": 9.13979713046087e-05, + "loss": 2.0113, + "step": 3643 + }, + { + "epoch": 0.20311019452650353, + "grad_norm": 0.49479854106903076, + "learning_rate": 9.139297725925229e-05, + "loss": 1.516, + "step": 3644 + }, + { + "epoch": 0.20316593277966669, + "grad_norm": 0.5389636754989624, + "learning_rate": 9.138798190115196e-05, + "loss": 1.9002, + "step": 3645 + }, + { + "epoch": 0.20322167103282984, + "grad_norm": 0.5524114370346069, + "learning_rate": 9.138298523046617e-05, + "loss": 1.6288, + "step": 3646 + }, + { + "epoch": 0.20327740928599297, + "grad_norm": 0.49681249260902405, + "learning_rate": 9.137798724735336e-05, + "loss": 1.4397, + "step": 3647 + }, + { + "epoch": 0.20333314753915613, + "grad_norm": 0.6418421268463135, + "learning_rate": 9.137298795197204e-05, + "loss": 2.1691, + "step": 3648 + }, + { + "epoch": 0.20338888579231926, + "grad_norm": 0.5589434504508972, + "learning_rate": 9.136798734448077e-05, + "loss": 1.781, + "step": 3649 + }, + { + "epoch": 0.20344462404548241, + "grad_norm": 0.5447176694869995, + "learning_rate": 9.136298542503814e-05, + "loss": 1.6205, + "step": 3650 + }, + { + "epoch": 0.20350036229864557, + "grad_norm": 0.5343891978263855, + "learning_rate": 9.135798219380276e-05, + "loss": 1.7727, + "step": 3651 + }, + { + "epoch": 0.2035561005518087, + "grad_norm": 0.5254631638526917, + "learning_rate": 9.135297765093333e-05, + "loss": 1.7057, + "step": 3652 + }, + { + "epoch": 0.20361183880497186, + "grad_norm": 0.5393111109733582, + "learning_rate": 9.134797179658854e-05, + "loss": 1.7132, + "step": 3653 + }, + { + "epoch": 0.20366757705813499, + "grad_norm": 0.5616254806518555, + "learning_rate": 9.134296463092717e-05, + "loss": 1.9128, + "step": 3654 + }, + { + "epoch": 0.20372331531129814, + "grad_norm": 0.5558052659034729, + "learning_rate": 9.133795615410801e-05, + "loss": 1.6986, + "step": 3655 + }, + { + "epoch": 0.2037790535644613, + "grad_norm": 0.5616979002952576, + "learning_rate": 9.13329463662899e-05, + "loss": 1.9381, + "step": 3656 + }, + { + "epoch": 0.20383479181762443, + "grad_norm": 0.5200750827789307, + "learning_rate": 9.132793526763171e-05, + "loss": 1.6176, + "step": 3657 + }, + { + "epoch": 0.20389053007078758, + "grad_norm": 0.5086760520935059, + "learning_rate": 9.132292285829237e-05, + "loss": 1.5035, + "step": 3658 + }, + { + "epoch": 0.20394626832395074, + "grad_norm": 0.5122929215431213, + "learning_rate": 9.131790913843086e-05, + "loss": 1.6288, + "step": 3659 + }, + { + "epoch": 0.20400200657711387, + "grad_norm": 0.5770255923271179, + "learning_rate": 9.131289410820616e-05, + "loss": 1.71, + "step": 3660 + }, + { + "epoch": 0.20405774483027703, + "grad_norm": 0.5811052322387695, + "learning_rate": 9.130787776777734e-05, + "loss": 1.9395, + "step": 3661 + }, + { + "epoch": 0.20411348308344016, + "grad_norm": 0.5475841164588928, + "learning_rate": 9.130286011730347e-05, + "loss": 1.8358, + "step": 3662 + }, + { + "epoch": 0.2041692213366033, + "grad_norm": 0.5167744755744934, + "learning_rate": 9.129784115694369e-05, + "loss": 1.602, + "step": 3663 + }, + { + "epoch": 0.20422495958976647, + "grad_norm": 0.5313771963119507, + "learning_rate": 9.129282088685718e-05, + "loss": 1.7868, + "step": 3664 + }, + { + "epoch": 0.2042806978429296, + "grad_norm": 0.5149242877960205, + "learning_rate": 9.128779930720313e-05, + "loss": 1.6943, + "step": 3665 + }, + { + "epoch": 0.20433643609609276, + "grad_norm": 0.5548785924911499, + "learning_rate": 9.128277641814082e-05, + "loss": 1.9083, + "step": 3666 + }, + { + "epoch": 0.20439217434925588, + "grad_norm": 0.5865716338157654, + "learning_rate": 9.127775221982954e-05, + "loss": 1.9183, + "step": 3667 + }, + { + "epoch": 0.20444791260241904, + "grad_norm": 0.5036227703094482, + "learning_rate": 9.127272671242861e-05, + "loss": 1.6097, + "step": 3668 + }, + { + "epoch": 0.2045036508555822, + "grad_norm": 0.5178596377372742, + "learning_rate": 9.126769989609745e-05, + "loss": 1.7121, + "step": 3669 + }, + { + "epoch": 0.20455938910874533, + "grad_norm": 0.585189938545227, + "learning_rate": 9.126267177099543e-05, + "loss": 1.8327, + "step": 3670 + }, + { + "epoch": 0.20461512736190848, + "grad_norm": 0.5853554606437683, + "learning_rate": 9.125764233728206e-05, + "loss": 1.9047, + "step": 3671 + }, + { + "epoch": 0.2046708656150716, + "grad_norm": 0.5730652213096619, + "learning_rate": 9.125261159511682e-05, + "loss": 1.8311, + "step": 3672 + }, + { + "epoch": 0.20472660386823477, + "grad_norm": 0.5045105814933777, + "learning_rate": 9.124757954465925e-05, + "loss": 1.5241, + "step": 3673 + }, + { + "epoch": 0.20478234212139793, + "grad_norm": 0.5725773572921753, + "learning_rate": 9.124254618606897e-05, + "loss": 1.6949, + "step": 3674 + }, + { + "epoch": 0.20483808037456105, + "grad_norm": 0.5756915211677551, + "learning_rate": 9.123751151950557e-05, + "loss": 1.8553, + "step": 3675 + }, + { + "epoch": 0.2048938186277242, + "grad_norm": 0.5354653000831604, + "learning_rate": 9.123247554512873e-05, + "loss": 1.7906, + "step": 3676 + }, + { + "epoch": 0.20494955688088734, + "grad_norm": 0.5941489934921265, + "learning_rate": 9.122743826309819e-05, + "loss": 1.7721, + "step": 3677 + }, + { + "epoch": 0.2050052951340505, + "grad_norm": 0.5832119584083557, + "learning_rate": 9.122239967357366e-05, + "loss": 1.9673, + "step": 3678 + }, + { + "epoch": 0.20506103338721365, + "grad_norm": 0.6178232431411743, + "learning_rate": 9.121735977671495e-05, + "loss": 2.0516, + "step": 3679 + }, + { + "epoch": 0.20511677164037678, + "grad_norm": 0.5315244197845459, + "learning_rate": 9.121231857268191e-05, + "loss": 1.5958, + "step": 3680 + }, + { + "epoch": 0.20517250989353994, + "grad_norm": 0.5662999153137207, + "learning_rate": 9.120727606163442e-05, + "loss": 1.6989, + "step": 3681 + }, + { + "epoch": 0.2052282481467031, + "grad_norm": 0.49450522661209106, + "learning_rate": 9.120223224373238e-05, + "loss": 1.4492, + "step": 3682 + }, + { + "epoch": 0.20528398639986623, + "grad_norm": 0.572935163974762, + "learning_rate": 9.119718711913575e-05, + "loss": 1.6674, + "step": 3683 + }, + { + "epoch": 0.20533972465302938, + "grad_norm": 0.5418963432312012, + "learning_rate": 9.119214068800456e-05, + "loss": 1.6326, + "step": 3684 + }, + { + "epoch": 0.2053954629061925, + "grad_norm": 0.5970882773399353, + "learning_rate": 9.118709295049883e-05, + "loss": 1.9069, + "step": 3685 + }, + { + "epoch": 0.20545120115935567, + "grad_norm": 0.5530537962913513, + "learning_rate": 9.118204390677863e-05, + "loss": 1.6096, + "step": 3686 + }, + { + "epoch": 0.20550693941251882, + "grad_norm": 0.5641506314277649, + "learning_rate": 9.117699355700412e-05, + "loss": 1.7118, + "step": 3687 + }, + { + "epoch": 0.20556267766568195, + "grad_norm": 0.6086058616638184, + "learning_rate": 9.117194190133545e-05, + "loss": 1.713, + "step": 3688 + }, + { + "epoch": 0.2056184159188451, + "grad_norm": 0.577290952205658, + "learning_rate": 9.116688893993284e-05, + "loss": 1.8858, + "step": 3689 + }, + { + "epoch": 0.20567415417200824, + "grad_norm": 0.5066075325012207, + "learning_rate": 9.116183467295651e-05, + "loss": 1.5922, + "step": 3690 + }, + { + "epoch": 0.2057298924251714, + "grad_norm": 0.5287824273109436, + "learning_rate": 9.115677910056681e-05, + "loss": 1.4441, + "step": 3691 + }, + { + "epoch": 0.20578563067833455, + "grad_norm": 0.62456214427948, + "learning_rate": 9.115172222292401e-05, + "loss": 1.9545, + "step": 3692 + }, + { + "epoch": 0.20584136893149768, + "grad_norm": 0.5801160335540771, + "learning_rate": 9.114666404018853e-05, + "loss": 2.0095, + "step": 3693 + }, + { + "epoch": 0.20589710718466084, + "grad_norm": 0.5162177085876465, + "learning_rate": 9.114160455252074e-05, + "loss": 1.7295, + "step": 3694 + }, + { + "epoch": 0.20595284543782397, + "grad_norm": 0.5912075042724609, + "learning_rate": 9.113654376008115e-05, + "loss": 1.787, + "step": 3695 + }, + { + "epoch": 0.20600858369098712, + "grad_norm": 0.5578693747520447, + "learning_rate": 9.113148166303023e-05, + "loss": 1.6167, + "step": 3696 + }, + { + "epoch": 0.20606432194415028, + "grad_norm": 0.5576518177986145, + "learning_rate": 9.112641826152853e-05, + "loss": 1.7855, + "step": 3697 + }, + { + "epoch": 0.2061200601973134, + "grad_norm": 0.5475178360939026, + "learning_rate": 9.11213535557366e-05, + "loss": 1.7013, + "step": 3698 + }, + { + "epoch": 0.20617579845047657, + "grad_norm": 0.5434138178825378, + "learning_rate": 9.111628754581512e-05, + "loss": 1.7804, + "step": 3699 + }, + { + "epoch": 0.2062315367036397, + "grad_norm": 0.5596892237663269, + "learning_rate": 9.111122023192471e-05, + "loss": 1.8347, + "step": 3700 + }, + { + "epoch": 0.20628727495680285, + "grad_norm": 0.5505380034446716, + "learning_rate": 9.110615161422609e-05, + "loss": 1.878, + "step": 3701 + }, + { + "epoch": 0.206343013209966, + "grad_norm": 0.6178278923034668, + "learning_rate": 9.110108169288001e-05, + "loss": 1.7626, + "step": 3702 + }, + { + "epoch": 0.20639875146312914, + "grad_norm": 0.5460211038589478, + "learning_rate": 9.109601046804726e-05, + "loss": 1.8064, + "step": 3703 + }, + { + "epoch": 0.2064544897162923, + "grad_norm": 0.5765804052352905, + "learning_rate": 9.109093793988865e-05, + "loss": 1.8228, + "step": 3704 + }, + { + "epoch": 0.20651022796945545, + "grad_norm": 0.5335574746131897, + "learning_rate": 9.108586410856508e-05, + "loss": 1.8011, + "step": 3705 + }, + { + "epoch": 0.20656596622261858, + "grad_norm": 0.5536273717880249, + "learning_rate": 9.108078897423743e-05, + "loss": 1.8751, + "step": 3706 + }, + { + "epoch": 0.20662170447578174, + "grad_norm": 0.5405413508415222, + "learning_rate": 9.107571253706668e-05, + "loss": 1.8607, + "step": 3707 + }, + { + "epoch": 0.20667744272894487, + "grad_norm": 0.5240110158920288, + "learning_rate": 9.107063479721383e-05, + "loss": 1.4375, + "step": 3708 + }, + { + "epoch": 0.20673318098210802, + "grad_norm": 0.4756803512573242, + "learning_rate": 9.106555575483988e-05, + "loss": 1.3254, + "step": 3709 + }, + { + "epoch": 0.20678891923527118, + "grad_norm": 0.5738046765327454, + "learning_rate": 9.106047541010593e-05, + "loss": 1.776, + "step": 3710 + }, + { + "epoch": 0.2068446574884343, + "grad_norm": 0.5442799925804138, + "learning_rate": 9.105539376317312e-05, + "loss": 1.7099, + "step": 3711 + }, + { + "epoch": 0.20690039574159746, + "grad_norm": 0.5695345401763916, + "learning_rate": 9.105031081420259e-05, + "loss": 1.6337, + "step": 3712 + }, + { + "epoch": 0.2069561339947606, + "grad_norm": 0.4725694954395294, + "learning_rate": 9.104522656335553e-05, + "loss": 1.4172, + "step": 3713 + }, + { + "epoch": 0.20701187224792375, + "grad_norm": 0.5137088894844055, + "learning_rate": 9.10401410107932e-05, + "loss": 1.6826, + "step": 3714 + }, + { + "epoch": 0.2070676105010869, + "grad_norm": 0.5813738703727722, + "learning_rate": 9.103505415667686e-05, + "loss": 1.9881, + "step": 3715 + }, + { + "epoch": 0.20712334875425004, + "grad_norm": 0.5776458382606506, + "learning_rate": 9.102996600116786e-05, + "loss": 1.8194, + "step": 3716 + }, + { + "epoch": 0.2071790870074132, + "grad_norm": 0.6059629917144775, + "learning_rate": 9.102487654442758e-05, + "loss": 1.9822, + "step": 3717 + }, + { + "epoch": 0.20723482526057632, + "grad_norm": 0.5408186912536621, + "learning_rate": 9.101978578661738e-05, + "loss": 1.8422, + "step": 3718 + }, + { + "epoch": 0.20729056351373948, + "grad_norm": 0.5199152231216431, + "learning_rate": 9.101469372789874e-05, + "loss": 1.6269, + "step": 3719 + }, + { + "epoch": 0.20734630176690264, + "grad_norm": 0.4990878105163574, + "learning_rate": 9.100960036843317e-05, + "loss": 1.6431, + "step": 3720 + }, + { + "epoch": 0.20740204002006576, + "grad_norm": 0.539283812046051, + "learning_rate": 9.100450570838216e-05, + "loss": 1.6332, + "step": 3721 + }, + { + "epoch": 0.20745777827322892, + "grad_norm": 0.4963357150554657, + "learning_rate": 9.09994097479073e-05, + "loss": 1.4083, + "step": 3722 + }, + { + "epoch": 0.20751351652639205, + "grad_norm": 0.5257975459098816, + "learning_rate": 9.099431248717022e-05, + "loss": 1.673, + "step": 3723 + }, + { + "epoch": 0.2075692547795552, + "grad_norm": 0.5869825482368469, + "learning_rate": 9.098921392633255e-05, + "loss": 1.8618, + "step": 3724 + }, + { + "epoch": 0.20762499303271836, + "grad_norm": 0.5818216800689697, + "learning_rate": 9.0984114065556e-05, + "loss": 1.761, + "step": 3725 + }, + { + "epoch": 0.2076807312858815, + "grad_norm": 0.5281986594200134, + "learning_rate": 9.097901290500231e-05, + "loss": 1.5652, + "step": 3726 + }, + { + "epoch": 0.20773646953904465, + "grad_norm": 0.5425719618797302, + "learning_rate": 9.097391044483325e-05, + "loss": 1.6899, + "step": 3727 + }, + { + "epoch": 0.2077922077922078, + "grad_norm": 0.5924318432807922, + "learning_rate": 9.096880668521066e-05, + "loss": 2.0674, + "step": 3728 + }, + { + "epoch": 0.20784794604537093, + "grad_norm": 0.5444379448890686, + "learning_rate": 9.096370162629637e-05, + "loss": 1.8427, + "step": 3729 + }, + { + "epoch": 0.2079036842985341, + "grad_norm": 0.5292965769767761, + "learning_rate": 9.09585952682523e-05, + "loss": 1.6487, + "step": 3730 + }, + { + "epoch": 0.20795942255169722, + "grad_norm": 0.5337923765182495, + "learning_rate": 9.09534876112404e-05, + "loss": 1.7153, + "step": 3731 + }, + { + "epoch": 0.20801516080486038, + "grad_norm": 0.5366414785385132, + "learning_rate": 9.094837865542265e-05, + "loss": 1.7336, + "step": 3732 + }, + { + "epoch": 0.20807089905802353, + "grad_norm": 0.5158184766769409, + "learning_rate": 9.094326840096106e-05, + "loss": 1.4747, + "step": 3733 + }, + { + "epoch": 0.20812663731118666, + "grad_norm": 0.5793300867080688, + "learning_rate": 9.093815684801772e-05, + "loss": 1.67, + "step": 3734 + }, + { + "epoch": 0.20818237556434982, + "grad_norm": 0.57293701171875, + "learning_rate": 9.093304399675474e-05, + "loss": 1.8801, + "step": 3735 + }, + { + "epoch": 0.20823811381751295, + "grad_norm": 0.514213502407074, + "learning_rate": 9.092792984733425e-05, + "loss": 1.5878, + "step": 3736 + }, + { + "epoch": 0.2082938520706761, + "grad_norm": 0.5890586376190186, + "learning_rate": 9.092281439991846e-05, + "loss": 1.9247, + "step": 3737 + }, + { + "epoch": 0.20834959032383926, + "grad_norm": 0.5602766275405884, + "learning_rate": 9.091769765466959e-05, + "loss": 1.7421, + "step": 3738 + }, + { + "epoch": 0.2084053285770024, + "grad_norm": 0.586161196231842, + "learning_rate": 9.091257961174991e-05, + "loss": 2.0567, + "step": 3739 + }, + { + "epoch": 0.20846106683016555, + "grad_norm": 0.5134695768356323, + "learning_rate": 9.090746027132175e-05, + "loss": 1.6464, + "step": 3740 + }, + { + "epoch": 0.20851680508332868, + "grad_norm": 0.5447134375572205, + "learning_rate": 9.090233963354746e-05, + "loss": 1.8313, + "step": 3741 + }, + { + "epoch": 0.20857254333649183, + "grad_norm": 0.5118534564971924, + "learning_rate": 9.089721769858943e-05, + "loss": 1.629, + "step": 3742 + }, + { + "epoch": 0.208628281589655, + "grad_norm": 0.5482544898986816, + "learning_rate": 9.08920944666101e-05, + "loss": 1.6353, + "step": 3743 + }, + { + "epoch": 0.20868401984281812, + "grad_norm": 0.542334258556366, + "learning_rate": 9.088696993777194e-05, + "loss": 1.6882, + "step": 3744 + }, + { + "epoch": 0.20873975809598128, + "grad_norm": 0.527746319770813, + "learning_rate": 9.08818441122375e-05, + "loss": 1.5986, + "step": 3745 + }, + { + "epoch": 0.2087954963491444, + "grad_norm": 0.5480045080184937, + "learning_rate": 9.08767169901693e-05, + "loss": 1.6445, + "step": 3746 + }, + { + "epoch": 0.20885123460230756, + "grad_norm": 0.5573908686637878, + "learning_rate": 9.087158857172999e-05, + "loss": 1.851, + "step": 3747 + }, + { + "epoch": 0.20890697285547072, + "grad_norm": 0.5698862671852112, + "learning_rate": 9.086645885708218e-05, + "loss": 1.6359, + "step": 3748 + }, + { + "epoch": 0.20896271110863385, + "grad_norm": 0.557510256767273, + "learning_rate": 9.086132784638857e-05, + "loss": 1.7563, + "step": 3749 + }, + { + "epoch": 0.209018449361797, + "grad_norm": 0.5576832890510559, + "learning_rate": 9.085619553981187e-05, + "loss": 1.8104, + "step": 3750 + }, + { + "epoch": 0.20907418761496016, + "grad_norm": 0.5342584848403931, + "learning_rate": 9.085106193751485e-05, + "loss": 1.4561, + "step": 3751 + }, + { + "epoch": 0.2091299258681233, + "grad_norm": 0.5547382235527039, + "learning_rate": 9.084592703966033e-05, + "loss": 1.6986, + "step": 3752 + }, + { + "epoch": 0.20918566412128645, + "grad_norm": 0.5614180564880371, + "learning_rate": 9.084079084641115e-05, + "loss": 1.7837, + "step": 3753 + }, + { + "epoch": 0.20924140237444958, + "grad_norm": 0.5065221786499023, + "learning_rate": 9.083565335793021e-05, + "loss": 1.7262, + "step": 3754 + }, + { + "epoch": 0.20929714062761273, + "grad_norm": 0.5504621863365173, + "learning_rate": 9.083051457438043e-05, + "loss": 1.7848, + "step": 3755 + }, + { + "epoch": 0.2093528788807759, + "grad_norm": 0.5882393717765808, + "learning_rate": 9.082537449592479e-05, + "loss": 2.0356, + "step": 3756 + }, + { + "epoch": 0.20940861713393902, + "grad_norm": 0.6157543063163757, + "learning_rate": 9.08202331227263e-05, + "loss": 1.9959, + "step": 3757 + }, + { + "epoch": 0.20946435538710217, + "grad_norm": 0.5493510961532593, + "learning_rate": 9.0815090454948e-05, + "loss": 1.7899, + "step": 3758 + }, + { + "epoch": 0.2095200936402653, + "grad_norm": 0.5107924938201904, + "learning_rate": 9.0809946492753e-05, + "loss": 1.4062, + "step": 3759 + }, + { + "epoch": 0.20957583189342846, + "grad_norm": 0.5571010112762451, + "learning_rate": 9.080480123630444e-05, + "loss": 1.6807, + "step": 3760 + }, + { + "epoch": 0.20963157014659162, + "grad_norm": 0.5510861277580261, + "learning_rate": 9.07996546857655e-05, + "loss": 1.9714, + "step": 3761 + }, + { + "epoch": 0.20968730839975475, + "grad_norm": 0.531609296798706, + "learning_rate": 9.07945068412994e-05, + "loss": 1.7811, + "step": 3762 + }, + { + "epoch": 0.2097430466529179, + "grad_norm": 0.5203907489776611, + "learning_rate": 9.078935770306938e-05, + "loss": 1.7003, + "step": 3763 + }, + { + "epoch": 0.20979878490608103, + "grad_norm": 0.5677714347839355, + "learning_rate": 9.078420727123874e-05, + "loss": 2.0188, + "step": 3764 + }, + { + "epoch": 0.2098545231592442, + "grad_norm": 0.5568066239356995, + "learning_rate": 9.077905554597086e-05, + "loss": 1.7745, + "step": 3765 + }, + { + "epoch": 0.20991026141240735, + "grad_norm": 0.564201831817627, + "learning_rate": 9.077390252742907e-05, + "loss": 1.7723, + "step": 3766 + }, + { + "epoch": 0.20996599966557047, + "grad_norm": 0.569828450679779, + "learning_rate": 9.076874821577683e-05, + "loss": 1.7731, + "step": 3767 + }, + { + "epoch": 0.21002173791873363, + "grad_norm": 0.5601812601089478, + "learning_rate": 9.07635926111776e-05, + "loss": 1.6495, + "step": 3768 + }, + { + "epoch": 0.21007747617189676, + "grad_norm": 0.6098621487617493, + "learning_rate": 9.075843571379488e-05, + "loss": 1.9732, + "step": 3769 + }, + { + "epoch": 0.21013321442505992, + "grad_norm": 0.5688888430595398, + "learning_rate": 9.075327752379221e-05, + "loss": 1.8381, + "step": 3770 + }, + { + "epoch": 0.21018895267822307, + "grad_norm": 0.5635893940925598, + "learning_rate": 9.074811804133318e-05, + "loss": 1.7662, + "step": 3771 + }, + { + "epoch": 0.2102446909313862, + "grad_norm": 0.5132915377616882, + "learning_rate": 9.074295726658144e-05, + "loss": 1.6434, + "step": 3772 + }, + { + "epoch": 0.21030042918454936, + "grad_norm": 0.5504310727119446, + "learning_rate": 9.073779519970065e-05, + "loss": 1.7831, + "step": 3773 + }, + { + "epoch": 0.21035616743771252, + "grad_norm": 0.5861356258392334, + "learning_rate": 9.07326318408545e-05, + "loss": 1.9085, + "step": 3774 + }, + { + "epoch": 0.21041190569087564, + "grad_norm": 0.5746229887008667, + "learning_rate": 9.072746719020676e-05, + "loss": 1.851, + "step": 3775 + }, + { + "epoch": 0.2104676439440388, + "grad_norm": 0.5618278980255127, + "learning_rate": 9.072230124792121e-05, + "loss": 1.9007, + "step": 3776 + }, + { + "epoch": 0.21052338219720193, + "grad_norm": 0.5574671030044556, + "learning_rate": 9.07171340141617e-05, + "loss": 1.7664, + "step": 3777 + }, + { + "epoch": 0.2105791204503651, + "grad_norm": 0.5418394207954407, + "learning_rate": 9.071196548909208e-05, + "loss": 1.5912, + "step": 3778 + }, + { + "epoch": 0.21063485870352824, + "grad_norm": 0.5579066872596741, + "learning_rate": 9.070679567287631e-05, + "loss": 1.8595, + "step": 3779 + }, + { + "epoch": 0.21069059695669137, + "grad_norm": 0.5038254261016846, + "learning_rate": 9.07016245656783e-05, + "loss": 1.5864, + "step": 3780 + }, + { + "epoch": 0.21074633520985453, + "grad_norm": 0.5985908508300781, + "learning_rate": 9.069645216766208e-05, + "loss": 1.8166, + "step": 3781 + }, + { + "epoch": 0.21080207346301766, + "grad_norm": 0.5343535542488098, + "learning_rate": 9.069127847899166e-05, + "loss": 1.7482, + "step": 3782 + }, + { + "epoch": 0.21085781171618082, + "grad_norm": 0.513039231300354, + "learning_rate": 9.068610349983113e-05, + "loss": 1.7065, + "step": 3783 + }, + { + "epoch": 0.21091354996934397, + "grad_norm": 0.5761904716491699, + "learning_rate": 9.068092723034462e-05, + "loss": 1.7781, + "step": 3784 + }, + { + "epoch": 0.2109692882225071, + "grad_norm": 0.5832051634788513, + "learning_rate": 9.067574967069628e-05, + "loss": 1.7871, + "step": 3785 + }, + { + "epoch": 0.21102502647567026, + "grad_norm": 0.9756948947906494, + "learning_rate": 9.067057082105032e-05, + "loss": 1.9512, + "step": 3786 + }, + { + "epoch": 0.2110807647288334, + "grad_norm": 0.5692908763885498, + "learning_rate": 9.066539068157098e-05, + "loss": 1.4585, + "step": 3787 + }, + { + "epoch": 0.21113650298199654, + "grad_norm": 0.5954088568687439, + "learning_rate": 9.066020925242256e-05, + "loss": 1.9236, + "step": 3788 + }, + { + "epoch": 0.2111922412351597, + "grad_norm": 0.5660640597343445, + "learning_rate": 9.065502653376935e-05, + "loss": 1.67, + "step": 3789 + }, + { + "epoch": 0.21124797948832283, + "grad_norm": 0.5779823064804077, + "learning_rate": 9.064984252577573e-05, + "loss": 1.8769, + "step": 3790 + }, + { + "epoch": 0.21130371774148599, + "grad_norm": 0.5380722880363464, + "learning_rate": 9.064465722860611e-05, + "loss": 1.6658, + "step": 3791 + }, + { + "epoch": 0.21135945599464911, + "grad_norm": 0.5925493836402893, + "learning_rate": 9.063947064242495e-05, + "loss": 1.7003, + "step": 3792 + }, + { + "epoch": 0.21141519424781227, + "grad_norm": 0.5475820899009705, + "learning_rate": 9.063428276739671e-05, + "loss": 1.7658, + "step": 3793 + }, + { + "epoch": 0.21147093250097543, + "grad_norm": 0.5608733296394348, + "learning_rate": 9.062909360368595e-05, + "loss": 1.7443, + "step": 3794 + }, + { + "epoch": 0.21152667075413856, + "grad_norm": 0.5024067163467407, + "learning_rate": 9.062390315145723e-05, + "loss": 1.4051, + "step": 3795 + }, + { + "epoch": 0.2115824090073017, + "grad_norm": 0.5922258496284485, + "learning_rate": 9.061871141087515e-05, + "loss": 1.6788, + "step": 3796 + }, + { + "epoch": 0.21163814726046487, + "grad_norm": 0.5388537645339966, + "learning_rate": 9.061351838210434e-05, + "loss": 1.7306, + "step": 3797 + }, + { + "epoch": 0.211693885513628, + "grad_norm": 0.5710194706916809, + "learning_rate": 9.060832406530955e-05, + "loss": 1.7759, + "step": 3798 + }, + { + "epoch": 0.21174962376679116, + "grad_norm": 0.5648775696754456, + "learning_rate": 9.060312846065548e-05, + "loss": 1.8155, + "step": 3799 + }, + { + "epoch": 0.21180536201995429, + "grad_norm": 0.5653148293495178, + "learning_rate": 9.059793156830691e-05, + "loss": 1.9103, + "step": 3800 + }, + { + "epoch": 0.21186110027311744, + "grad_norm": 0.5372900366783142, + "learning_rate": 9.059273338842864e-05, + "loss": 1.6333, + "step": 3801 + }, + { + "epoch": 0.2119168385262806, + "grad_norm": 0.6031267046928406, + "learning_rate": 9.058753392118555e-05, + "loss": 1.9328, + "step": 3802 + }, + { + "epoch": 0.21197257677944373, + "grad_norm": 0.5510583519935608, + "learning_rate": 9.058233316674252e-05, + "loss": 1.6296, + "step": 3803 + }, + { + "epoch": 0.21202831503260688, + "grad_norm": 0.5591006875038147, + "learning_rate": 9.057713112526449e-05, + "loss": 1.6512, + "step": 3804 + }, + { + "epoch": 0.21208405328577, + "grad_norm": 0.5050638318061829, + "learning_rate": 9.057192779691645e-05, + "loss": 1.793, + "step": 3805 + }, + { + "epoch": 0.21213979153893317, + "grad_norm": 0.5485342144966125, + "learning_rate": 9.05667231818634e-05, + "loss": 1.8989, + "step": 3806 + }, + { + "epoch": 0.21219552979209633, + "grad_norm": 0.536729633808136, + "learning_rate": 9.056151728027042e-05, + "loss": 1.7235, + "step": 3807 + }, + { + "epoch": 0.21225126804525946, + "grad_norm": 0.5280648469924927, + "learning_rate": 9.055631009230262e-05, + "loss": 1.779, + "step": 3808 + }, + { + "epoch": 0.2123070062984226, + "grad_norm": 0.5353644490242004, + "learning_rate": 9.05511016181251e-05, + "loss": 1.9023, + "step": 3809 + }, + { + "epoch": 0.21236274455158574, + "grad_norm": 0.5420893430709839, + "learning_rate": 9.054589185790305e-05, + "loss": 1.6484, + "step": 3810 + }, + { + "epoch": 0.2124184828047489, + "grad_norm": 0.49997881054878235, + "learning_rate": 9.054068081180173e-05, + "loss": 1.6866, + "step": 3811 + }, + { + "epoch": 0.21247422105791205, + "grad_norm": 0.540344774723053, + "learning_rate": 9.05354684799864e-05, + "loss": 1.6013, + "step": 3812 + }, + { + "epoch": 0.21252995931107518, + "grad_norm": 0.5512065291404724, + "learning_rate": 9.053025486262231e-05, + "loss": 1.7741, + "step": 3813 + }, + { + "epoch": 0.21258569756423834, + "grad_norm": 0.5562829375267029, + "learning_rate": 9.052503995987488e-05, + "loss": 1.7829, + "step": 3814 + }, + { + "epoch": 0.21264143581740147, + "grad_norm": 0.528271496295929, + "learning_rate": 9.051982377190944e-05, + "loss": 1.6395, + "step": 3815 + }, + { + "epoch": 0.21269717407056463, + "grad_norm": 0.5158810019493103, + "learning_rate": 9.051460629889144e-05, + "loss": 1.4752, + "step": 3816 + }, + { + "epoch": 0.21275291232372778, + "grad_norm": 0.5320451855659485, + "learning_rate": 9.050938754098635e-05, + "loss": 1.7896, + "step": 3817 + }, + { + "epoch": 0.2128086505768909, + "grad_norm": 0.503190279006958, + "learning_rate": 9.050416749835968e-05, + "loss": 1.5488, + "step": 3818 + }, + { + "epoch": 0.21286438883005407, + "grad_norm": 0.561086893081665, + "learning_rate": 9.049894617117696e-05, + "loss": 1.9669, + "step": 3819 + }, + { + "epoch": 0.21292012708321723, + "grad_norm": 0.5414785146713257, + "learning_rate": 9.04937235596038e-05, + "loss": 1.761, + "step": 3820 + }, + { + "epoch": 0.21297586533638035, + "grad_norm": 0.5729870796203613, + "learning_rate": 9.04884996638058e-05, + "loss": 1.7399, + "step": 3821 + }, + { + "epoch": 0.2130316035895435, + "grad_norm": 0.5905429720878601, + "learning_rate": 9.048327448394868e-05, + "loss": 1.863, + "step": 3822 + }, + { + "epoch": 0.21308734184270664, + "grad_norm": 0.5414051413536072, + "learning_rate": 9.047804802019813e-05, + "loss": 1.4662, + "step": 3823 + }, + { + "epoch": 0.2131430800958698, + "grad_norm": 0.5677713751792908, + "learning_rate": 9.047282027271988e-05, + "loss": 1.7569, + "step": 3824 + }, + { + "epoch": 0.21319881834903295, + "grad_norm": 0.5208271145820618, + "learning_rate": 9.046759124167976e-05, + "loss": 1.5647, + "step": 3825 + }, + { + "epoch": 0.21325455660219608, + "grad_norm": 0.5930595397949219, + "learning_rate": 9.046236092724357e-05, + "loss": 1.8287, + "step": 3826 + }, + { + "epoch": 0.21331029485535924, + "grad_norm": 0.5405799150466919, + "learning_rate": 9.045712932957722e-05, + "loss": 1.7175, + "step": 3827 + }, + { + "epoch": 0.21336603310852237, + "grad_norm": 0.5690011382102966, + "learning_rate": 9.045189644884661e-05, + "loss": 1.9759, + "step": 3828 + }, + { + "epoch": 0.21342177136168552, + "grad_norm": 0.5739631652832031, + "learning_rate": 9.04466622852177e-05, + "loss": 1.7102, + "step": 3829 + }, + { + "epoch": 0.21347750961484868, + "grad_norm": 0.5377629399299622, + "learning_rate": 9.044142683885645e-05, + "loss": 1.647, + "step": 3830 + }, + { + "epoch": 0.2135332478680118, + "grad_norm": 0.6439347267150879, + "learning_rate": 9.043619010992897e-05, + "loss": 2.2611, + "step": 3831 + }, + { + "epoch": 0.21358898612117497, + "grad_norm": 0.527803897857666, + "learning_rate": 9.043095209860128e-05, + "loss": 1.7279, + "step": 3832 + }, + { + "epoch": 0.2136447243743381, + "grad_norm": 0.549921452999115, + "learning_rate": 9.042571280503951e-05, + "loss": 1.7293, + "step": 3833 + }, + { + "epoch": 0.21370046262750125, + "grad_norm": 0.5477808713912964, + "learning_rate": 9.042047222940985e-05, + "loss": 1.8327, + "step": 3834 + }, + { + "epoch": 0.2137562008806644, + "grad_norm": 0.6052964329719543, + "learning_rate": 9.041523037187847e-05, + "loss": 1.6961, + "step": 3835 + }, + { + "epoch": 0.21381193913382754, + "grad_norm": 0.5640259385108948, + "learning_rate": 9.04099872326116e-05, + "loss": 1.8019, + "step": 3836 + }, + { + "epoch": 0.2138676773869907, + "grad_norm": 0.5238528251647949, + "learning_rate": 9.040474281177557e-05, + "loss": 1.7182, + "step": 3837 + }, + { + "epoch": 0.21392341564015382, + "grad_norm": 0.561298668384552, + "learning_rate": 9.039949710953665e-05, + "loss": 1.903, + "step": 3838 + }, + { + "epoch": 0.21397915389331698, + "grad_norm": 0.5629448890686035, + "learning_rate": 9.039425012606125e-05, + "loss": 1.6652, + "step": 3839 + }, + { + "epoch": 0.21403489214648014, + "grad_norm": 0.5578324794769287, + "learning_rate": 9.038900186151575e-05, + "loss": 1.8062, + "step": 3840 + }, + { + "epoch": 0.21409063039964327, + "grad_norm": 0.5517327785491943, + "learning_rate": 9.038375231606659e-05, + "loss": 1.7376, + "step": 3841 + }, + { + "epoch": 0.21414636865280642, + "grad_norm": 0.5653707385063171, + "learning_rate": 9.037850148988026e-05, + "loss": 1.7724, + "step": 3842 + }, + { + "epoch": 0.21420210690596958, + "grad_norm": 0.6022188663482666, + "learning_rate": 9.037324938312327e-05, + "loss": 1.9338, + "step": 3843 + }, + { + "epoch": 0.2142578451591327, + "grad_norm": 0.5128300189971924, + "learning_rate": 9.036799599596222e-05, + "loss": 1.6685, + "step": 3844 + }, + { + "epoch": 0.21431358341229587, + "grad_norm": 0.5680099129676819, + "learning_rate": 9.036274132856368e-05, + "loss": 1.6111, + "step": 3845 + }, + { + "epoch": 0.214369321665459, + "grad_norm": 0.5332833528518677, + "learning_rate": 9.035748538109433e-05, + "loss": 1.7406, + "step": 3846 + }, + { + "epoch": 0.21442505991862215, + "grad_norm": 0.5845235586166382, + "learning_rate": 9.035222815372084e-05, + "loss": 2.0365, + "step": 3847 + }, + { + "epoch": 0.2144807981717853, + "grad_norm": 0.536208987236023, + "learning_rate": 9.034696964660996e-05, + "loss": 1.7869, + "step": 3848 + }, + { + "epoch": 0.21453653642494844, + "grad_norm": 0.6078736782073975, + "learning_rate": 9.034170985992843e-05, + "loss": 1.9884, + "step": 3849 + }, + { + "epoch": 0.2145922746781116, + "grad_norm": 0.5227762460708618, + "learning_rate": 9.033644879384307e-05, + "loss": 1.7483, + "step": 3850 + }, + { + "epoch": 0.21464801293127472, + "grad_norm": 0.555255115032196, + "learning_rate": 9.033118644852073e-05, + "loss": 1.7319, + "step": 3851 + }, + { + "epoch": 0.21470375118443788, + "grad_norm": 0.5747233033180237, + "learning_rate": 9.032592282412831e-05, + "loss": 1.806, + "step": 3852 + }, + { + "epoch": 0.21475948943760104, + "grad_norm": 0.5099679231643677, + "learning_rate": 9.032065792083271e-05, + "loss": 1.7784, + "step": 3853 + }, + { + "epoch": 0.21481522769076417, + "grad_norm": 0.583080530166626, + "learning_rate": 9.031539173880095e-05, + "loss": 1.8283, + "step": 3854 + }, + { + "epoch": 0.21487096594392732, + "grad_norm": 0.5755245089530945, + "learning_rate": 9.031012427820003e-05, + "loss": 1.8088, + "step": 3855 + }, + { + "epoch": 0.21492670419709045, + "grad_norm": 0.6300316452980042, + "learning_rate": 9.030485553919696e-05, + "loss": 2.021, + "step": 3856 + }, + { + "epoch": 0.2149824424502536, + "grad_norm": 0.48787984251976013, + "learning_rate": 9.029958552195889e-05, + "loss": 1.7416, + "step": 3857 + }, + { + "epoch": 0.21503818070341676, + "grad_norm": 0.5602289438247681, + "learning_rate": 9.029431422665292e-05, + "loss": 1.7158, + "step": 3858 + }, + { + "epoch": 0.2150939189565799, + "grad_norm": 0.6266565918922424, + "learning_rate": 9.028904165344622e-05, + "loss": 1.904, + "step": 3859 + }, + { + "epoch": 0.21514965720974305, + "grad_norm": 0.5256897211074829, + "learning_rate": 9.028376780250605e-05, + "loss": 1.5227, + "step": 3860 + }, + { + "epoch": 0.21520539546290618, + "grad_norm": 0.5775957107543945, + "learning_rate": 9.027849267399962e-05, + "loss": 1.8613, + "step": 3861 + }, + { + "epoch": 0.21526113371606934, + "grad_norm": 0.5759565830230713, + "learning_rate": 9.027321626809425e-05, + "loss": 1.7657, + "step": 3862 + }, + { + "epoch": 0.2153168719692325, + "grad_norm": 0.5388328433036804, + "learning_rate": 9.026793858495727e-05, + "loss": 1.7117, + "step": 3863 + }, + { + "epoch": 0.21537261022239562, + "grad_norm": 0.5749773383140564, + "learning_rate": 9.026265962475605e-05, + "loss": 1.781, + "step": 3864 + }, + { + "epoch": 0.21542834847555878, + "grad_norm": 0.5567165017127991, + "learning_rate": 9.025737938765801e-05, + "loss": 1.8402, + "step": 3865 + }, + { + "epoch": 0.21548408672872194, + "grad_norm": 0.5531468987464905, + "learning_rate": 9.025209787383062e-05, + "loss": 1.637, + "step": 3866 + }, + { + "epoch": 0.21553982498188506, + "grad_norm": 0.5598788261413574, + "learning_rate": 9.024681508344138e-05, + "loss": 1.7487, + "step": 3867 + }, + { + "epoch": 0.21559556323504822, + "grad_norm": 0.5670254826545715, + "learning_rate": 9.024153101665782e-05, + "loss": 1.8556, + "step": 3868 + }, + { + "epoch": 0.21565130148821135, + "grad_norm": 0.5822195410728455, + "learning_rate": 9.02362456736475e-05, + "loss": 1.8144, + "step": 3869 + }, + { + "epoch": 0.2157070397413745, + "grad_norm": 0.5438206791877747, + "learning_rate": 9.023095905457807e-05, + "loss": 1.7709, + "step": 3870 + }, + { + "epoch": 0.21576277799453766, + "grad_norm": 0.5582990646362305, + "learning_rate": 9.022567115961718e-05, + "loss": 1.7373, + "step": 3871 + }, + { + "epoch": 0.2158185162477008, + "grad_norm": 0.5481442809104919, + "learning_rate": 9.022038198893254e-05, + "loss": 1.642, + "step": 3872 + }, + { + "epoch": 0.21587425450086395, + "grad_norm": 0.5365943312644958, + "learning_rate": 9.021509154269187e-05, + "loss": 1.7393, + "step": 3873 + }, + { + "epoch": 0.21592999275402708, + "grad_norm": 0.5595213174819946, + "learning_rate": 9.0209799821063e-05, + "loss": 1.7803, + "step": 3874 + }, + { + "epoch": 0.21598573100719023, + "grad_norm": 0.5171288251876831, + "learning_rate": 9.020450682421368e-05, + "loss": 1.6007, + "step": 3875 + }, + { + "epoch": 0.2160414692603534, + "grad_norm": 0.5536861419677734, + "learning_rate": 9.019921255231183e-05, + "loss": 1.7964, + "step": 3876 + }, + { + "epoch": 0.21609720751351652, + "grad_norm": 0.5218709707260132, + "learning_rate": 9.019391700552533e-05, + "loss": 1.7572, + "step": 3877 + }, + { + "epoch": 0.21615294576667968, + "grad_norm": 0.5276560187339783, + "learning_rate": 9.018862018402214e-05, + "loss": 1.7768, + "step": 3878 + }, + { + "epoch": 0.2162086840198428, + "grad_norm": 0.509636640548706, + "learning_rate": 9.018332208797023e-05, + "loss": 1.8262, + "step": 3879 + }, + { + "epoch": 0.21626442227300596, + "grad_norm": 0.5426955819129944, + "learning_rate": 9.017802271753763e-05, + "loss": 1.7966, + "step": 3880 + }, + { + "epoch": 0.21632016052616912, + "grad_norm": 0.5915662050247192, + "learning_rate": 9.017272207289241e-05, + "loss": 1.7047, + "step": 3881 + }, + { + "epoch": 0.21637589877933225, + "grad_norm": 0.5025647282600403, + "learning_rate": 9.016742015420264e-05, + "loss": 1.6662, + "step": 3882 + }, + { + "epoch": 0.2164316370324954, + "grad_norm": 0.5097705125808716, + "learning_rate": 9.016211696163651e-05, + "loss": 1.6667, + "step": 3883 + }, + { + "epoch": 0.21648737528565853, + "grad_norm": 0.5540134310722351, + "learning_rate": 9.015681249536219e-05, + "loss": 1.7085, + "step": 3884 + }, + { + "epoch": 0.2165431135388217, + "grad_norm": 0.5509772300720215, + "learning_rate": 9.015150675554791e-05, + "loss": 1.7739, + "step": 3885 + }, + { + "epoch": 0.21659885179198485, + "grad_norm": 0.519534170627594, + "learning_rate": 9.014619974236195e-05, + "loss": 1.5412, + "step": 3886 + }, + { + "epoch": 0.21665459004514798, + "grad_norm": 0.5313923954963684, + "learning_rate": 9.014089145597259e-05, + "loss": 1.6956, + "step": 3887 + }, + { + "epoch": 0.21671032829831113, + "grad_norm": 0.5057397484779358, + "learning_rate": 9.013558189654819e-05, + "loss": 1.6772, + "step": 3888 + }, + { + "epoch": 0.2167660665514743, + "grad_norm": 0.5538941621780396, + "learning_rate": 9.013027106425713e-05, + "loss": 1.7071, + "step": 3889 + }, + { + "epoch": 0.21682180480463742, + "grad_norm": 0.5932080149650574, + "learning_rate": 9.012495895926786e-05, + "loss": 1.9886, + "step": 3890 + }, + { + "epoch": 0.21687754305780058, + "grad_norm": 0.5497404932975769, + "learning_rate": 9.011964558174884e-05, + "loss": 1.6111, + "step": 3891 + }, + { + "epoch": 0.2169332813109637, + "grad_norm": 0.5296292304992676, + "learning_rate": 9.011433093186856e-05, + "loss": 1.7192, + "step": 3892 + }, + { + "epoch": 0.21698901956412686, + "grad_norm": 0.5682234168052673, + "learning_rate": 9.01090150097956e-05, + "loss": 1.727, + "step": 3893 + }, + { + "epoch": 0.21704475781729002, + "grad_norm": 0.49014294147491455, + "learning_rate": 9.010369781569854e-05, + "loss": 1.5865, + "step": 3894 + }, + { + "epoch": 0.21710049607045315, + "grad_norm": 0.5291064381599426, + "learning_rate": 9.009837934974598e-05, + "loss": 1.6708, + "step": 3895 + }, + { + "epoch": 0.2171562343236163, + "grad_norm": 0.5380057096481323, + "learning_rate": 9.009305961210664e-05, + "loss": 1.816, + "step": 3896 + }, + { + "epoch": 0.21721197257677943, + "grad_norm": 0.5304032564163208, + "learning_rate": 9.008773860294921e-05, + "loss": 1.6085, + "step": 3897 + }, + { + "epoch": 0.2172677108299426, + "grad_norm": 0.5649582147598267, + "learning_rate": 9.008241632244243e-05, + "loss": 2.0664, + "step": 3898 + }, + { + "epoch": 0.21732344908310575, + "grad_norm": 0.5284783840179443, + "learning_rate": 9.00770927707551e-05, + "loss": 1.6078, + "step": 3899 + }, + { + "epoch": 0.21737918733626888, + "grad_norm": 0.5097172856330872, + "learning_rate": 9.007176794805606e-05, + "loss": 1.6985, + "step": 3900 + }, + { + "epoch": 0.21743492558943203, + "grad_norm": 0.5433828830718994, + "learning_rate": 9.006644185451416e-05, + "loss": 1.824, + "step": 3901 + }, + { + "epoch": 0.21749066384259516, + "grad_norm": 0.5155694484710693, + "learning_rate": 9.006111449029835e-05, + "loss": 1.674, + "step": 3902 + }, + { + "epoch": 0.21754640209575832, + "grad_norm": 0.4952467978000641, + "learning_rate": 9.005578585557754e-05, + "loss": 1.5491, + "step": 3903 + }, + { + "epoch": 0.21760214034892147, + "grad_norm": 0.5352423191070557, + "learning_rate": 9.005045595052077e-05, + "loss": 1.7583, + "step": 3904 + }, + { + "epoch": 0.2176578786020846, + "grad_norm": 0.5036554336547852, + "learning_rate": 9.004512477529702e-05, + "loss": 1.6147, + "step": 3905 + }, + { + "epoch": 0.21771361685524776, + "grad_norm": 0.5414397120475769, + "learning_rate": 9.003979233007541e-05, + "loss": 1.7576, + "step": 3906 + }, + { + "epoch": 0.2177693551084109, + "grad_norm": 0.51963871717453, + "learning_rate": 9.003445861502502e-05, + "loss": 1.7114, + "step": 3907 + }, + { + "epoch": 0.21782509336157405, + "grad_norm": 0.5667458176612854, + "learning_rate": 9.002912363031504e-05, + "loss": 1.904, + "step": 3908 + }, + { + "epoch": 0.2178808316147372, + "grad_norm": 0.5066022872924805, + "learning_rate": 9.002378737611463e-05, + "loss": 1.5851, + "step": 3909 + }, + { + "epoch": 0.21793656986790033, + "grad_norm": 0.5155694484710693, + "learning_rate": 9.001844985259303e-05, + "loss": 1.6766, + "step": 3910 + }, + { + "epoch": 0.2179923081210635, + "grad_norm": 0.5910778641700745, + "learning_rate": 9.001311105991954e-05, + "loss": 1.6309, + "step": 3911 + }, + { + "epoch": 0.21804804637422665, + "grad_norm": 0.5524371862411499, + "learning_rate": 9.000777099826345e-05, + "loss": 1.5347, + "step": 3912 + }, + { + "epoch": 0.21810378462738977, + "grad_norm": 0.5852683186531067, + "learning_rate": 9.000242966779412e-05, + "loss": 1.7077, + "step": 3913 + }, + { + "epoch": 0.21815952288055293, + "grad_norm": 0.511112630367279, + "learning_rate": 8.999708706868097e-05, + "loss": 1.4288, + "step": 3914 + }, + { + "epoch": 0.21821526113371606, + "grad_norm": 0.553582489490509, + "learning_rate": 8.999174320109343e-05, + "loss": 1.6114, + "step": 3915 + }, + { + "epoch": 0.21827099938687922, + "grad_norm": 0.5207599401473999, + "learning_rate": 8.998639806520092e-05, + "loss": 1.6002, + "step": 3916 + }, + { + "epoch": 0.21832673764004237, + "grad_norm": 0.520836591720581, + "learning_rate": 8.998105166117304e-05, + "loss": 1.7308, + "step": 3917 + }, + { + "epoch": 0.2183824758932055, + "grad_norm": 0.5346881151199341, + "learning_rate": 8.99757039891793e-05, + "loss": 1.7732, + "step": 3918 + }, + { + "epoch": 0.21843821414636866, + "grad_norm": 0.5407224893569946, + "learning_rate": 8.997035504938928e-05, + "loss": 1.6927, + "step": 3919 + }, + { + "epoch": 0.2184939523995318, + "grad_norm": 0.6079891324043274, + "learning_rate": 8.996500484197266e-05, + "loss": 1.7503, + "step": 3920 + }, + { + "epoch": 0.21854969065269494, + "grad_norm": 0.5896045565605164, + "learning_rate": 8.995965336709908e-05, + "loss": 1.8189, + "step": 3921 + }, + { + "epoch": 0.2186054289058581, + "grad_norm": 0.5681061148643494, + "learning_rate": 8.99543006249383e-05, + "loss": 1.9138, + "step": 3922 + }, + { + "epoch": 0.21866116715902123, + "grad_norm": 0.5397033095359802, + "learning_rate": 8.994894661566004e-05, + "loss": 1.6947, + "step": 3923 + }, + { + "epoch": 0.2187169054121844, + "grad_norm": 0.5442162752151489, + "learning_rate": 8.994359133943411e-05, + "loss": 1.7947, + "step": 3924 + }, + { + "epoch": 0.21877264366534752, + "grad_norm": 0.5366693735122681, + "learning_rate": 8.993823479643036e-05, + "loss": 1.8557, + "step": 3925 + }, + { + "epoch": 0.21882838191851067, + "grad_norm": 0.5018730163574219, + "learning_rate": 8.993287698681867e-05, + "loss": 1.6033, + "step": 3926 + }, + { + "epoch": 0.21888412017167383, + "grad_norm": 0.5234804749488831, + "learning_rate": 8.992751791076893e-05, + "loss": 1.6927, + "step": 3927 + }, + { + "epoch": 0.21893985842483696, + "grad_norm": 0.5351289510726929, + "learning_rate": 8.992215756845111e-05, + "loss": 1.6108, + "step": 3928 + }, + { + "epoch": 0.21899559667800012, + "grad_norm": 0.5499307513237, + "learning_rate": 8.991679596003521e-05, + "loss": 1.821, + "step": 3929 + }, + { + "epoch": 0.21905133493116324, + "grad_norm": 0.5461710691452026, + "learning_rate": 8.991143308569129e-05, + "loss": 1.6755, + "step": 3930 + }, + { + "epoch": 0.2191070731843264, + "grad_norm": 0.557220458984375, + "learning_rate": 8.990606894558942e-05, + "loss": 1.7568, + "step": 3931 + }, + { + "epoch": 0.21916281143748956, + "grad_norm": 0.5313843488693237, + "learning_rate": 8.99007035398997e-05, + "loss": 1.5701, + "step": 3932 + }, + { + "epoch": 0.2192185496906527, + "grad_norm": 0.5466028451919556, + "learning_rate": 8.98953368687923e-05, + "loss": 1.7533, + "step": 3933 + }, + { + "epoch": 0.21927428794381584, + "grad_norm": 0.5278179049491882, + "learning_rate": 8.988996893243742e-05, + "loss": 1.6604, + "step": 3934 + }, + { + "epoch": 0.219330026196979, + "grad_norm": 0.5555846095085144, + "learning_rate": 8.988459973100529e-05, + "loss": 1.9101, + "step": 3935 + }, + { + "epoch": 0.21938576445014213, + "grad_norm": 0.5475595593452454, + "learning_rate": 8.987922926466621e-05, + "loss": 1.6784, + "step": 3936 + }, + { + "epoch": 0.21944150270330529, + "grad_norm": 0.5606985092163086, + "learning_rate": 8.98738575335905e-05, + "loss": 1.8496, + "step": 3937 + }, + { + "epoch": 0.21949724095646841, + "grad_norm": 0.5272994041442871, + "learning_rate": 8.986848453794849e-05, + "loss": 1.6477, + "step": 3938 + }, + { + "epoch": 0.21955297920963157, + "grad_norm": 0.5808579325675964, + "learning_rate": 8.986311027791061e-05, + "loss": 1.9312, + "step": 3939 + }, + { + "epoch": 0.21960871746279473, + "grad_norm": 0.5892482399940491, + "learning_rate": 8.985773475364729e-05, + "loss": 1.8278, + "step": 3940 + }, + { + "epoch": 0.21966445571595786, + "grad_norm": 0.5204423069953918, + "learning_rate": 8.9852357965329e-05, + "loss": 1.5689, + "step": 3941 + }, + { + "epoch": 0.219720193969121, + "grad_norm": 0.5408873558044434, + "learning_rate": 8.984697991312629e-05, + "loss": 1.6719, + "step": 3942 + }, + { + "epoch": 0.21977593222228414, + "grad_norm": 0.4690547585487366, + "learning_rate": 8.98416005972097e-05, + "loss": 1.4167, + "step": 3943 + }, + { + "epoch": 0.2198316704754473, + "grad_norm": 0.5128321647644043, + "learning_rate": 8.98362200177498e-05, + "loss": 1.5936, + "step": 3944 + }, + { + "epoch": 0.21988740872861046, + "grad_norm": 0.5651824474334717, + "learning_rate": 8.98308381749173e-05, + "loss": 1.7715, + "step": 3945 + }, + { + "epoch": 0.21994314698177359, + "grad_norm": 0.49932271242141724, + "learning_rate": 8.982545506888282e-05, + "loss": 1.5167, + "step": 3946 + }, + { + "epoch": 0.21999888523493674, + "grad_norm": 0.5488872528076172, + "learning_rate": 8.982007069981711e-05, + "loss": 1.6694, + "step": 3947 + }, + { + "epoch": 0.22005462348809987, + "grad_norm": 0.5529676079750061, + "learning_rate": 8.981468506789093e-05, + "loss": 1.7098, + "step": 3948 + }, + { + "epoch": 0.22011036174126303, + "grad_norm": 0.555151104927063, + "learning_rate": 8.980929817327509e-05, + "loss": 1.8188, + "step": 3949 + }, + { + "epoch": 0.22016609999442618, + "grad_norm": 0.5413922667503357, + "learning_rate": 8.980391001614039e-05, + "loss": 1.6947, + "step": 3950 + }, + { + "epoch": 0.2202218382475893, + "grad_norm": 0.5880113244056702, + "learning_rate": 8.979852059665774e-05, + "loss": 1.8565, + "step": 3951 + }, + { + "epoch": 0.22027757650075247, + "grad_norm": 0.5404399037361145, + "learning_rate": 8.979312991499807e-05, + "loss": 1.6119, + "step": 3952 + }, + { + "epoch": 0.2203333147539156, + "grad_norm": 0.5193542838096619, + "learning_rate": 8.97877379713323e-05, + "loss": 1.5012, + "step": 3953 + }, + { + "epoch": 0.22038905300707876, + "grad_norm": 0.5563862323760986, + "learning_rate": 8.97823447658315e-05, + "loss": 1.7968, + "step": 3954 + }, + { + "epoch": 0.2204447912602419, + "grad_norm": 0.5796663165092468, + "learning_rate": 8.977695029866665e-05, + "loss": 1.6924, + "step": 3955 + }, + { + "epoch": 0.22050052951340504, + "grad_norm": 0.5060169100761414, + "learning_rate": 8.977155457000886e-05, + "loss": 1.6837, + "step": 3956 + }, + { + "epoch": 0.2205562677665682, + "grad_norm": 0.5254307389259338, + "learning_rate": 8.976615758002925e-05, + "loss": 1.5339, + "step": 3957 + }, + { + "epoch": 0.22061200601973135, + "grad_norm": 0.4909488260746002, + "learning_rate": 8.976075932889896e-05, + "loss": 1.406, + "step": 3958 + }, + { + "epoch": 0.22066774427289448, + "grad_norm": 0.521052896976471, + "learning_rate": 8.97553598167892e-05, + "loss": 1.6203, + "step": 3959 + }, + { + "epoch": 0.22072348252605764, + "grad_norm": 0.5382006764411926, + "learning_rate": 8.974995904387123e-05, + "loss": 1.6984, + "step": 3960 + }, + { + "epoch": 0.22077922077922077, + "grad_norm": 0.5354267954826355, + "learning_rate": 8.97445570103163e-05, + "loss": 1.7722, + "step": 3961 + }, + { + "epoch": 0.22083495903238393, + "grad_norm": 0.5725782513618469, + "learning_rate": 8.973915371629577e-05, + "loss": 1.8308, + "step": 3962 + }, + { + "epoch": 0.22089069728554708, + "grad_norm": 0.5183130502700806, + "learning_rate": 8.973374916198096e-05, + "loss": 1.6487, + "step": 3963 + }, + { + "epoch": 0.2209464355387102, + "grad_norm": 0.5026050209999084, + "learning_rate": 8.972834334754331e-05, + "loss": 1.4931, + "step": 3964 + }, + { + "epoch": 0.22100217379187337, + "grad_norm": 0.5589287281036377, + "learning_rate": 8.972293627315424e-05, + "loss": 1.9263, + "step": 3965 + }, + { + "epoch": 0.2210579120450365, + "grad_norm": 0.5776212811470032, + "learning_rate": 8.971752793898522e-05, + "loss": 1.8374, + "step": 3966 + }, + { + "epoch": 0.22111365029819965, + "grad_norm": 0.5569107532501221, + "learning_rate": 8.971211834520779e-05, + "loss": 1.7221, + "step": 3967 + }, + { + "epoch": 0.2211693885513628, + "grad_norm": 0.527186930179596, + "learning_rate": 8.970670749199351e-05, + "loss": 1.713, + "step": 3968 + }, + { + "epoch": 0.22122512680452594, + "grad_norm": 0.5234454274177551, + "learning_rate": 8.970129537951395e-05, + "loss": 1.6519, + "step": 3969 + }, + { + "epoch": 0.2212808650576891, + "grad_norm": 0.5419970154762268, + "learning_rate": 8.969588200794079e-05, + "loss": 1.5816, + "step": 3970 + }, + { + "epoch": 0.22133660331085223, + "grad_norm": 0.5328260660171509, + "learning_rate": 8.969046737744571e-05, + "loss": 1.8442, + "step": 3971 + }, + { + "epoch": 0.22139234156401538, + "grad_norm": 0.5527640581130981, + "learning_rate": 8.968505148820039e-05, + "loss": 1.5886, + "step": 3972 + }, + { + "epoch": 0.22144807981717854, + "grad_norm": 0.5386121869087219, + "learning_rate": 8.967963434037663e-05, + "loss": 1.8938, + "step": 3973 + }, + { + "epoch": 0.22150381807034167, + "grad_norm": 0.60856693983078, + "learning_rate": 8.967421593414622e-05, + "loss": 1.7739, + "step": 3974 + }, + { + "epoch": 0.22155955632350482, + "grad_norm": 0.5383316278457642, + "learning_rate": 8.966879626968099e-05, + "loss": 1.5916, + "step": 3975 + }, + { + "epoch": 0.22161529457666795, + "grad_norm": 0.5469935536384583, + "learning_rate": 8.966337534715284e-05, + "loss": 1.6879, + "step": 3976 + }, + { + "epoch": 0.2216710328298311, + "grad_norm": 0.5624483227729797, + "learning_rate": 8.965795316673366e-05, + "loss": 1.5465, + "step": 3977 + }, + { + "epoch": 0.22172677108299427, + "grad_norm": 0.571090817451477, + "learning_rate": 8.965252972859545e-05, + "loss": 1.8477, + "step": 3978 + }, + { + "epoch": 0.2217825093361574, + "grad_norm": 0.5622638463973999, + "learning_rate": 8.964710503291018e-05, + "loss": 1.7961, + "step": 3979 + }, + { + "epoch": 0.22183824758932055, + "grad_norm": 0.54639732837677, + "learning_rate": 8.964167907984988e-05, + "loss": 1.7795, + "step": 3980 + }, + { + "epoch": 0.2218939858424837, + "grad_norm": 0.5762872099876404, + "learning_rate": 8.963625186958666e-05, + "loss": 1.7824, + "step": 3981 + }, + { + "epoch": 0.22194972409564684, + "grad_norm": 0.5208929777145386, + "learning_rate": 8.963082340229263e-05, + "loss": 1.7521, + "step": 3982 + }, + { + "epoch": 0.22200546234881, + "grad_norm": 0.49496889114379883, + "learning_rate": 8.962539367813993e-05, + "loss": 1.5493, + "step": 3983 + }, + { + "epoch": 0.22206120060197312, + "grad_norm": 0.4936692714691162, + "learning_rate": 8.961996269730078e-05, + "loss": 1.5015, + "step": 3984 + }, + { + "epoch": 0.22211693885513628, + "grad_norm": 0.5555882453918457, + "learning_rate": 8.961453045994742e-05, + "loss": 1.7563, + "step": 3985 + }, + { + "epoch": 0.22217267710829944, + "grad_norm": 0.5514853596687317, + "learning_rate": 8.960909696625213e-05, + "loss": 1.6671, + "step": 3986 + }, + { + "epoch": 0.22222841536146257, + "grad_norm": 0.5259945392608643, + "learning_rate": 8.960366221638721e-05, + "loss": 1.7181, + "step": 3987 + }, + { + "epoch": 0.22228415361462572, + "grad_norm": 0.5564213395118713, + "learning_rate": 8.959822621052502e-05, + "loss": 1.8017, + "step": 3988 + }, + { + "epoch": 0.22233989186778885, + "grad_norm": 0.5879985094070435, + "learning_rate": 8.959278894883797e-05, + "loss": 1.8768, + "step": 3989 + }, + { + "epoch": 0.222395630120952, + "grad_norm": 0.5429808497428894, + "learning_rate": 8.958735043149852e-05, + "loss": 1.6246, + "step": 3990 + }, + { + "epoch": 0.22245136837411517, + "grad_norm": 0.5388792753219604, + "learning_rate": 8.958191065867912e-05, + "loss": 1.8083, + "step": 3991 + }, + { + "epoch": 0.2225071066272783, + "grad_norm": 0.5783261060714722, + "learning_rate": 8.957646963055227e-05, + "loss": 1.9074, + "step": 3992 + }, + { + "epoch": 0.22256284488044145, + "grad_norm": 0.5076984167098999, + "learning_rate": 8.957102734729057e-05, + "loss": 1.6518, + "step": 3993 + }, + { + "epoch": 0.22261858313360458, + "grad_norm": 0.6677889823913574, + "learning_rate": 8.956558380906659e-05, + "loss": 2.3105, + "step": 3994 + }, + { + "epoch": 0.22267432138676774, + "grad_norm": 0.5451659560203552, + "learning_rate": 8.956013901605299e-05, + "loss": 1.7229, + "step": 3995 + }, + { + "epoch": 0.2227300596399309, + "grad_norm": 0.5508718490600586, + "learning_rate": 8.955469296842241e-05, + "loss": 1.641, + "step": 3996 + }, + { + "epoch": 0.22278579789309402, + "grad_norm": 0.5317922234535217, + "learning_rate": 8.95492456663476e-05, + "loss": 1.6717, + "step": 3997 + }, + { + "epoch": 0.22284153614625718, + "grad_norm": 0.5446794033050537, + "learning_rate": 8.954379711000129e-05, + "loss": 1.7382, + "step": 3998 + }, + { + "epoch": 0.2228972743994203, + "grad_norm": 0.5360628962516785, + "learning_rate": 8.95383472995563e-05, + "loss": 1.7489, + "step": 3999 + }, + { + "epoch": 0.22295301265258347, + "grad_norm": 0.5646945238113403, + "learning_rate": 8.953289623518545e-05, + "loss": 1.7241, + "step": 4000 + }, + { + "epoch": 0.22300875090574662, + "grad_norm": 0.5079129338264465, + "learning_rate": 8.952744391706165e-05, + "loss": 1.6683, + "step": 4001 + }, + { + "epoch": 0.22306448915890975, + "grad_norm": 0.5274491906166077, + "learning_rate": 8.952199034535778e-05, + "loss": 1.6086, + "step": 4002 + }, + { + "epoch": 0.2231202274120729, + "grad_norm": 0.5475561618804932, + "learning_rate": 8.95165355202468e-05, + "loss": 1.9497, + "step": 4003 + }, + { + "epoch": 0.22317596566523606, + "grad_norm": 0.5520079135894775, + "learning_rate": 8.951107944190171e-05, + "loss": 1.9735, + "step": 4004 + }, + { + "epoch": 0.2232317039183992, + "grad_norm": 0.5097377300262451, + "learning_rate": 8.950562211049556e-05, + "loss": 1.5424, + "step": 4005 + }, + { + "epoch": 0.22328744217156235, + "grad_norm": 0.5405047535896301, + "learning_rate": 8.950016352620139e-05, + "loss": 1.6966, + "step": 4006 + }, + { + "epoch": 0.22334318042472548, + "grad_norm": 0.5254392027854919, + "learning_rate": 8.949470368919235e-05, + "loss": 1.6651, + "step": 4007 + }, + { + "epoch": 0.22339891867788864, + "grad_norm": 0.5582841634750366, + "learning_rate": 8.948924259964157e-05, + "loss": 1.7668, + "step": 4008 + }, + { + "epoch": 0.2234546569310518, + "grad_norm": 0.5375759601593018, + "learning_rate": 8.948378025772227e-05, + "loss": 1.7271, + "step": 4009 + }, + { + "epoch": 0.22351039518421492, + "grad_norm": 0.5370509028434753, + "learning_rate": 8.947831666360765e-05, + "loss": 1.7851, + "step": 4010 + }, + { + "epoch": 0.22356613343737808, + "grad_norm": 0.5874437093734741, + "learning_rate": 8.947285181747098e-05, + "loss": 1.8569, + "step": 4011 + }, + { + "epoch": 0.2236218716905412, + "grad_norm": 0.566886305809021, + "learning_rate": 8.946738571948562e-05, + "loss": 1.6114, + "step": 4012 + }, + { + "epoch": 0.22367760994370436, + "grad_norm": 0.5747610926628113, + "learning_rate": 8.946191836982489e-05, + "loss": 1.8552, + "step": 4013 + }, + { + "epoch": 0.22373334819686752, + "grad_norm": 0.5414125919342041, + "learning_rate": 8.945644976866219e-05, + "loss": 1.5846, + "step": 4014 + }, + { + "epoch": 0.22378908645003065, + "grad_norm": 0.5818209648132324, + "learning_rate": 8.945097991617096e-05, + "loss": 1.8305, + "step": 4015 + }, + { + "epoch": 0.2238448247031938, + "grad_norm": 0.5896833539009094, + "learning_rate": 8.944550881252465e-05, + "loss": 1.6642, + "step": 4016 + }, + { + "epoch": 0.22390056295635694, + "grad_norm": 0.5750831365585327, + "learning_rate": 8.944003645789678e-05, + "loss": 1.7286, + "step": 4017 + }, + { + "epoch": 0.2239563012095201, + "grad_norm": 0.514319896697998, + "learning_rate": 8.943456285246091e-05, + "loss": 1.6254, + "step": 4018 + }, + { + "epoch": 0.22401203946268325, + "grad_norm": 0.48393240571022034, + "learning_rate": 8.942908799639062e-05, + "loss": 1.4306, + "step": 4019 + }, + { + "epoch": 0.22406777771584638, + "grad_norm": 0.5655490756034851, + "learning_rate": 8.942361188985957e-05, + "loss": 1.8686, + "step": 4020 + }, + { + "epoch": 0.22412351596900953, + "grad_norm": 0.7101614475250244, + "learning_rate": 8.941813453304138e-05, + "loss": 1.6244, + "step": 4021 + }, + { + "epoch": 0.22417925422217266, + "grad_norm": 0.5121461153030396, + "learning_rate": 8.941265592610979e-05, + "loss": 1.5336, + "step": 4022 + }, + { + "epoch": 0.22423499247533582, + "grad_norm": 0.5167136192321777, + "learning_rate": 8.940717606923857e-05, + "loss": 1.5896, + "step": 4023 + }, + { + "epoch": 0.22429073072849898, + "grad_norm": 0.5683619379997253, + "learning_rate": 8.940169496260144e-05, + "loss": 1.8004, + "step": 4024 + }, + { + "epoch": 0.2243464689816621, + "grad_norm": 0.5303056240081787, + "learning_rate": 8.939621260637231e-05, + "loss": 1.6034, + "step": 4025 + }, + { + "epoch": 0.22440220723482526, + "grad_norm": 0.5514824986457825, + "learning_rate": 8.9390729000725e-05, + "loss": 1.7099, + "step": 4026 + }, + { + "epoch": 0.22445794548798842, + "grad_norm": 0.5117455720901489, + "learning_rate": 8.938524414583343e-05, + "loss": 1.8367, + "step": 4027 + }, + { + "epoch": 0.22451368374115155, + "grad_norm": 0.5556350946426392, + "learning_rate": 8.937975804187156e-05, + "loss": 1.6737, + "step": 4028 + }, + { + "epoch": 0.2245694219943147, + "grad_norm": 0.5511283874511719, + "learning_rate": 8.937427068901335e-05, + "loss": 1.7541, + "step": 4029 + }, + { + "epoch": 0.22462516024747783, + "grad_norm": 0.5651305317878723, + "learning_rate": 8.936878208743285e-05, + "loss": 1.7383, + "step": 4030 + }, + { + "epoch": 0.224680898500641, + "grad_norm": 0.5192481875419617, + "learning_rate": 8.93632922373041e-05, + "loss": 1.5392, + "step": 4031 + }, + { + "epoch": 0.22473663675380415, + "grad_norm": 0.5942433476448059, + "learning_rate": 8.935780113880125e-05, + "loss": 1.9703, + "step": 4032 + }, + { + "epoch": 0.22479237500696728, + "grad_norm": 0.5313376188278198, + "learning_rate": 8.93523087920984e-05, + "loss": 1.7827, + "step": 4033 + }, + { + "epoch": 0.22484811326013043, + "grad_norm": 0.5464789271354675, + "learning_rate": 8.934681519736977e-05, + "loss": 1.8036, + "step": 4034 + }, + { + "epoch": 0.22490385151329356, + "grad_norm": 0.5823439955711365, + "learning_rate": 8.934132035478955e-05, + "loss": 1.9969, + "step": 4035 + }, + { + "epoch": 0.22495958976645672, + "grad_norm": 0.5518758296966553, + "learning_rate": 8.933582426453205e-05, + "loss": 1.7836, + "step": 4036 + }, + { + "epoch": 0.22501532801961988, + "grad_norm": 0.529864490032196, + "learning_rate": 8.933032692677153e-05, + "loss": 1.8767, + "step": 4037 + }, + { + "epoch": 0.225071066272783, + "grad_norm": 0.5450250506401062, + "learning_rate": 8.932482834168237e-05, + "loss": 1.6584, + "step": 4038 + }, + { + "epoch": 0.22512680452594616, + "grad_norm": 0.5210989713668823, + "learning_rate": 8.931932850943892e-05, + "loss": 1.6707, + "step": 4039 + }, + { + "epoch": 0.2251825427791093, + "grad_norm": 0.5319432616233826, + "learning_rate": 8.931382743021562e-05, + "loss": 1.5798, + "step": 4040 + }, + { + "epoch": 0.22523828103227245, + "grad_norm": 0.502311110496521, + "learning_rate": 8.930832510418692e-05, + "loss": 1.5718, + "step": 4041 + }, + { + "epoch": 0.2252940192854356, + "grad_norm": 0.5432561635971069, + "learning_rate": 8.930282153152734e-05, + "loss": 1.7996, + "step": 4042 + }, + { + "epoch": 0.22534975753859873, + "grad_norm": 0.5339439511299133, + "learning_rate": 8.92973167124114e-05, + "loss": 1.8783, + "step": 4043 + }, + { + "epoch": 0.2254054957917619, + "grad_norm": 0.5929161310195923, + "learning_rate": 8.92918106470137e-05, + "loss": 1.9278, + "step": 4044 + }, + { + "epoch": 0.22546123404492502, + "grad_norm": 0.5356025695800781, + "learning_rate": 8.928630333550886e-05, + "loss": 1.6555, + "step": 4045 + }, + { + "epoch": 0.22551697229808818, + "grad_norm": 0.6173697113990784, + "learning_rate": 8.928079477807155e-05, + "loss": 1.6326, + "step": 4046 + }, + { + "epoch": 0.22557271055125133, + "grad_norm": 0.5391169786453247, + "learning_rate": 8.927528497487642e-05, + "loss": 1.7983, + "step": 4047 + }, + { + "epoch": 0.22562844880441446, + "grad_norm": 0.541691780090332, + "learning_rate": 8.926977392609826e-05, + "loss": 1.9013, + "step": 4048 + }, + { + "epoch": 0.22568418705757762, + "grad_norm": 0.5518167018890381, + "learning_rate": 8.926426163191182e-05, + "loss": 1.8038, + "step": 4049 + }, + { + "epoch": 0.22573992531074077, + "grad_norm": 0.5680546164512634, + "learning_rate": 8.925874809249193e-05, + "loss": 1.893, + "step": 4050 + }, + { + "epoch": 0.2257956635639039, + "grad_norm": 0.531597912311554, + "learning_rate": 8.925323330801345e-05, + "loss": 1.6987, + "step": 4051 + }, + { + "epoch": 0.22585140181706706, + "grad_norm": 0.5005265474319458, + "learning_rate": 8.924771727865126e-05, + "loss": 1.4703, + "step": 4052 + }, + { + "epoch": 0.2259071400702302, + "grad_norm": 0.4409901201725006, + "learning_rate": 8.924220000458032e-05, + "loss": 1.1188, + "step": 4053 + }, + { + "epoch": 0.22596287832339335, + "grad_norm": 0.5583540797233582, + "learning_rate": 8.92366814859756e-05, + "loss": 1.8899, + "step": 4054 + }, + { + "epoch": 0.2260186165765565, + "grad_norm": 0.5503487586975098, + "learning_rate": 8.923116172301208e-05, + "loss": 1.7006, + "step": 4055 + }, + { + "epoch": 0.22607435482971963, + "grad_norm": 0.5401930212974548, + "learning_rate": 8.922564071586487e-05, + "loss": 1.7435, + "step": 4056 + }, + { + "epoch": 0.2261300930828828, + "grad_norm": 0.5470068454742432, + "learning_rate": 8.922011846470903e-05, + "loss": 1.7926, + "step": 4057 + }, + { + "epoch": 0.22618583133604592, + "grad_norm": 0.5655896663665771, + "learning_rate": 8.921459496971971e-05, + "loss": 1.8028, + "step": 4058 + }, + { + "epoch": 0.22624156958920907, + "grad_norm": 0.520338237285614, + "learning_rate": 8.920907023107208e-05, + "loss": 1.7713, + "step": 4059 + }, + { + "epoch": 0.22629730784237223, + "grad_norm": 0.5628316402435303, + "learning_rate": 8.920354424894133e-05, + "loss": 1.8308, + "step": 4060 + }, + { + "epoch": 0.22635304609553536, + "grad_norm": 0.5436638593673706, + "learning_rate": 8.919801702350272e-05, + "loss": 1.7824, + "step": 4061 + }, + { + "epoch": 0.22640878434869852, + "grad_norm": 0.6150013208389282, + "learning_rate": 8.919248855493156e-05, + "loss": 1.6801, + "step": 4062 + }, + { + "epoch": 0.22646452260186165, + "grad_norm": 0.5413832068443298, + "learning_rate": 8.918695884340318e-05, + "loss": 1.7266, + "step": 4063 + }, + { + "epoch": 0.2265202608550248, + "grad_norm": 0.6004742980003357, + "learning_rate": 8.918142788909294e-05, + "loss": 1.9331, + "step": 4064 + }, + { + "epoch": 0.22657599910818796, + "grad_norm": 0.5428612232208252, + "learning_rate": 8.917589569217624e-05, + "loss": 1.8074, + "step": 4065 + }, + { + "epoch": 0.2266317373613511, + "grad_norm": 0.5653241276741028, + "learning_rate": 8.917036225282855e-05, + "loss": 1.8719, + "step": 4066 + }, + { + "epoch": 0.22668747561451424, + "grad_norm": 0.5411580801010132, + "learning_rate": 8.916482757122535e-05, + "loss": 1.7155, + "step": 4067 + }, + { + "epoch": 0.22674321386767737, + "grad_norm": 0.5733420252799988, + "learning_rate": 8.915929164754215e-05, + "loss": 1.8401, + "step": 4068 + }, + { + "epoch": 0.22679895212084053, + "grad_norm": 0.5870828032493591, + "learning_rate": 8.915375448195455e-05, + "loss": 1.6825, + "step": 4069 + }, + { + "epoch": 0.2268546903740037, + "grad_norm": 0.5373989939689636, + "learning_rate": 8.914821607463814e-05, + "loss": 1.6471, + "step": 4070 + }, + { + "epoch": 0.22691042862716682, + "grad_norm": 0.5650984048843384, + "learning_rate": 8.914267642576857e-05, + "loss": 2.0078, + "step": 4071 + }, + { + "epoch": 0.22696616688032997, + "grad_norm": 0.5647602677345276, + "learning_rate": 8.91371355355215e-05, + "loss": 1.8949, + "step": 4072 + }, + { + "epoch": 0.22702190513349313, + "grad_norm": 0.5225738286972046, + "learning_rate": 8.913159340407269e-05, + "loss": 1.787, + "step": 4073 + }, + { + "epoch": 0.22707764338665626, + "grad_norm": 0.4927429258823395, + "learning_rate": 8.912605003159788e-05, + "loss": 1.6022, + "step": 4074 + }, + { + "epoch": 0.22713338163981941, + "grad_norm": 0.5242977738380432, + "learning_rate": 8.912050541827291e-05, + "loss": 1.6286, + "step": 4075 + }, + { + "epoch": 0.22718911989298254, + "grad_norm": 0.5272535681724548, + "learning_rate": 8.911495956427357e-05, + "loss": 1.8091, + "step": 4076 + }, + { + "epoch": 0.2272448581461457, + "grad_norm": 0.5660970211029053, + "learning_rate": 8.910941246977577e-05, + "loss": 1.7518, + "step": 4077 + }, + { + "epoch": 0.22730059639930886, + "grad_norm": 0.5166184902191162, + "learning_rate": 8.910386413495544e-05, + "loss": 1.7051, + "step": 4078 + }, + { + "epoch": 0.227356334652472, + "grad_norm": 0.5315423607826233, + "learning_rate": 8.909831455998854e-05, + "loss": 1.5667, + "step": 4079 + }, + { + "epoch": 0.22741207290563514, + "grad_norm": 0.5121911764144897, + "learning_rate": 8.909276374505104e-05, + "loss": 1.6594, + "step": 4080 + }, + { + "epoch": 0.22746781115879827, + "grad_norm": 0.5725307464599609, + "learning_rate": 8.908721169031901e-05, + "loss": 1.7931, + "step": 4081 + }, + { + "epoch": 0.22752354941196143, + "grad_norm": 0.6129924058914185, + "learning_rate": 8.908165839596852e-05, + "loss": 2.0539, + "step": 4082 + }, + { + "epoch": 0.22757928766512459, + "grad_norm": 0.6019653677940369, + "learning_rate": 8.907610386217568e-05, + "loss": 2.1055, + "step": 4083 + }, + { + "epoch": 0.22763502591828771, + "grad_norm": 0.5589843392372131, + "learning_rate": 8.907054808911668e-05, + "loss": 1.8536, + "step": 4084 + }, + { + "epoch": 0.22769076417145087, + "grad_norm": 0.5030215382575989, + "learning_rate": 8.906499107696766e-05, + "loss": 1.5868, + "step": 4085 + }, + { + "epoch": 0.227746502424614, + "grad_norm": 0.5388656258583069, + "learning_rate": 8.90594328259049e-05, + "loss": 1.611, + "step": 4086 + }, + { + "epoch": 0.22780224067777716, + "grad_norm": 0.5835996270179749, + "learning_rate": 8.905387333610466e-05, + "loss": 1.3946, + "step": 4087 + }, + { + "epoch": 0.2278579789309403, + "grad_norm": 0.5778213739395142, + "learning_rate": 8.904831260774327e-05, + "loss": 1.9145, + "step": 4088 + }, + { + "epoch": 0.22791371718410344, + "grad_norm": 0.5685307383537292, + "learning_rate": 8.904275064099708e-05, + "loss": 1.8516, + "step": 4089 + }, + { + "epoch": 0.2279694554372666, + "grad_norm": 0.5906243324279785, + "learning_rate": 8.903718743604244e-05, + "loss": 1.7872, + "step": 4090 + }, + { + "epoch": 0.22802519369042973, + "grad_norm": 0.5142653584480286, + "learning_rate": 8.903162299305585e-05, + "loss": 1.5771, + "step": 4091 + }, + { + "epoch": 0.22808093194359289, + "grad_norm": 0.5752720832824707, + "learning_rate": 8.902605731221373e-05, + "loss": 1.7952, + "step": 4092 + }, + { + "epoch": 0.22813667019675604, + "grad_norm": 0.5666948556900024, + "learning_rate": 8.902049039369261e-05, + "loss": 1.7417, + "step": 4093 + }, + { + "epoch": 0.22819240844991917, + "grad_norm": 0.5241186618804932, + "learning_rate": 8.901492223766906e-05, + "loss": 1.6605, + "step": 4094 + }, + { + "epoch": 0.22824814670308233, + "grad_norm": 0.548561155796051, + "learning_rate": 8.900935284431961e-05, + "loss": 1.8027, + "step": 4095 + }, + { + "epoch": 0.22830388495624548, + "grad_norm": 0.5435733795166016, + "learning_rate": 8.900378221382097e-05, + "loss": 1.6941, + "step": 4096 + }, + { + "epoch": 0.2283596232094086, + "grad_norm": 0.5925113558769226, + "learning_rate": 8.899821034634974e-05, + "loss": 1.9182, + "step": 4097 + }, + { + "epoch": 0.22841536146257177, + "grad_norm": 0.5289484262466431, + "learning_rate": 8.899263724208266e-05, + "loss": 1.7512, + "step": 4098 + }, + { + "epoch": 0.2284710997157349, + "grad_norm": 0.5516422390937805, + "learning_rate": 8.898706290119647e-05, + "loss": 1.8606, + "step": 4099 + }, + { + "epoch": 0.22852683796889806, + "grad_norm": 0.5578961372375488, + "learning_rate": 8.898148732386795e-05, + "loss": 1.7136, + "step": 4100 + }, + { + "epoch": 0.2285825762220612, + "grad_norm": 0.5643925666809082, + "learning_rate": 8.897591051027394e-05, + "loss": 1.8315, + "step": 4101 + }, + { + "epoch": 0.22863831447522434, + "grad_norm": 0.4974330961704254, + "learning_rate": 8.89703324605913e-05, + "loss": 1.4505, + "step": 4102 + }, + { + "epoch": 0.2286940527283875, + "grad_norm": 0.5316607356071472, + "learning_rate": 8.896475317499691e-05, + "loss": 1.662, + "step": 4103 + }, + { + "epoch": 0.22874979098155063, + "grad_norm": 0.48880115151405334, + "learning_rate": 8.895917265366773e-05, + "loss": 1.6713, + "step": 4104 + }, + { + "epoch": 0.22880552923471378, + "grad_norm": 0.5647329092025757, + "learning_rate": 8.895359089678075e-05, + "loss": 1.6645, + "step": 4105 + }, + { + "epoch": 0.22886126748787694, + "grad_norm": 0.588045060634613, + "learning_rate": 8.894800790451298e-05, + "loss": 1.7344, + "step": 4106 + }, + { + "epoch": 0.22891700574104007, + "grad_norm": 0.5201917290687561, + "learning_rate": 8.894242367704149e-05, + "loss": 1.7137, + "step": 4107 + }, + { + "epoch": 0.22897274399420323, + "grad_norm": 0.5581889152526855, + "learning_rate": 8.893683821454335e-05, + "loss": 1.689, + "step": 4108 + }, + { + "epoch": 0.22902848224736636, + "grad_norm": 0.533208429813385, + "learning_rate": 8.893125151719574e-05, + "loss": 1.7345, + "step": 4109 + }, + { + "epoch": 0.2290842205005295, + "grad_norm": 0.5409815907478333, + "learning_rate": 8.89256635851758e-05, + "loss": 1.6921, + "step": 4110 + }, + { + "epoch": 0.22913995875369267, + "grad_norm": 0.5371890664100647, + "learning_rate": 8.892007441866076e-05, + "loss": 1.7282, + "step": 4111 + }, + { + "epoch": 0.2291956970068558, + "grad_norm": 0.5628719925880432, + "learning_rate": 8.89144840178279e-05, + "loss": 1.6771, + "step": 4112 + }, + { + "epoch": 0.22925143526001895, + "grad_norm": 0.5631751418113708, + "learning_rate": 8.89088923828545e-05, + "loss": 1.9474, + "step": 4113 + }, + { + "epoch": 0.22930717351318208, + "grad_norm": 0.5464017987251282, + "learning_rate": 8.890329951391787e-05, + "loss": 1.7969, + "step": 4114 + }, + { + "epoch": 0.22936291176634524, + "grad_norm": 0.5662708878517151, + "learning_rate": 8.88977054111954e-05, + "loss": 1.6611, + "step": 4115 + }, + { + "epoch": 0.2294186500195084, + "grad_norm": 0.607832670211792, + "learning_rate": 8.889211007486451e-05, + "loss": 1.6558, + "step": 4116 + }, + { + "epoch": 0.22947438827267153, + "grad_norm": 0.5683878064155579, + "learning_rate": 8.888651350510265e-05, + "loss": 1.712, + "step": 4117 + }, + { + "epoch": 0.22953012652583468, + "grad_norm": 0.5762284398078918, + "learning_rate": 8.888091570208729e-05, + "loss": 1.8012, + "step": 4118 + }, + { + "epoch": 0.22958586477899784, + "grad_norm": 0.5987650752067566, + "learning_rate": 8.887531666599598e-05, + "loss": 2.0303, + "step": 4119 + }, + { + "epoch": 0.22964160303216097, + "grad_norm": 0.5141220092773438, + "learning_rate": 8.88697163970063e-05, + "loss": 1.6133, + "step": 4120 + }, + { + "epoch": 0.22969734128532412, + "grad_norm": 0.5571396946907043, + "learning_rate": 8.886411489529583e-05, + "loss": 1.6117, + "step": 4121 + }, + { + "epoch": 0.22975307953848725, + "grad_norm": 0.5717421770095825, + "learning_rate": 8.885851216104222e-05, + "loss": 1.8159, + "step": 4122 + }, + { + "epoch": 0.2298088177916504, + "grad_norm": 0.5314472913742065, + "learning_rate": 8.885290819442319e-05, + "loss": 1.8198, + "step": 4123 + }, + { + "epoch": 0.22986455604481357, + "grad_norm": 0.5760038495063782, + "learning_rate": 8.884730299561642e-05, + "loss": 1.8839, + "step": 4124 + }, + { + "epoch": 0.2299202942979767, + "grad_norm": 0.5187524557113647, + "learning_rate": 8.88416965647997e-05, + "loss": 1.5981, + "step": 4125 + }, + { + "epoch": 0.22997603255113985, + "grad_norm": 0.5539306998252869, + "learning_rate": 8.883608890215083e-05, + "loss": 1.5802, + "step": 4126 + }, + { + "epoch": 0.23003177080430298, + "grad_norm": 0.5440337061882019, + "learning_rate": 8.883048000784764e-05, + "loss": 1.7884, + "step": 4127 + }, + { + "epoch": 0.23008750905746614, + "grad_norm": 0.6190919876098633, + "learning_rate": 8.882486988206803e-05, + "loss": 1.8968, + "step": 4128 + }, + { + "epoch": 0.2301432473106293, + "grad_norm": 0.5481730103492737, + "learning_rate": 8.881925852498991e-05, + "loss": 1.5026, + "step": 4129 + }, + { + "epoch": 0.23019898556379242, + "grad_norm": 0.5920677185058594, + "learning_rate": 8.881364593679124e-05, + "loss": 2.02, + "step": 4130 + }, + { + "epoch": 0.23025472381695558, + "grad_norm": 0.580629289150238, + "learning_rate": 8.880803211765003e-05, + "loss": 1.8447, + "step": 4131 + }, + { + "epoch": 0.2303104620701187, + "grad_norm": 0.5800060033798218, + "learning_rate": 8.880241706774431e-05, + "loss": 1.8952, + "step": 4132 + }, + { + "epoch": 0.23036620032328187, + "grad_norm": 0.5633650422096252, + "learning_rate": 8.879680078725214e-05, + "loss": 1.79, + "step": 4133 + }, + { + "epoch": 0.23042193857644502, + "grad_norm": 0.503121554851532, + "learning_rate": 8.879118327635165e-05, + "loss": 1.31, + "step": 4134 + }, + { + "epoch": 0.23047767682960815, + "grad_norm": 0.5033895373344421, + "learning_rate": 8.8785564535221e-05, + "loss": 1.388, + "step": 4135 + }, + { + "epoch": 0.2305334150827713, + "grad_norm": 0.5460697412490845, + "learning_rate": 8.877994456403838e-05, + "loss": 1.8455, + "step": 4136 + }, + { + "epoch": 0.23058915333593444, + "grad_norm": 0.5005971193313599, + "learning_rate": 8.877432336298201e-05, + "loss": 1.513, + "step": 4137 + }, + { + "epoch": 0.2306448915890976, + "grad_norm": 0.5267760753631592, + "learning_rate": 8.876870093223019e-05, + "loss": 1.6449, + "step": 4138 + }, + { + "epoch": 0.23070062984226075, + "grad_norm": 0.5714914202690125, + "learning_rate": 8.87630772719612e-05, + "loss": 2.0891, + "step": 4139 + }, + { + "epoch": 0.23075636809542388, + "grad_norm": 0.5814961194992065, + "learning_rate": 8.875745238235341e-05, + "loss": 1.6314, + "step": 4140 + }, + { + "epoch": 0.23081210634858704, + "grad_norm": 0.5237919092178345, + "learning_rate": 8.87518262635852e-05, + "loss": 1.5437, + "step": 4141 + }, + { + "epoch": 0.2308678446017502, + "grad_norm": 0.5390162467956543, + "learning_rate": 8.8746198915835e-05, + "loss": 1.8075, + "step": 4142 + }, + { + "epoch": 0.23092358285491332, + "grad_norm": 0.5281346440315247, + "learning_rate": 8.874057033928128e-05, + "loss": 1.7196, + "step": 4143 + }, + { + "epoch": 0.23097932110807648, + "grad_norm": 0.5769410133361816, + "learning_rate": 8.873494053410254e-05, + "loss": 1.7623, + "step": 4144 + }, + { + "epoch": 0.2310350593612396, + "grad_norm": 0.5773770213127136, + "learning_rate": 8.872930950047733e-05, + "loss": 1.6683, + "step": 4145 + }, + { + "epoch": 0.23109079761440277, + "grad_norm": 0.5479909777641296, + "learning_rate": 8.872367723858422e-05, + "loss": 1.8277, + "step": 4146 + }, + { + "epoch": 0.23114653586756592, + "grad_norm": 0.5558038949966431, + "learning_rate": 8.871804374860185e-05, + "loss": 1.9413, + "step": 4147 + }, + { + "epoch": 0.23120227412072905, + "grad_norm": 0.5571532249450684, + "learning_rate": 8.871240903070888e-05, + "loss": 1.7471, + "step": 4148 + }, + { + "epoch": 0.2312580123738922, + "grad_norm": 0.63371741771698, + "learning_rate": 8.870677308508399e-05, + "loss": 2.0195, + "step": 4149 + }, + { + "epoch": 0.23131375062705534, + "grad_norm": 0.5300304889678955, + "learning_rate": 8.870113591190595e-05, + "loss": 1.5686, + "step": 4150 + }, + { + "epoch": 0.2313694888802185, + "grad_norm": 0.6006084680557251, + "learning_rate": 8.869549751135352e-05, + "loss": 1.7178, + "step": 4151 + }, + { + "epoch": 0.23142522713338165, + "grad_norm": 0.5930531024932861, + "learning_rate": 8.868985788360551e-05, + "loss": 1.6998, + "step": 4152 + }, + { + "epoch": 0.23148096538654478, + "grad_norm": 0.5450523495674133, + "learning_rate": 8.868421702884077e-05, + "loss": 1.5045, + "step": 4153 + }, + { + "epoch": 0.23153670363970794, + "grad_norm": 0.519468367099762, + "learning_rate": 8.867857494723824e-05, + "loss": 1.6035, + "step": 4154 + }, + { + "epoch": 0.23159244189287106, + "grad_norm": 0.5567930936813354, + "learning_rate": 8.867293163897681e-05, + "loss": 1.8108, + "step": 4155 + }, + { + "epoch": 0.23164818014603422, + "grad_norm": 0.5138580799102783, + "learning_rate": 8.866728710423547e-05, + "loss": 1.5952, + "step": 4156 + }, + { + "epoch": 0.23170391839919738, + "grad_norm": 0.5398350954055786, + "learning_rate": 8.866164134319323e-05, + "loss": 1.8621, + "step": 4157 + }, + { + "epoch": 0.2317596566523605, + "grad_norm": 0.5708958506584167, + "learning_rate": 8.865599435602915e-05, + "loss": 1.5408, + "step": 4158 + }, + { + "epoch": 0.23181539490552366, + "grad_norm": 0.62980717420578, + "learning_rate": 8.86503461429223e-05, + "loss": 2.2779, + "step": 4159 + }, + { + "epoch": 0.2318711331586868, + "grad_norm": 0.5782346129417419, + "learning_rate": 8.86446967040518e-05, + "loss": 1.6574, + "step": 4160 + }, + { + "epoch": 0.23192687141184995, + "grad_norm": 0.5406448841094971, + "learning_rate": 8.863904603959686e-05, + "loss": 1.6591, + "step": 4161 + }, + { + "epoch": 0.2319826096650131, + "grad_norm": 0.533285915851593, + "learning_rate": 8.863339414973664e-05, + "loss": 1.7869, + "step": 4162 + }, + { + "epoch": 0.23203834791817624, + "grad_norm": 0.5359031558036804, + "learning_rate": 8.862774103465042e-05, + "loss": 1.8322, + "step": 4163 + }, + { + "epoch": 0.2320940861713394, + "grad_norm": 0.5305787920951843, + "learning_rate": 8.862208669451748e-05, + "loss": 1.5869, + "step": 4164 + }, + { + "epoch": 0.23214982442450255, + "grad_norm": 0.5482218861579895, + "learning_rate": 8.861643112951712e-05, + "loss": 1.9482, + "step": 4165 + }, + { + "epoch": 0.23220556267766568, + "grad_norm": 0.5915202498435974, + "learning_rate": 8.86107743398287e-05, + "loss": 1.9292, + "step": 4166 + }, + { + "epoch": 0.23226130093082883, + "grad_norm": 0.5175179243087769, + "learning_rate": 8.860511632563166e-05, + "loss": 1.5677, + "step": 4167 + }, + { + "epoch": 0.23231703918399196, + "grad_norm": 0.5698404908180237, + "learning_rate": 8.85994570871054e-05, + "loss": 1.8537, + "step": 4168 + }, + { + "epoch": 0.23237277743715512, + "grad_norm": 0.5476871728897095, + "learning_rate": 8.859379662442941e-05, + "loss": 1.7031, + "step": 4169 + }, + { + "epoch": 0.23242851569031828, + "grad_norm": 0.5611745119094849, + "learning_rate": 8.858813493778322e-05, + "loss": 1.9365, + "step": 4170 + }, + { + "epoch": 0.2324842539434814, + "grad_norm": 0.5908852219581604, + "learning_rate": 8.858247202734637e-05, + "loss": 1.7084, + "step": 4171 + }, + { + "epoch": 0.23253999219664456, + "grad_norm": 0.5042490363121033, + "learning_rate": 8.857680789329844e-05, + "loss": 1.6353, + "step": 4172 + }, + { + "epoch": 0.2325957304498077, + "grad_norm": 0.535675048828125, + "learning_rate": 8.85711425358191e-05, + "loss": 1.523, + "step": 4173 + }, + { + "epoch": 0.23265146870297085, + "grad_norm": 0.5372074246406555, + "learning_rate": 8.8565475955088e-05, + "loss": 1.38, + "step": 4174 + }, + { + "epoch": 0.232707206956134, + "grad_norm": 0.554507315158844, + "learning_rate": 8.855980815128486e-05, + "loss": 1.5261, + "step": 4175 + }, + { + "epoch": 0.23276294520929713, + "grad_norm": 0.5450062155723572, + "learning_rate": 8.85541391245894e-05, + "loss": 1.7725, + "step": 4176 + }, + { + "epoch": 0.2328186834624603, + "grad_norm": 0.5121927857398987, + "learning_rate": 8.854846887518147e-05, + "loss": 1.4857, + "step": 4177 + }, + { + "epoch": 0.23287442171562342, + "grad_norm": 0.5284276008605957, + "learning_rate": 8.854279740324086e-05, + "loss": 1.8393, + "step": 4178 + }, + { + "epoch": 0.23293015996878658, + "grad_norm": 0.5464218258857727, + "learning_rate": 8.85371247089474e-05, + "loss": 1.6455, + "step": 4179 + }, + { + "epoch": 0.23298589822194973, + "grad_norm": 0.515756368637085, + "learning_rate": 8.853145079248106e-05, + "loss": 1.6739, + "step": 4180 + }, + { + "epoch": 0.23304163647511286, + "grad_norm": 0.5167007446289062, + "learning_rate": 8.852577565402175e-05, + "loss": 1.6312, + "step": 4181 + }, + { + "epoch": 0.23309737472827602, + "grad_norm": 0.5863040089607239, + "learning_rate": 8.852009929374945e-05, + "loss": 1.8519, + "step": 4182 + }, + { + "epoch": 0.23315311298143915, + "grad_norm": 0.5061371922492981, + "learning_rate": 8.851442171184418e-05, + "loss": 1.6562, + "step": 4183 + }, + { + "epoch": 0.2332088512346023, + "grad_norm": 0.5501469969749451, + "learning_rate": 8.850874290848603e-05, + "loss": 1.7597, + "step": 4184 + }, + { + "epoch": 0.23326458948776546, + "grad_norm": 0.5034657716751099, + "learning_rate": 8.850306288385505e-05, + "loss": 1.7217, + "step": 4185 + }, + { + "epoch": 0.2333203277409286, + "grad_norm": 0.5563570857048035, + "learning_rate": 8.849738163813143e-05, + "loss": 1.7315, + "step": 4186 + }, + { + "epoch": 0.23337606599409175, + "grad_norm": 0.5461295247077942, + "learning_rate": 8.849169917149531e-05, + "loss": 1.7419, + "step": 4187 + }, + { + "epoch": 0.2334318042472549, + "grad_norm": 0.5286983251571655, + "learning_rate": 8.848601548412691e-05, + "loss": 1.8088, + "step": 4188 + }, + { + "epoch": 0.23348754250041803, + "grad_norm": 0.5308994650840759, + "learning_rate": 8.848033057620651e-05, + "loss": 1.6436, + "step": 4189 + }, + { + "epoch": 0.2335432807535812, + "grad_norm": 0.5667473673820496, + "learning_rate": 8.847464444791435e-05, + "loss": 1.6382, + "step": 4190 + }, + { + "epoch": 0.23359901900674432, + "grad_norm": 0.5432576537132263, + "learning_rate": 8.846895709943082e-05, + "loss": 1.8993, + "step": 4191 + }, + { + "epoch": 0.23365475725990748, + "grad_norm": 0.6006546020507812, + "learning_rate": 8.846326853093623e-05, + "loss": 1.7459, + "step": 4192 + }, + { + "epoch": 0.23371049551307063, + "grad_norm": 0.5638506412506104, + "learning_rate": 8.845757874261104e-05, + "loss": 1.618, + "step": 4193 + }, + { + "epoch": 0.23376623376623376, + "grad_norm": 0.5464212894439697, + "learning_rate": 8.845188773463566e-05, + "loss": 1.6731, + "step": 4194 + }, + { + "epoch": 0.23382197201939692, + "grad_norm": 0.5781604051589966, + "learning_rate": 8.84461955071906e-05, + "loss": 1.8368, + "step": 4195 + }, + { + "epoch": 0.23387771027256005, + "grad_norm": 0.5308955907821655, + "learning_rate": 8.844050206045637e-05, + "loss": 1.805, + "step": 4196 + }, + { + "epoch": 0.2339334485257232, + "grad_norm": 0.5154343843460083, + "learning_rate": 8.843480739461356e-05, + "loss": 1.4806, + "step": 4197 + }, + { + "epoch": 0.23398918677888636, + "grad_norm": 0.5477091073989868, + "learning_rate": 8.842911150984272e-05, + "loss": 1.7506, + "step": 4198 + }, + { + "epoch": 0.2340449250320495, + "grad_norm": 0.5401119589805603, + "learning_rate": 8.842341440632454e-05, + "loss": 1.8434, + "step": 4199 + }, + { + "epoch": 0.23410066328521265, + "grad_norm": 0.5683028697967529, + "learning_rate": 8.841771608423967e-05, + "loss": 1.6289, + "step": 4200 + }, + { + "epoch": 0.23415640153837577, + "grad_norm": 0.5980592370033264, + "learning_rate": 8.841201654376883e-05, + "loss": 1.782, + "step": 4201 + }, + { + "epoch": 0.23421213979153893, + "grad_norm": 0.5431941151618958, + "learning_rate": 8.84063157850928e-05, + "loss": 1.7904, + "step": 4202 + }, + { + "epoch": 0.2342678780447021, + "grad_norm": 0.6389545202255249, + "learning_rate": 8.840061380839235e-05, + "loss": 1.5506, + "step": 4203 + }, + { + "epoch": 0.23432361629786522, + "grad_norm": 0.5594901442527771, + "learning_rate": 8.839491061384832e-05, + "loss": 1.7914, + "step": 4204 + }, + { + "epoch": 0.23437935455102837, + "grad_norm": 0.5211427211761475, + "learning_rate": 8.838920620164157e-05, + "loss": 1.5682, + "step": 4205 + }, + { + "epoch": 0.23443509280419153, + "grad_norm": 0.5244554281234741, + "learning_rate": 8.838350057195304e-05, + "loss": 1.6598, + "step": 4206 + }, + { + "epoch": 0.23449083105735466, + "grad_norm": 0.5590394735336304, + "learning_rate": 8.837779372496367e-05, + "loss": 1.6682, + "step": 4207 + }, + { + "epoch": 0.23454656931051782, + "grad_norm": 0.5445299744606018, + "learning_rate": 8.837208566085441e-05, + "loss": 1.8047, + "step": 4208 + }, + { + "epoch": 0.23460230756368095, + "grad_norm": 0.5209025144577026, + "learning_rate": 8.836637637980636e-05, + "loss": 1.6225, + "step": 4209 + }, + { + "epoch": 0.2346580458168441, + "grad_norm": 0.5524556040763855, + "learning_rate": 8.836066588200051e-05, + "loss": 1.7139, + "step": 4210 + }, + { + "epoch": 0.23471378407000726, + "grad_norm": 0.5641475915908813, + "learning_rate": 8.8354954167618e-05, + "loss": 1.7928, + "step": 4211 + }, + { + "epoch": 0.2347695223231704, + "grad_norm": 0.57920241355896, + "learning_rate": 8.834924123683998e-05, + "loss": 1.7035, + "step": 4212 + }, + { + "epoch": 0.23482526057633354, + "grad_norm": 0.5374131202697754, + "learning_rate": 8.834352708984762e-05, + "loss": 1.6887, + "step": 4213 + }, + { + "epoch": 0.23488099882949667, + "grad_norm": 0.5739797353744507, + "learning_rate": 8.833781172682214e-05, + "loss": 1.7476, + "step": 4214 + }, + { + "epoch": 0.23493673708265983, + "grad_norm": 0.5460266470909119, + "learning_rate": 8.833209514794479e-05, + "loss": 1.569, + "step": 4215 + }, + { + "epoch": 0.234992475335823, + "grad_norm": 0.5776944160461426, + "learning_rate": 8.832637735339688e-05, + "loss": 1.6762, + "step": 4216 + }, + { + "epoch": 0.23504821358898612, + "grad_norm": 0.593519926071167, + "learning_rate": 8.832065834335973e-05, + "loss": 1.6699, + "step": 4217 + }, + { + "epoch": 0.23510395184214927, + "grad_norm": 0.5690516233444214, + "learning_rate": 8.831493811801472e-05, + "loss": 1.8292, + "step": 4218 + }, + { + "epoch": 0.2351596900953124, + "grad_norm": 0.5436887741088867, + "learning_rate": 8.830921667754328e-05, + "loss": 1.6958, + "step": 4219 + }, + { + "epoch": 0.23521542834847556, + "grad_norm": 0.54433673620224, + "learning_rate": 8.830349402212683e-05, + "loss": 1.7544, + "step": 4220 + }, + { + "epoch": 0.23527116660163871, + "grad_norm": 0.5694179534912109, + "learning_rate": 8.82977701519469e-05, + "loss": 1.676, + "step": 4221 + }, + { + "epoch": 0.23532690485480184, + "grad_norm": 0.5544805526733398, + "learning_rate": 8.829204506718496e-05, + "loss": 1.7395, + "step": 4222 + }, + { + "epoch": 0.235382643107965, + "grad_norm": 0.586121141910553, + "learning_rate": 8.828631876802263e-05, + "loss": 1.8418, + "step": 4223 + }, + { + "epoch": 0.23543838136112813, + "grad_norm": 0.5376494526863098, + "learning_rate": 8.828059125464148e-05, + "loss": 1.5981, + "step": 4224 + }, + { + "epoch": 0.2354941196142913, + "grad_norm": 0.5764834880828857, + "learning_rate": 8.827486252722316e-05, + "loss": 1.9862, + "step": 4225 + }, + { + "epoch": 0.23554985786745444, + "grad_norm": 0.6348791122436523, + "learning_rate": 8.826913258594937e-05, + "loss": 1.9931, + "step": 4226 + }, + { + "epoch": 0.23560559612061757, + "grad_norm": 0.5736886262893677, + "learning_rate": 8.826340143100182e-05, + "loss": 1.8651, + "step": 4227 + }, + { + "epoch": 0.23566133437378073, + "grad_norm": 0.5940203070640564, + "learning_rate": 8.825766906256228e-05, + "loss": 1.6837, + "step": 4228 + }, + { + "epoch": 0.23571707262694389, + "grad_norm": 0.5036525726318359, + "learning_rate": 8.825193548081252e-05, + "loss": 1.4064, + "step": 4229 + }, + { + "epoch": 0.23577281088010701, + "grad_norm": 0.5096335411071777, + "learning_rate": 8.824620068593439e-05, + "loss": 1.7501, + "step": 4230 + }, + { + "epoch": 0.23582854913327017, + "grad_norm": 0.5474448204040527, + "learning_rate": 8.824046467810976e-05, + "loss": 1.7263, + "step": 4231 + }, + { + "epoch": 0.2358842873864333, + "grad_norm": 0.5364823937416077, + "learning_rate": 8.823472745752055e-05, + "loss": 1.7752, + "step": 4232 + }, + { + "epoch": 0.23594002563959646, + "grad_norm": 0.5261183977127075, + "learning_rate": 8.822898902434873e-05, + "loss": 1.7809, + "step": 4233 + }, + { + "epoch": 0.2359957638927596, + "grad_norm": 0.5040357708930969, + "learning_rate": 8.822324937877624e-05, + "loss": 1.5033, + "step": 4234 + }, + { + "epoch": 0.23605150214592274, + "grad_norm": 0.534517228603363, + "learning_rate": 8.821750852098515e-05, + "loss": 1.735, + "step": 4235 + }, + { + "epoch": 0.2361072403990859, + "grad_norm": 0.5336146950721741, + "learning_rate": 8.821176645115752e-05, + "loss": 1.8211, + "step": 4236 + }, + { + "epoch": 0.23616297865224903, + "grad_norm": 0.5576988458633423, + "learning_rate": 8.820602316947544e-05, + "loss": 1.6501, + "step": 4237 + }, + { + "epoch": 0.23621871690541218, + "grad_norm": 0.6140468716621399, + "learning_rate": 8.820027867612107e-05, + "loss": 1.9297, + "step": 4238 + }, + { + "epoch": 0.23627445515857534, + "grad_norm": 0.6102777123451233, + "learning_rate": 8.819453297127657e-05, + "loss": 1.7881, + "step": 4239 + }, + { + "epoch": 0.23633019341173847, + "grad_norm": 0.5396928787231445, + "learning_rate": 8.818878605512418e-05, + "loss": 1.7629, + "step": 4240 + }, + { + "epoch": 0.23638593166490163, + "grad_norm": 0.5476622581481934, + "learning_rate": 8.818303792784615e-05, + "loss": 1.939, + "step": 4241 + }, + { + "epoch": 0.23644166991806476, + "grad_norm": 0.5725302696228027, + "learning_rate": 8.817728858962478e-05, + "loss": 1.7058, + "step": 4242 + }, + { + "epoch": 0.2364974081712279, + "grad_norm": 0.5522921085357666, + "learning_rate": 8.817153804064241e-05, + "loss": 1.6284, + "step": 4243 + }, + { + "epoch": 0.23655314642439107, + "grad_norm": 0.5554071664810181, + "learning_rate": 8.81657862810814e-05, + "loss": 1.7203, + "step": 4244 + }, + { + "epoch": 0.2366088846775542, + "grad_norm": 0.6202051639556885, + "learning_rate": 8.816003331112419e-05, + "loss": 2.0629, + "step": 4245 + }, + { + "epoch": 0.23666462293071736, + "grad_norm": 0.5647374391555786, + "learning_rate": 8.81542791309532e-05, + "loss": 1.7256, + "step": 4246 + }, + { + "epoch": 0.23672036118388048, + "grad_norm": 0.5261071920394897, + "learning_rate": 8.814852374075093e-05, + "loss": 1.6476, + "step": 4247 + }, + { + "epoch": 0.23677609943704364, + "grad_norm": 0.5051866173744202, + "learning_rate": 8.81427671406999e-05, + "loss": 1.57, + "step": 4248 + }, + { + "epoch": 0.2368318376902068, + "grad_norm": 0.5553388595581055, + "learning_rate": 8.81370093309827e-05, + "loss": 1.497, + "step": 4249 + }, + { + "epoch": 0.23688757594336993, + "grad_norm": 0.6159742474555969, + "learning_rate": 8.813125031178191e-05, + "loss": 1.9324, + "step": 4250 + }, + { + "epoch": 0.23694331419653308, + "grad_norm": 0.5158507227897644, + "learning_rate": 8.812549008328017e-05, + "loss": 1.7841, + "step": 4251 + }, + { + "epoch": 0.23699905244969624, + "grad_norm": 0.5447210073471069, + "learning_rate": 8.811972864566018e-05, + "loss": 1.6966, + "step": 4252 + }, + { + "epoch": 0.23705479070285937, + "grad_norm": 0.5115744471549988, + "learning_rate": 8.811396599910467e-05, + "loss": 1.6449, + "step": 4253 + }, + { + "epoch": 0.23711052895602253, + "grad_norm": 0.5265628695487976, + "learning_rate": 8.810820214379636e-05, + "loss": 1.8372, + "step": 4254 + }, + { + "epoch": 0.23716626720918565, + "grad_norm": 0.5546838045120239, + "learning_rate": 8.810243707991805e-05, + "loss": 1.9996, + "step": 4255 + }, + { + "epoch": 0.2372220054623488, + "grad_norm": 0.5540011525154114, + "learning_rate": 8.809667080765262e-05, + "loss": 1.7619, + "step": 4256 + }, + { + "epoch": 0.23727774371551197, + "grad_norm": 0.5753396153450012, + "learning_rate": 8.809090332718288e-05, + "loss": 1.8621, + "step": 4257 + }, + { + "epoch": 0.2373334819686751, + "grad_norm": 0.5528965592384338, + "learning_rate": 8.808513463869179e-05, + "loss": 1.6625, + "step": 4258 + }, + { + "epoch": 0.23738922022183825, + "grad_norm": 0.5542230010032654, + "learning_rate": 8.80793647423623e-05, + "loss": 1.5929, + "step": 4259 + }, + { + "epoch": 0.23744495847500138, + "grad_norm": 0.6071727275848389, + "learning_rate": 8.807359363837734e-05, + "loss": 1.7551, + "step": 4260 + }, + { + "epoch": 0.23750069672816454, + "grad_norm": 0.5722533464431763, + "learning_rate": 8.806782132691999e-05, + "loss": 1.9474, + "step": 4261 + }, + { + "epoch": 0.2375564349813277, + "grad_norm": 0.5362473130226135, + "learning_rate": 8.806204780817331e-05, + "loss": 1.6914, + "step": 4262 + }, + { + "epoch": 0.23761217323449083, + "grad_norm": 0.519892156124115, + "learning_rate": 8.805627308232036e-05, + "loss": 1.4148, + "step": 4263 + }, + { + "epoch": 0.23766791148765398, + "grad_norm": 0.5315799713134766, + "learning_rate": 8.805049714954434e-05, + "loss": 1.8304, + "step": 4264 + }, + { + "epoch": 0.2377236497408171, + "grad_norm": 0.5093747973442078, + "learning_rate": 8.804472001002839e-05, + "loss": 1.4575, + "step": 4265 + }, + { + "epoch": 0.23777938799398027, + "grad_norm": 0.5335510969161987, + "learning_rate": 8.803894166395574e-05, + "loss": 1.515, + "step": 4266 + }, + { + "epoch": 0.23783512624714342, + "grad_norm": 0.5546256303787231, + "learning_rate": 8.803316211150964e-05, + "loss": 1.657, + "step": 4267 + }, + { + "epoch": 0.23789086450030655, + "grad_norm": 0.5256768465042114, + "learning_rate": 8.802738135287338e-05, + "loss": 1.5228, + "step": 4268 + }, + { + "epoch": 0.2379466027534697, + "grad_norm": 0.5291659235954285, + "learning_rate": 8.802159938823031e-05, + "loss": 1.5667, + "step": 4269 + }, + { + "epoch": 0.23800234100663284, + "grad_norm": 0.5859813094139099, + "learning_rate": 8.801581621776379e-05, + "loss": 1.9385, + "step": 4270 + }, + { + "epoch": 0.238058079259796, + "grad_norm": 0.6084904670715332, + "learning_rate": 8.801003184165722e-05, + "loss": 1.9139, + "step": 4271 + }, + { + "epoch": 0.23811381751295915, + "grad_norm": 0.5245258212089539, + "learning_rate": 8.800424626009407e-05, + "loss": 1.8107, + "step": 4272 + }, + { + "epoch": 0.23816955576612228, + "grad_norm": 0.5182399749755859, + "learning_rate": 8.799845947325777e-05, + "loss": 1.72, + "step": 4273 + }, + { + "epoch": 0.23822529401928544, + "grad_norm": 0.5252156257629395, + "learning_rate": 8.799267148133192e-05, + "loss": 1.6711, + "step": 4274 + }, + { + "epoch": 0.2382810322724486, + "grad_norm": 0.49757280945777893, + "learning_rate": 8.798688228450002e-05, + "loss": 1.5716, + "step": 4275 + }, + { + "epoch": 0.23833677052561172, + "grad_norm": 0.5291200876235962, + "learning_rate": 8.798109188294572e-05, + "loss": 1.6498, + "step": 4276 + }, + { + "epoch": 0.23839250877877488, + "grad_norm": 0.5830451250076294, + "learning_rate": 8.797530027685261e-05, + "loss": 1.8761, + "step": 4277 + }, + { + "epoch": 0.238448247031938, + "grad_norm": 0.5453559756278992, + "learning_rate": 8.796950746640439e-05, + "loss": 1.6984, + "step": 4278 + }, + { + "epoch": 0.23850398528510117, + "grad_norm": 0.5068353414535522, + "learning_rate": 8.796371345178476e-05, + "loss": 1.3414, + "step": 4279 + }, + { + "epoch": 0.23855972353826432, + "grad_norm": 0.5567828416824341, + "learning_rate": 8.79579182331775e-05, + "loss": 1.716, + "step": 4280 + }, + { + "epoch": 0.23861546179142745, + "grad_norm": 0.5418634414672852, + "learning_rate": 8.795212181076638e-05, + "loss": 1.6889, + "step": 4281 + }, + { + "epoch": 0.2386712000445906, + "grad_norm": 0.5291851162910461, + "learning_rate": 8.794632418473522e-05, + "loss": 1.6941, + "step": 4282 + }, + { + "epoch": 0.23872693829775374, + "grad_norm": 0.5776856541633606, + "learning_rate": 8.794052535526792e-05, + "loss": 1.756, + "step": 4283 + }, + { + "epoch": 0.2387826765509169, + "grad_norm": 0.5982547998428345, + "learning_rate": 8.793472532254836e-05, + "loss": 1.8349, + "step": 4284 + }, + { + "epoch": 0.23883841480408005, + "grad_norm": 0.5404837727546692, + "learning_rate": 8.792892408676048e-05, + "loss": 1.6617, + "step": 4285 + }, + { + "epoch": 0.23889415305724318, + "grad_norm": 0.5049643516540527, + "learning_rate": 8.792312164808827e-05, + "loss": 1.5132, + "step": 4286 + }, + { + "epoch": 0.23894989131040634, + "grad_norm": 0.5474380254745483, + "learning_rate": 8.791731800671575e-05, + "loss": 1.7937, + "step": 4287 + }, + { + "epoch": 0.23900562956356947, + "grad_norm": 0.5853757858276367, + "learning_rate": 8.791151316282698e-05, + "loss": 1.8488, + "step": 4288 + }, + { + "epoch": 0.23906136781673262, + "grad_norm": 0.574220597743988, + "learning_rate": 8.790570711660604e-05, + "loss": 1.7211, + "step": 4289 + }, + { + "epoch": 0.23911710606989578, + "grad_norm": 0.580944836139679, + "learning_rate": 8.789989986823707e-05, + "loss": 1.6015, + "step": 4290 + }, + { + "epoch": 0.2391728443230589, + "grad_norm": 0.5716251730918884, + "learning_rate": 8.789409141790426e-05, + "loss": 1.7375, + "step": 4291 + }, + { + "epoch": 0.23922858257622207, + "grad_norm": 0.5204554200172424, + "learning_rate": 8.788828176579182e-05, + "loss": 1.7231, + "step": 4292 + }, + { + "epoch": 0.2392843208293852, + "grad_norm": 0.529961884021759, + "learning_rate": 8.788247091208397e-05, + "loss": 1.7355, + "step": 4293 + }, + { + "epoch": 0.23934005908254835, + "grad_norm": 0.5950244665145874, + "learning_rate": 8.787665885696502e-05, + "loss": 2.0786, + "step": 4294 + }, + { + "epoch": 0.2393957973357115, + "grad_norm": 0.5200558304786682, + "learning_rate": 8.78708456006193e-05, + "loss": 1.6045, + "step": 4295 + }, + { + "epoch": 0.23945153558887464, + "grad_norm": 0.5256621241569519, + "learning_rate": 8.786503114323113e-05, + "loss": 1.6679, + "step": 4296 + }, + { + "epoch": 0.2395072738420378, + "grad_norm": 0.5340785980224609, + "learning_rate": 8.785921548498494e-05, + "loss": 1.6646, + "step": 4297 + }, + { + "epoch": 0.23956301209520095, + "grad_norm": 0.5381552577018738, + "learning_rate": 8.785339862606521e-05, + "loss": 1.7888, + "step": 4298 + }, + { + "epoch": 0.23961875034836408, + "grad_norm": 0.6692368984222412, + "learning_rate": 8.784758056665634e-05, + "loss": 1.9363, + "step": 4299 + }, + { + "epoch": 0.23967448860152724, + "grad_norm": 0.5429602265357971, + "learning_rate": 8.784176130694289e-05, + "loss": 1.8477, + "step": 4300 + }, + { + "epoch": 0.23973022685469036, + "grad_norm": 0.5760909914970398, + "learning_rate": 8.783594084710941e-05, + "loss": 1.9106, + "step": 4301 + }, + { + "epoch": 0.23978596510785352, + "grad_norm": 0.5410770773887634, + "learning_rate": 8.783011918734048e-05, + "loss": 1.7685, + "step": 4302 + }, + { + "epoch": 0.23984170336101668, + "grad_norm": 0.6343144774436951, + "learning_rate": 8.782429632782073e-05, + "loss": 1.6641, + "step": 4303 + }, + { + "epoch": 0.2398974416141798, + "grad_norm": 0.5951781868934631, + "learning_rate": 8.781847226873484e-05, + "loss": 1.8908, + "step": 4304 + }, + { + "epoch": 0.23995317986734296, + "grad_norm": 0.5187268257141113, + "learning_rate": 8.78126470102675e-05, + "loss": 1.5571, + "step": 4305 + }, + { + "epoch": 0.2400089181205061, + "grad_norm": 0.5376867651939392, + "learning_rate": 8.780682055260348e-05, + "loss": 1.514, + "step": 4306 + }, + { + "epoch": 0.24006465637366925, + "grad_norm": 0.5534177422523499, + "learning_rate": 8.780099289592751e-05, + "loss": 1.581, + "step": 4307 + }, + { + "epoch": 0.2401203946268324, + "grad_norm": 0.5672261714935303, + "learning_rate": 8.779516404042446e-05, + "loss": 1.7344, + "step": 4308 + }, + { + "epoch": 0.24017613287999554, + "grad_norm": 0.5509449243545532, + "learning_rate": 8.778933398627915e-05, + "loss": 1.7162, + "step": 4309 + }, + { + "epoch": 0.2402318711331587, + "grad_norm": 0.5842772126197815, + "learning_rate": 8.778350273367653e-05, + "loss": 1.7958, + "step": 4310 + }, + { + "epoch": 0.24028760938632182, + "grad_norm": 0.50345379114151, + "learning_rate": 8.777767028280145e-05, + "loss": 1.4958, + "step": 4311 + }, + { + "epoch": 0.24034334763948498, + "grad_norm": 0.5337620377540588, + "learning_rate": 8.777183663383896e-05, + "loss": 1.6876, + "step": 4312 + }, + { + "epoch": 0.24039908589264813, + "grad_norm": 0.5183177590370178, + "learning_rate": 8.776600178697402e-05, + "loss": 1.7538, + "step": 4313 + }, + { + "epoch": 0.24045482414581126, + "grad_norm": 0.5510264039039612, + "learning_rate": 8.776016574239171e-05, + "loss": 1.7722, + "step": 4314 + }, + { + "epoch": 0.24051056239897442, + "grad_norm": 0.5638562440872192, + "learning_rate": 8.77543285002771e-05, + "loss": 1.8447, + "step": 4315 + }, + { + "epoch": 0.24056630065213755, + "grad_norm": 0.6304780840873718, + "learning_rate": 8.774849006081529e-05, + "loss": 2.111, + "step": 4316 + }, + { + "epoch": 0.2406220389053007, + "grad_norm": 0.5731822848320007, + "learning_rate": 8.774265042419148e-05, + "loss": 1.9022, + "step": 4317 + }, + { + "epoch": 0.24067777715846386, + "grad_norm": 0.5105111002922058, + "learning_rate": 8.773680959059086e-05, + "loss": 1.4723, + "step": 4318 + }, + { + "epoch": 0.240733515411627, + "grad_norm": 0.5694832801818848, + "learning_rate": 8.773096756019866e-05, + "loss": 1.8138, + "step": 4319 + }, + { + "epoch": 0.24078925366479015, + "grad_norm": 0.5039976835250854, + "learning_rate": 8.772512433320014e-05, + "loss": 1.5152, + "step": 4320 + }, + { + "epoch": 0.2408449919179533, + "grad_norm": 0.5481953024864197, + "learning_rate": 8.771927990978063e-05, + "loss": 1.7373, + "step": 4321 + }, + { + "epoch": 0.24090073017111643, + "grad_norm": 0.5046210885047913, + "learning_rate": 8.771343429012549e-05, + "loss": 1.3736, + "step": 4322 + }, + { + "epoch": 0.2409564684242796, + "grad_norm": 0.5144927501678467, + "learning_rate": 8.77075874744201e-05, + "loss": 1.7854, + "step": 4323 + }, + { + "epoch": 0.24101220667744272, + "grad_norm": 0.5863038301467896, + "learning_rate": 8.770173946284987e-05, + "loss": 1.9596, + "step": 4324 + }, + { + "epoch": 0.24106794493060588, + "grad_norm": 0.5546390414237976, + "learning_rate": 8.76958902556003e-05, + "loss": 1.6905, + "step": 4325 + }, + { + "epoch": 0.24112368318376903, + "grad_norm": 0.5615156888961792, + "learning_rate": 8.769003985285686e-05, + "loss": 1.8015, + "step": 4326 + }, + { + "epoch": 0.24117942143693216, + "grad_norm": 0.5112027525901794, + "learning_rate": 8.76841882548051e-05, + "loss": 1.7408, + "step": 4327 + }, + { + "epoch": 0.24123515969009532, + "grad_norm": 0.523891270160675, + "learning_rate": 8.767833546163062e-05, + "loss": 1.6473, + "step": 4328 + }, + { + "epoch": 0.24129089794325845, + "grad_norm": 0.5263711214065552, + "learning_rate": 8.767248147351902e-05, + "loss": 1.724, + "step": 4329 + }, + { + "epoch": 0.2413466361964216, + "grad_norm": 0.5724520683288574, + "learning_rate": 8.766662629065594e-05, + "loss": 1.7469, + "step": 4330 + }, + { + "epoch": 0.24140237444958476, + "grad_norm": 0.5471790432929993, + "learning_rate": 8.76607699132271e-05, + "loss": 1.7262, + "step": 4331 + }, + { + "epoch": 0.2414581127027479, + "grad_norm": 0.6246349215507507, + "learning_rate": 8.76549123414182e-05, + "loss": 2.0055, + "step": 4332 + }, + { + "epoch": 0.24151385095591105, + "grad_norm": 0.5492396354675293, + "learning_rate": 8.764905357541505e-05, + "loss": 1.7602, + "step": 4333 + }, + { + "epoch": 0.24156958920907418, + "grad_norm": 0.5340796113014221, + "learning_rate": 8.76431936154034e-05, + "loss": 1.7666, + "step": 4334 + }, + { + "epoch": 0.24162532746223733, + "grad_norm": 0.5311720967292786, + "learning_rate": 8.763733246156913e-05, + "loss": 1.5892, + "step": 4335 + }, + { + "epoch": 0.2416810657154005, + "grad_norm": 0.5926803350448608, + "learning_rate": 8.763147011409811e-05, + "loss": 1.9398, + "step": 4336 + }, + { + "epoch": 0.24173680396856362, + "grad_norm": 0.5204175710678101, + "learning_rate": 8.762560657317629e-05, + "loss": 1.4019, + "step": 4337 + }, + { + "epoch": 0.24179254222172678, + "grad_norm": 0.5834428071975708, + "learning_rate": 8.761974183898957e-05, + "loss": 1.7063, + "step": 4338 + }, + { + "epoch": 0.2418482804748899, + "grad_norm": 0.5776971578598022, + "learning_rate": 8.7613875911724e-05, + "loss": 1.7957, + "step": 4339 + }, + { + "epoch": 0.24190401872805306, + "grad_norm": 0.5160627365112305, + "learning_rate": 8.760800879156558e-05, + "loss": 1.5686, + "step": 4340 + }, + { + "epoch": 0.24195975698121622, + "grad_norm": 0.5783469676971436, + "learning_rate": 8.760214047870039e-05, + "loss": 2.0046, + "step": 4341 + }, + { + "epoch": 0.24201549523437935, + "grad_norm": 0.5625891089439392, + "learning_rate": 8.759627097331455e-05, + "loss": 1.6902, + "step": 4342 + }, + { + "epoch": 0.2420712334875425, + "grad_norm": 0.5326409935951233, + "learning_rate": 8.759040027559418e-05, + "loss": 1.9046, + "step": 4343 + }, + { + "epoch": 0.24212697174070566, + "grad_norm": 0.5869771838188171, + "learning_rate": 8.758452838572551e-05, + "loss": 1.7593, + "step": 4344 + }, + { + "epoch": 0.2421827099938688, + "grad_norm": 0.6008633971214294, + "learning_rate": 8.75786553038947e-05, + "loss": 2.0021, + "step": 4345 + }, + { + "epoch": 0.24223844824703195, + "grad_norm": 0.48187822103500366, + "learning_rate": 8.757278103028806e-05, + "loss": 1.1718, + "step": 4346 + }, + { + "epoch": 0.24229418650019507, + "grad_norm": 0.5490634441375732, + "learning_rate": 8.756690556509186e-05, + "loss": 1.6083, + "step": 4347 + }, + { + "epoch": 0.24234992475335823, + "grad_norm": 0.5408362746238708, + "learning_rate": 8.756102890849246e-05, + "loss": 1.6982, + "step": 4348 + }, + { + "epoch": 0.2424056630065214, + "grad_norm": 0.5706157684326172, + "learning_rate": 8.75551510606762e-05, + "loss": 1.8505, + "step": 4349 + }, + { + "epoch": 0.24246140125968452, + "grad_norm": 0.573557436466217, + "learning_rate": 8.754927202182953e-05, + "loss": 1.8455, + "step": 4350 + }, + { + "epoch": 0.24251713951284767, + "grad_norm": 0.5338667035102844, + "learning_rate": 8.754339179213886e-05, + "loss": 1.5964, + "step": 4351 + }, + { + "epoch": 0.2425728777660108, + "grad_norm": 0.5258156061172485, + "learning_rate": 8.753751037179073e-05, + "loss": 1.7428, + "step": 4352 + }, + { + "epoch": 0.24262861601917396, + "grad_norm": 0.5279545783996582, + "learning_rate": 8.75316277609716e-05, + "loss": 1.7279, + "step": 4353 + }, + { + "epoch": 0.24268435427233712, + "grad_norm": 0.5074349045753479, + "learning_rate": 8.752574395986806e-05, + "loss": 1.508, + "step": 4354 + }, + { + "epoch": 0.24274009252550025, + "grad_norm": 0.5738914012908936, + "learning_rate": 8.751985896866672e-05, + "loss": 1.8978, + "step": 4355 + }, + { + "epoch": 0.2427958307786634, + "grad_norm": 0.6244510412216187, + "learning_rate": 8.75139727875542e-05, + "loss": 1.94, + "step": 4356 + }, + { + "epoch": 0.24285156903182653, + "grad_norm": 0.5642906427383423, + "learning_rate": 8.75080854167172e-05, + "loss": 1.9239, + "step": 4357 + }, + { + "epoch": 0.2429073072849897, + "grad_norm": 0.5614916086196899, + "learning_rate": 8.75021968563424e-05, + "loss": 1.6965, + "step": 4358 + }, + { + "epoch": 0.24296304553815284, + "grad_norm": 0.5800240635871887, + "learning_rate": 8.749630710661658e-05, + "loss": 1.7979, + "step": 4359 + }, + { + "epoch": 0.24301878379131597, + "grad_norm": 0.5255259871482849, + "learning_rate": 8.749041616772653e-05, + "loss": 1.753, + "step": 4360 + }, + { + "epoch": 0.24307452204447913, + "grad_norm": 0.5205305814743042, + "learning_rate": 8.748452403985905e-05, + "loss": 1.518, + "step": 4361 + }, + { + "epoch": 0.24313026029764226, + "grad_norm": 0.5705804824829102, + "learning_rate": 8.747863072320102e-05, + "loss": 1.7267, + "step": 4362 + }, + { + "epoch": 0.24318599855080542, + "grad_norm": 0.5209723114967346, + "learning_rate": 8.747273621793932e-05, + "loss": 1.6697, + "step": 4363 + }, + { + "epoch": 0.24324173680396857, + "grad_norm": 0.5164801478385925, + "learning_rate": 8.746684052426093e-05, + "loss": 1.628, + "step": 4364 + }, + { + "epoch": 0.2432974750571317, + "grad_norm": 0.6018537282943726, + "learning_rate": 8.74609436423528e-05, + "loss": 1.8611, + "step": 4365 + }, + { + "epoch": 0.24335321331029486, + "grad_norm": 0.5693862438201904, + "learning_rate": 8.745504557240195e-05, + "loss": 1.8587, + "step": 4366 + }, + { + "epoch": 0.24340895156345801, + "grad_norm": 0.5834870338439941, + "learning_rate": 8.744914631459544e-05, + "loss": 1.82, + "step": 4367 + }, + { + "epoch": 0.24346468981662114, + "grad_norm": 0.5055362582206726, + "learning_rate": 8.744324586912033e-05, + "loss": 1.5662, + "step": 4368 + }, + { + "epoch": 0.2435204280697843, + "grad_norm": 0.5283217430114746, + "learning_rate": 8.74373442361638e-05, + "loss": 1.618, + "step": 4369 + }, + { + "epoch": 0.24357616632294743, + "grad_norm": 0.5035987496376038, + "learning_rate": 8.743144141591297e-05, + "loss": 1.6436, + "step": 4370 + }, + { + "epoch": 0.2436319045761106, + "grad_norm": 0.5793476700782776, + "learning_rate": 8.742553740855506e-05, + "loss": 1.9764, + "step": 4371 + }, + { + "epoch": 0.24368764282927374, + "grad_norm": 0.5031444430351257, + "learning_rate": 8.741963221427732e-05, + "loss": 1.4643, + "step": 4372 + }, + { + "epoch": 0.24374338108243687, + "grad_norm": 0.5925171971321106, + "learning_rate": 8.7413725833267e-05, + "loss": 1.7132, + "step": 4373 + }, + { + "epoch": 0.24379911933560003, + "grad_norm": 0.5252764225006104, + "learning_rate": 8.740781826571144e-05, + "loss": 1.613, + "step": 4374 + }, + { + "epoch": 0.24385485758876316, + "grad_norm": 0.5435476899147034, + "learning_rate": 8.740190951179799e-05, + "loss": 1.7225, + "step": 4375 + }, + { + "epoch": 0.24391059584192631, + "grad_norm": 0.5505743026733398, + "learning_rate": 8.739599957171404e-05, + "loss": 1.7796, + "step": 4376 + }, + { + "epoch": 0.24396633409508947, + "grad_norm": 0.5711907148361206, + "learning_rate": 8.7390088445647e-05, + "loss": 1.8918, + "step": 4377 + }, + { + "epoch": 0.2440220723482526, + "grad_norm": 0.617215096950531, + "learning_rate": 8.738417613378439e-05, + "loss": 1.6408, + "step": 4378 + }, + { + "epoch": 0.24407781060141576, + "grad_norm": 0.5194396376609802, + "learning_rate": 8.737826263631363e-05, + "loss": 1.5007, + "step": 4379 + }, + { + "epoch": 0.24413354885457889, + "grad_norm": NaN, + "learning_rate": 8.737826263631363e-05, + "loss": 1.8818, + "step": 4380 + }, + { + "epoch": 0.24418928710774204, + "grad_norm": 0.5449255704879761, + "learning_rate": 8.737234795342234e-05, + "loss": 1.6008, + "step": 4381 + }, + { + "epoch": 0.2442450253609052, + "grad_norm": 0.517254650592804, + "learning_rate": 8.736643208529807e-05, + "loss": 1.5589, + "step": 4382 + }, + { + "epoch": 0.24430076361406833, + "grad_norm": 0.5613778829574585, + "learning_rate": 8.736051503212843e-05, + "loss": 1.8349, + "step": 4383 + }, + { + "epoch": 0.24435650186723148, + "grad_norm": 0.5578374266624451, + "learning_rate": 8.735459679410108e-05, + "loss": 1.6444, + "step": 4384 + }, + { + "epoch": 0.2444122401203946, + "grad_norm": 0.5179364681243896, + "learning_rate": 8.734867737140371e-05, + "loss": 1.5685, + "step": 4385 + }, + { + "epoch": 0.24446797837355777, + "grad_norm": 0.5676231980323792, + "learning_rate": 8.734275676422406e-05, + "loss": 1.7138, + "step": 4386 + }, + { + "epoch": 0.24452371662672093, + "grad_norm": 0.5979743599891663, + "learning_rate": 8.73368349727499e-05, + "loss": 1.8035, + "step": 4387 + }, + { + "epoch": 0.24457945487988406, + "grad_norm": 0.566631555557251, + "learning_rate": 8.733091199716899e-05, + "loss": 1.7692, + "step": 4388 + }, + { + "epoch": 0.2446351931330472, + "grad_norm": 0.5594037175178528, + "learning_rate": 8.732498783766923e-05, + "loss": 1.7145, + "step": 4389 + }, + { + "epoch": 0.24469093138621037, + "grad_norm": 0.47728872299194336, + "learning_rate": 8.731906249443847e-05, + "loss": 1.3759, + "step": 4390 + }, + { + "epoch": 0.2447466696393735, + "grad_norm": 0.5077241063117981, + "learning_rate": 8.731313596766461e-05, + "loss": 1.6403, + "step": 4391 + }, + { + "epoch": 0.24480240789253666, + "grad_norm": 0.51840740442276, + "learning_rate": 8.730720825753567e-05, + "loss": 1.7304, + "step": 4392 + }, + { + "epoch": 0.24485814614569978, + "grad_norm": 0.555458664894104, + "learning_rate": 8.730127936423957e-05, + "loss": 1.7039, + "step": 4393 + }, + { + "epoch": 0.24491388439886294, + "grad_norm": 0.530720591545105, + "learning_rate": 8.729534928796438e-05, + "loss": 1.87, + "step": 4394 + }, + { + "epoch": 0.2449696226520261, + "grad_norm": 0.5183333158493042, + "learning_rate": 8.728941802889816e-05, + "loss": 1.6194, + "step": 4395 + }, + { + "epoch": 0.24502536090518923, + "grad_norm": 0.5418990254402161, + "learning_rate": 8.728348558722901e-05, + "loss": 1.6804, + "step": 4396 + }, + { + "epoch": 0.24508109915835238, + "grad_norm": 0.5377148985862732, + "learning_rate": 8.727755196314507e-05, + "loss": 1.5289, + "step": 4397 + }, + { + "epoch": 0.2451368374115155, + "grad_norm": 0.5729206800460815, + "learning_rate": 8.727161715683452e-05, + "loss": 1.7488, + "step": 4398 + }, + { + "epoch": 0.24519257566467867, + "grad_norm": 0.5957255363464355, + "learning_rate": 8.726568116848559e-05, + "loss": 1.4552, + "step": 4399 + }, + { + "epoch": 0.24524831391784183, + "grad_norm": 0.6279282569885254, + "learning_rate": 8.725974399828653e-05, + "loss": 1.8822, + "step": 4400 + }, + { + "epoch": 0.24530405217100495, + "grad_norm": 0.5379980802536011, + "learning_rate": 8.725380564642563e-05, + "loss": 1.7286, + "step": 4401 + }, + { + "epoch": 0.2453597904241681, + "grad_norm": 0.506988525390625, + "learning_rate": 8.724786611309123e-05, + "loss": 1.5182, + "step": 4402 + }, + { + "epoch": 0.24541552867733124, + "grad_norm": 0.5806999206542969, + "learning_rate": 8.724192539847167e-05, + "loss": 1.7967, + "step": 4403 + }, + { + "epoch": 0.2454712669304944, + "grad_norm": 0.6368009448051453, + "learning_rate": 8.723598350275537e-05, + "loss": 1.8081, + "step": 4404 + }, + { + "epoch": 0.24552700518365755, + "grad_norm": 0.6073201894760132, + "learning_rate": 8.723004042613079e-05, + "loss": 1.8369, + "step": 4405 + }, + { + "epoch": 0.24558274343682068, + "grad_norm": 0.5500373244285583, + "learning_rate": 8.722409616878637e-05, + "loss": 1.6556, + "step": 4406 + }, + { + "epoch": 0.24563848168998384, + "grad_norm": 0.5122720003128052, + "learning_rate": 8.721815073091068e-05, + "loss": 1.5745, + "step": 4407 + }, + { + "epoch": 0.24569421994314697, + "grad_norm": 0.5759167671203613, + "learning_rate": 8.721220411269222e-05, + "loss": 1.8282, + "step": 4408 + }, + { + "epoch": 0.24574995819631013, + "grad_norm": 0.5656915307044983, + "learning_rate": 8.720625631431963e-05, + "loss": 1.6782, + "step": 4409 + }, + { + "epoch": 0.24580569644947328, + "grad_norm": 0.5352250933647156, + "learning_rate": 8.72003073359815e-05, + "loss": 1.7703, + "step": 4410 + }, + { + "epoch": 0.2458614347026364, + "grad_norm": 0.6013755798339844, + "learning_rate": 8.719435717786653e-05, + "loss": 1.4931, + "step": 4411 + }, + { + "epoch": 0.24591717295579957, + "grad_norm": 0.5831592082977295, + "learning_rate": 8.718840584016339e-05, + "loss": 1.8267, + "step": 4412 + }, + { + "epoch": 0.24597291120896272, + "grad_norm": 0.5686485767364502, + "learning_rate": 8.718245332306086e-05, + "loss": 1.7073, + "step": 4413 + }, + { + "epoch": 0.24602864946212585, + "grad_norm": 0.5540615320205688, + "learning_rate": 8.717649962674768e-05, + "loss": 1.7481, + "step": 4414 + }, + { + "epoch": 0.246084387715289, + "grad_norm": 0.4984779953956604, + "learning_rate": 8.71705447514127e-05, + "loss": 1.4674, + "step": 4415 + }, + { + "epoch": 0.24614012596845214, + "grad_norm": 0.5658791065216064, + "learning_rate": 8.716458869724475e-05, + "loss": 1.7044, + "step": 4416 + }, + { + "epoch": 0.2461958642216153, + "grad_norm": 0.6222524046897888, + "learning_rate": 8.715863146443273e-05, + "loss": 1.9216, + "step": 4417 + }, + { + "epoch": 0.24625160247477845, + "grad_norm": 0.5234952569007874, + "learning_rate": 8.715267305316558e-05, + "loss": 1.3814, + "step": 4418 + }, + { + "epoch": 0.24630734072794158, + "grad_norm": 0.5298272371292114, + "learning_rate": 8.714671346363226e-05, + "loss": 1.7245, + "step": 4419 + }, + { + "epoch": 0.24636307898110474, + "grad_norm": 0.5426690578460693, + "learning_rate": 8.714075269602176e-05, + "loss": 1.7225, + "step": 4420 + }, + { + "epoch": 0.24641881723426787, + "grad_norm": 0.5064488649368286, + "learning_rate": 8.713479075052312e-05, + "loss": 1.637, + "step": 4421 + }, + { + "epoch": 0.24647455548743102, + "grad_norm": 0.6294771432876587, + "learning_rate": 8.712882762732543e-05, + "loss": 2.0957, + "step": 4422 + }, + { + "epoch": 0.24653029374059418, + "grad_norm": 0.5518829226493835, + "learning_rate": 8.712286332661783e-05, + "loss": 1.8551, + "step": 4423 + }, + { + "epoch": 0.2465860319937573, + "grad_norm": 0.5775428414344788, + "learning_rate": 8.711689784858943e-05, + "loss": 2.0364, + "step": 4424 + }, + { + "epoch": 0.24664177024692047, + "grad_norm": 0.585757851600647, + "learning_rate": 8.711093119342944e-05, + "loss": 1.9078, + "step": 4425 + }, + { + "epoch": 0.2466975085000836, + "grad_norm": 0.49010977149009705, + "learning_rate": 8.710496336132707e-05, + "loss": 1.7235, + "step": 4426 + }, + { + "epoch": 0.24675324675324675, + "grad_norm": 0.4925966262817383, + "learning_rate": 8.709899435247162e-05, + "loss": 1.5281, + "step": 4427 + }, + { + "epoch": 0.2468089850064099, + "grad_norm": 0.5210297107696533, + "learning_rate": 8.709302416705235e-05, + "loss": 1.6194, + "step": 4428 + }, + { + "epoch": 0.24686472325957304, + "grad_norm": 0.5486511588096619, + "learning_rate": 8.708705280525863e-05, + "loss": 1.8987, + "step": 4429 + }, + { + "epoch": 0.2469204615127362, + "grad_norm": 0.5911165475845337, + "learning_rate": 8.708108026727983e-05, + "loss": 1.8762, + "step": 4430 + }, + { + "epoch": 0.24697619976589932, + "grad_norm": 0.557861864566803, + "learning_rate": 8.707510655330535e-05, + "loss": 1.7246, + "step": 4431 + }, + { + "epoch": 0.24703193801906248, + "grad_norm": 0.5598505139350891, + "learning_rate": 8.706913166352468e-05, + "loss": 1.7012, + "step": 4432 + }, + { + "epoch": 0.24708767627222564, + "grad_norm": 0.523493230342865, + "learning_rate": 8.706315559812725e-05, + "loss": 1.6476, + "step": 4433 + }, + { + "epoch": 0.24714341452538877, + "grad_norm": 0.5727233290672302, + "learning_rate": 8.705717835730263e-05, + "loss": 1.7085, + "step": 4434 + }, + { + "epoch": 0.24719915277855192, + "grad_norm": 0.5231149792671204, + "learning_rate": 8.705119994124038e-05, + "loss": 1.6553, + "step": 4435 + }, + { + "epoch": 0.24725489103171508, + "grad_norm": 0.5807697176933289, + "learning_rate": 8.70452203501301e-05, + "loss": 1.9495, + "step": 4436 + }, + { + "epoch": 0.2473106292848782, + "grad_norm": 0.538212239742279, + "learning_rate": 8.703923958416141e-05, + "loss": 1.6201, + "step": 4437 + }, + { + "epoch": 0.24736636753804137, + "grad_norm": 0.5267363786697388, + "learning_rate": 8.703325764352397e-05, + "loss": 1.6372, + "step": 4438 + }, + { + "epoch": 0.2474221057912045, + "grad_norm": 0.5570881962776184, + "learning_rate": 8.702727452840753e-05, + "loss": 1.7135, + "step": 4439 + }, + { + "epoch": 0.24747784404436765, + "grad_norm": 0.5702007412910461, + "learning_rate": 8.702129023900184e-05, + "loss": 1.7636, + "step": 4440 + }, + { + "epoch": 0.2475335822975308, + "grad_norm": 0.5725893974304199, + "learning_rate": 8.701530477549666e-05, + "loss": 1.7144, + "step": 4441 + }, + { + "epoch": 0.24758932055069394, + "grad_norm": 0.5385577082633972, + "learning_rate": 8.700931813808182e-05, + "loss": 1.7915, + "step": 4442 + }, + { + "epoch": 0.2476450588038571, + "grad_norm": 0.625249981880188, + "learning_rate": 8.700333032694721e-05, + "loss": 1.8956, + "step": 4443 + }, + { + "epoch": 0.24770079705702022, + "grad_norm": 0.6568485498428345, + "learning_rate": 8.69973413422827e-05, + "loss": 2.0, + "step": 4444 + }, + { + "epoch": 0.24775653531018338, + "grad_norm": 0.5595792531967163, + "learning_rate": 8.699135118427821e-05, + "loss": 1.7215, + "step": 4445 + }, + { + "epoch": 0.24781227356334654, + "grad_norm": 0.5085048675537109, + "learning_rate": 8.698535985312376e-05, + "loss": 1.5958, + "step": 4446 + }, + { + "epoch": 0.24786801181650966, + "grad_norm": 0.5155544281005859, + "learning_rate": 8.697936734900932e-05, + "loss": 1.7741, + "step": 4447 + }, + { + "epoch": 0.24792375006967282, + "grad_norm": 0.5145729780197144, + "learning_rate": 8.697337367212496e-05, + "loss": 1.7966, + "step": 4448 + }, + { + "epoch": 0.24797948832283595, + "grad_norm": 0.5527476072311401, + "learning_rate": 8.696737882266076e-05, + "loss": 1.8026, + "step": 4449 + }, + { + "epoch": 0.2480352265759991, + "grad_norm": 0.5763035416603088, + "learning_rate": 8.696138280080684e-05, + "loss": 1.7823, + "step": 4450 + }, + { + "epoch": 0.24809096482916226, + "grad_norm": 0.5513672828674316, + "learning_rate": 8.695538560675334e-05, + "loss": 1.5817, + "step": 4451 + }, + { + "epoch": 0.2481467030823254, + "grad_norm": 0.5553067922592163, + "learning_rate": 8.694938724069048e-05, + "loss": 1.8425, + "step": 4452 + }, + { + "epoch": 0.24820244133548855, + "grad_norm": 0.49385184049606323, + "learning_rate": 8.69433877028085e-05, + "loss": 1.6939, + "step": 4453 + }, + { + "epoch": 0.24825817958865168, + "grad_norm": 0.5889978408813477, + "learning_rate": 8.693738699329765e-05, + "loss": 1.6874, + "step": 4454 + }, + { + "epoch": 0.24831391784181484, + "grad_norm": 0.556433916091919, + "learning_rate": 8.693138511234825e-05, + "loss": 1.7339, + "step": 4455 + }, + { + "epoch": 0.248369656094978, + "grad_norm": 0.5483202338218689, + "learning_rate": 8.692538206015062e-05, + "loss": 1.8301, + "step": 4456 + }, + { + "epoch": 0.24842539434814112, + "grad_norm": 0.5633078813552856, + "learning_rate": 8.691937783689518e-05, + "loss": 1.7435, + "step": 4457 + }, + { + "epoch": 0.24848113260130428, + "grad_norm": 0.5544833540916443, + "learning_rate": 8.691337244277231e-05, + "loss": 1.6348, + "step": 4458 + }, + { + "epoch": 0.24853687085446743, + "grad_norm": 0.5703203082084656, + "learning_rate": 8.69073658779725e-05, + "loss": 1.6839, + "step": 4459 + }, + { + "epoch": 0.24859260910763056, + "grad_norm": 0.5441849231719971, + "learning_rate": 8.690135814268623e-05, + "loss": 1.7292, + "step": 4460 + }, + { + "epoch": 0.24864834736079372, + "grad_norm": 0.5759615302085876, + "learning_rate": 8.689534923710403e-05, + "loss": 1.8113, + "step": 4461 + }, + { + "epoch": 0.24870408561395685, + "grad_norm": 0.568762481212616, + "learning_rate": 8.688933916141647e-05, + "loss": 1.9261, + "step": 4462 + }, + { + "epoch": 0.24875982386712, + "grad_norm": 0.5397505164146423, + "learning_rate": 8.688332791581415e-05, + "loss": 1.8136, + "step": 4463 + }, + { + "epoch": 0.24881556212028316, + "grad_norm": 0.5890788435935974, + "learning_rate": 8.68773155004877e-05, + "loss": 1.6383, + "step": 4464 + }, + { + "epoch": 0.2488713003734463, + "grad_norm": 0.5507654547691345, + "learning_rate": 8.687130191562782e-05, + "loss": 1.7313, + "step": 4465 + }, + { + "epoch": 0.24892703862660945, + "grad_norm": 0.5670168399810791, + "learning_rate": 8.686528716142523e-05, + "loss": 1.7355, + "step": 4466 + }, + { + "epoch": 0.24898277687977258, + "grad_norm": 0.5866429805755615, + "learning_rate": 8.685927123807065e-05, + "loss": 1.7786, + "step": 4467 + }, + { + "epoch": 0.24903851513293573, + "grad_norm": 0.5706139206886292, + "learning_rate": 8.68532541457549e-05, + "loss": 1.8995, + "step": 4468 + }, + { + "epoch": 0.2490942533860989, + "grad_norm": 0.5574220418930054, + "learning_rate": 8.68472358846688e-05, + "loss": 1.86, + "step": 4469 + }, + { + "epoch": 0.24914999163926202, + "grad_norm": 0.5442642569541931, + "learning_rate": 8.684121645500322e-05, + "loss": 1.803, + "step": 4470 + }, + { + "epoch": 0.24920572989242518, + "grad_norm": 0.5070736408233643, + "learning_rate": 8.683519585694903e-05, + "loss": 1.5786, + "step": 4471 + }, + { + "epoch": 0.2492614681455883, + "grad_norm": 0.5622973442077637, + "learning_rate": 8.682917409069721e-05, + "loss": 1.8524, + "step": 4472 + }, + { + "epoch": 0.24931720639875146, + "grad_norm": 0.5547112226486206, + "learning_rate": 8.682315115643872e-05, + "loss": 1.7891, + "step": 4473 + }, + { + "epoch": 0.24937294465191462, + "grad_norm": 0.5251905918121338, + "learning_rate": 8.681712705436455e-05, + "loss": 1.3104, + "step": 4474 + }, + { + "epoch": 0.24942868290507775, + "grad_norm": 0.5507151484489441, + "learning_rate": 8.68111017846658e-05, + "loss": 1.7571, + "step": 4475 + }, + { + "epoch": 0.2494844211582409, + "grad_norm": 0.628353476524353, + "learning_rate": 8.68050753475335e-05, + "loss": 1.7915, + "step": 4476 + }, + { + "epoch": 0.24954015941140403, + "grad_norm": 0.5899398922920227, + "learning_rate": 8.67990477431588e-05, + "loss": 1.7928, + "step": 4477 + }, + { + "epoch": 0.2495958976645672, + "grad_norm": 0.5376555919647217, + "learning_rate": 8.679301897173287e-05, + "loss": 1.6592, + "step": 4478 + }, + { + "epoch": 0.24965163591773035, + "grad_norm": 0.5241808891296387, + "learning_rate": 8.678698903344689e-05, + "loss": 1.6482, + "step": 4479 + }, + { + "epoch": 0.24970737417089348, + "grad_norm": 0.6054913997650146, + "learning_rate": 8.67809579284921e-05, + "loss": 1.7838, + "step": 4480 + }, + { + "epoch": 0.24976311242405663, + "grad_norm": 0.56617671251297, + "learning_rate": 8.677492565705976e-05, + "loss": 1.7705, + "step": 4481 + }, + { + "epoch": 0.2498188506772198, + "grad_norm": 0.549431324005127, + "learning_rate": 8.676889221934121e-05, + "loss": 1.8349, + "step": 4482 + }, + { + "epoch": 0.24987458893038292, + "grad_norm": 0.5290791392326355, + "learning_rate": 8.676285761552775e-05, + "loss": 1.6761, + "step": 4483 + }, + { + "epoch": 0.24993032718354607, + "grad_norm": 0.7188482880592346, + "learning_rate": 8.675682184581081e-05, + "loss": 1.6409, + "step": 4484 + }, + { + "epoch": 0.2499860654367092, + "grad_norm": 0.5338848233222961, + "learning_rate": 8.67507849103818e-05, + "loss": 1.4604, + "step": 4485 + }, + { + "epoch": 0.25004180368987233, + "grad_norm": 0.5384326577186584, + "learning_rate": 8.674474680943215e-05, + "loss": 1.5605, + "step": 4486 + }, + { + "epoch": 0.2500975419430355, + "grad_norm": 0.521425724029541, + "learning_rate": 8.673870754315336e-05, + "loss": 1.625, + "step": 4487 + }, + { + "epoch": 0.25015328019619865, + "grad_norm": 0.5739718079566956, + "learning_rate": 8.673266711173698e-05, + "loss": 1.7826, + "step": 4488 + }, + { + "epoch": 0.2502090184493618, + "grad_norm": 0.5505213141441345, + "learning_rate": 8.672662551537457e-05, + "loss": 1.595, + "step": 4489 + }, + { + "epoch": 0.25026475670252496, + "grad_norm": 0.5271283388137817, + "learning_rate": 8.672058275425772e-05, + "loss": 1.5468, + "step": 4490 + }, + { + "epoch": 0.2503204949556881, + "grad_norm": 0.5678611993789673, + "learning_rate": 8.671453882857808e-05, + "loss": 1.8862, + "step": 4491 + }, + { + "epoch": 0.2503762332088512, + "grad_norm": 0.6000241041183472, + "learning_rate": 8.670849373852734e-05, + "loss": 1.6133, + "step": 4492 + }, + { + "epoch": 0.2504319714620144, + "grad_norm": 0.5662490129470825, + "learning_rate": 8.670244748429719e-05, + "loss": 1.7045, + "step": 4493 + }, + { + "epoch": 0.25048770971517753, + "grad_norm": 0.5680144429206848, + "learning_rate": 8.66964000660794e-05, + "loss": 1.6462, + "step": 4494 + }, + { + "epoch": 0.25054344796834066, + "grad_norm": 0.5490357279777527, + "learning_rate": 8.669035148406577e-05, + "loss": 1.5736, + "step": 4495 + }, + { + "epoch": 0.25059918622150384, + "grad_norm": 0.5800120234489441, + "learning_rate": 8.668430173844808e-05, + "loss": 1.8931, + "step": 4496 + }, + { + "epoch": 0.250654924474667, + "grad_norm": 0.5286765694618225, + "learning_rate": 8.667825082941826e-05, + "loss": 1.6553, + "step": 4497 + }, + { + "epoch": 0.2507106627278301, + "grad_norm": 0.5452672839164734, + "learning_rate": 8.667219875716814e-05, + "loss": 1.7692, + "step": 4498 + }, + { + "epoch": 0.25076640098099323, + "grad_norm": 0.5615769028663635, + "learning_rate": 8.66661455218897e-05, + "loss": 1.8116, + "step": 4499 + }, + { + "epoch": 0.2508221392341564, + "grad_norm": 0.5832181572914124, + "learning_rate": 8.666009112377491e-05, + "loss": 1.938, + "step": 4500 + }, + { + "epoch": 0.25087787748731954, + "grad_norm": 0.5258188247680664, + "learning_rate": 8.665403556301576e-05, + "loss": 1.6026, + "step": 4501 + }, + { + "epoch": 0.2509336157404827, + "grad_norm": 0.6271452307701111, + "learning_rate": 8.664797883980434e-05, + "loss": 1.6589, + "step": 4502 + }, + { + "epoch": 0.25098935399364586, + "grad_norm": 0.5411872267723083, + "learning_rate": 8.664192095433266e-05, + "loss": 1.7016, + "step": 4503 + }, + { + "epoch": 0.251045092246809, + "grad_norm": 0.5610190629959106, + "learning_rate": 8.663586190679291e-05, + "loss": 1.8425, + "step": 4504 + }, + { + "epoch": 0.2511008304999721, + "grad_norm": 0.5276908278465271, + "learning_rate": 8.662980169737723e-05, + "loss": 1.6105, + "step": 4505 + }, + { + "epoch": 0.2511565687531353, + "grad_norm": 0.5493645668029785, + "learning_rate": 8.662374032627778e-05, + "loss": 1.9352, + "step": 4506 + }, + { + "epoch": 0.25121230700629843, + "grad_norm": 0.5296374559402466, + "learning_rate": 8.661767779368683e-05, + "loss": 1.7867, + "step": 4507 + }, + { + "epoch": 0.25126804525946156, + "grad_norm": 0.6600750684738159, + "learning_rate": 8.661161409979665e-05, + "loss": 1.6947, + "step": 4508 + }, + { + "epoch": 0.2513237835126247, + "grad_norm": 0.5515453815460205, + "learning_rate": 8.66055492447995e-05, + "loss": 1.796, + "step": 4509 + }, + { + "epoch": 0.25137952176578787, + "grad_norm": 0.5651318430900574, + "learning_rate": 8.659948322888777e-05, + "loss": 1.6343, + "step": 4510 + }, + { + "epoch": 0.251435260018951, + "grad_norm": 0.5783109664916992, + "learning_rate": 8.659341605225384e-05, + "loss": 1.8057, + "step": 4511 + }, + { + "epoch": 0.25149099827211413, + "grad_norm": 0.5711765885353088, + "learning_rate": 8.65873477150901e-05, + "loss": 1.8123, + "step": 4512 + }, + { + "epoch": 0.2515467365252773, + "grad_norm": 0.5652083158493042, + "learning_rate": 8.658127821758899e-05, + "loss": 1.7952, + "step": 4513 + }, + { + "epoch": 0.25160247477844044, + "grad_norm": 0.5652216076850891, + "learning_rate": 8.657520755994305e-05, + "loss": 1.8295, + "step": 4514 + }, + { + "epoch": 0.2516582130316036, + "grad_norm": 0.5443994998931885, + "learning_rate": 8.656913574234474e-05, + "loss": 1.6294, + "step": 4515 + }, + { + "epoch": 0.25171395128476676, + "grad_norm": 0.5845414400100708, + "learning_rate": 8.656306276498667e-05, + "loss": 1.8597, + "step": 4516 + }, + { + "epoch": 0.2517696895379299, + "grad_norm": 0.5372679233551025, + "learning_rate": 8.655698862806143e-05, + "loss": 1.7067, + "step": 4517 + }, + { + "epoch": 0.251825427791093, + "grad_norm": 0.5330473780632019, + "learning_rate": 8.655091333176165e-05, + "loss": 1.7043, + "step": 4518 + }, + { + "epoch": 0.2518811660442562, + "grad_norm": 0.5988831520080566, + "learning_rate": 8.654483687628002e-05, + "loss": 1.7418, + "step": 4519 + }, + { + "epoch": 0.25193690429741933, + "grad_norm": 0.5914613604545593, + "learning_rate": 8.65387592618092e-05, + "loss": 1.6442, + "step": 4520 + }, + { + "epoch": 0.25199264255058246, + "grad_norm": 0.5800835490226746, + "learning_rate": 8.653268048854201e-05, + "loss": 1.7816, + "step": 4521 + }, + { + "epoch": 0.2520483808037456, + "grad_norm": 0.5335732102394104, + "learning_rate": 8.652660055667117e-05, + "loss": 1.5046, + "step": 4522 + }, + { + "epoch": 0.25210411905690877, + "grad_norm": 0.48013389110565186, + "learning_rate": 8.652051946638953e-05, + "loss": 1.582, + "step": 4523 + }, + { + "epoch": 0.2521598573100719, + "grad_norm": 0.6047071814537048, + "learning_rate": 8.651443721788996e-05, + "loss": 1.6199, + "step": 4524 + }, + { + "epoch": 0.25221559556323503, + "grad_norm": 0.5248143672943115, + "learning_rate": 8.650835381136533e-05, + "loss": 1.5345, + "step": 4525 + }, + { + "epoch": 0.2522713338163982, + "grad_norm": 0.5078330636024475, + "learning_rate": 8.650226924700855e-05, + "loss": 1.6656, + "step": 4526 + }, + { + "epoch": 0.25232707206956134, + "grad_norm": 0.5320603251457214, + "learning_rate": 8.649618352501264e-05, + "loss": 1.598, + "step": 4527 + }, + { + "epoch": 0.25238281032272447, + "grad_norm": 0.49775633215904236, + "learning_rate": 8.649009664557057e-05, + "loss": 1.3941, + "step": 4528 + }, + { + "epoch": 0.25243854857588766, + "grad_norm": 0.5565609931945801, + "learning_rate": 8.648400860887538e-05, + "loss": 1.7144, + "step": 4529 + }, + { + "epoch": 0.2524942868290508, + "grad_norm": 0.5529298782348633, + "learning_rate": 8.647791941512016e-05, + "loss": 1.8223, + "step": 4530 + }, + { + "epoch": 0.2525500250822139, + "grad_norm": 0.5692974328994751, + "learning_rate": 8.6471829064498e-05, + "loss": 1.6577, + "step": 4531 + }, + { + "epoch": 0.25260576333537704, + "grad_norm": 0.49695706367492676, + "learning_rate": 8.646573755720209e-05, + "loss": 1.6222, + "step": 4532 + }, + { + "epoch": 0.2526615015885402, + "grad_norm": 0.5647556781768799, + "learning_rate": 8.645964489342557e-05, + "loss": 1.8348, + "step": 4533 + }, + { + "epoch": 0.25271723984170336, + "grad_norm": 0.5597743391990662, + "learning_rate": 8.645355107336171e-05, + "loss": 1.7095, + "step": 4534 + }, + { + "epoch": 0.2527729780948665, + "grad_norm": 0.5715233683586121, + "learning_rate": 8.644745609720375e-05, + "loss": 1.9243, + "step": 4535 + }, + { + "epoch": 0.25282871634802967, + "grad_norm": 0.5817229747772217, + "learning_rate": 8.644135996514498e-05, + "loss": 1.782, + "step": 4536 + }, + { + "epoch": 0.2528844546011928, + "grad_norm": 0.5697113275527954, + "learning_rate": 8.643526267737873e-05, + "loss": 1.6014, + "step": 4537 + }, + { + "epoch": 0.2529401928543559, + "grad_norm": 0.5716366767883301, + "learning_rate": 8.642916423409839e-05, + "loss": 1.6435, + "step": 4538 + }, + { + "epoch": 0.2529959311075191, + "grad_norm": 0.5631042718887329, + "learning_rate": 8.642306463549736e-05, + "loss": 1.7686, + "step": 4539 + }, + { + "epoch": 0.25305166936068224, + "grad_norm": 0.596517026424408, + "learning_rate": 8.641696388176907e-05, + "loss": 1.8116, + "step": 4540 + }, + { + "epoch": 0.25310740761384537, + "grad_norm": 0.47129639983177185, + "learning_rate": 8.641086197310703e-05, + "loss": 1.4985, + "step": 4541 + }, + { + "epoch": 0.25316314586700855, + "grad_norm": 0.551607072353363, + "learning_rate": 8.640475890970471e-05, + "loss": 1.7948, + "step": 4542 + }, + { + "epoch": 0.2532188841201717, + "grad_norm": 0.559027910232544, + "learning_rate": 8.639865469175572e-05, + "loss": 1.5825, + "step": 4543 + }, + { + "epoch": 0.2532746223733348, + "grad_norm": 0.5063076019287109, + "learning_rate": 8.639254931945362e-05, + "loss": 1.4125, + "step": 4544 + }, + { + "epoch": 0.25333036062649794, + "grad_norm": 0.5271062254905701, + "learning_rate": 8.638644279299202e-05, + "loss": 1.6964, + "step": 4545 + }, + { + "epoch": 0.2533860988796611, + "grad_norm": 0.4922122657299042, + "learning_rate": 8.638033511256462e-05, + "loss": 1.6725, + "step": 4546 + }, + { + "epoch": 0.25344183713282425, + "grad_norm": 0.5734017491340637, + "learning_rate": 8.637422627836509e-05, + "loss": 2.0334, + "step": 4547 + }, + { + "epoch": 0.2534975753859874, + "grad_norm": 0.4978555738925934, + "learning_rate": 8.636811629058718e-05, + "loss": 1.6665, + "step": 4548 + }, + { + "epoch": 0.25355331363915057, + "grad_norm": 0.5637436509132385, + "learning_rate": 8.636200514942467e-05, + "loss": 1.5875, + "step": 4549 + }, + { + "epoch": 0.2536090518923137, + "grad_norm": 0.5382322072982788, + "learning_rate": 8.635589285507135e-05, + "loss": 1.838, + "step": 4550 + }, + { + "epoch": 0.2536647901454768, + "grad_norm": 0.518650233745575, + "learning_rate": 8.634977940772108e-05, + "loss": 1.7802, + "step": 4551 + }, + { + "epoch": 0.25372052839864, + "grad_norm": 0.5153575539588928, + "learning_rate": 8.634366480756774e-05, + "loss": 1.6153, + "step": 4552 + }, + { + "epoch": 0.25377626665180314, + "grad_norm": 0.5355269908905029, + "learning_rate": 8.633754905480527e-05, + "loss": 1.8255, + "step": 4553 + }, + { + "epoch": 0.25383200490496627, + "grad_norm": 0.5261843204498291, + "learning_rate": 8.63314321496276e-05, + "loss": 1.6177, + "step": 4554 + }, + { + "epoch": 0.2538877431581294, + "grad_norm": 0.557314395904541, + "learning_rate": 8.632531409222872e-05, + "loss": 1.8342, + "step": 4555 + }, + { + "epoch": 0.2539434814112926, + "grad_norm": 0.5285095572471619, + "learning_rate": 8.631919488280267e-05, + "loss": 1.6217, + "step": 4556 + }, + { + "epoch": 0.2539992196644557, + "grad_norm": 0.5471826195716858, + "learning_rate": 8.631307452154352e-05, + "loss": 1.5318, + "step": 4557 + }, + { + "epoch": 0.25405495791761884, + "grad_norm": 0.5375044941902161, + "learning_rate": 8.630695300864536e-05, + "loss": 1.7415, + "step": 4558 + }, + { + "epoch": 0.254110696170782, + "grad_norm": 0.566832423210144, + "learning_rate": 8.630083034430232e-05, + "loss": 1.9215, + "step": 4559 + }, + { + "epoch": 0.25416643442394515, + "grad_norm": 0.5262976884841919, + "learning_rate": 8.629470652870861e-05, + "loss": 1.5432, + "step": 4560 + }, + { + "epoch": 0.2542221726771083, + "grad_norm": 0.5495408177375793, + "learning_rate": 8.628858156205842e-05, + "loss": 1.9161, + "step": 4561 + }, + { + "epoch": 0.25427791093027147, + "grad_norm": 0.5776422023773193, + "learning_rate": 8.6282455444546e-05, + "loss": 1.8547, + "step": 4562 + }, + { + "epoch": 0.2543336491834346, + "grad_norm": 0.5136664509773254, + "learning_rate": 8.627632817636563e-05, + "loss": 1.3558, + "step": 4563 + }, + { + "epoch": 0.2543893874365977, + "grad_norm": 0.5449255108833313, + "learning_rate": 8.627019975771165e-05, + "loss": 1.7991, + "step": 4564 + }, + { + "epoch": 0.2544451256897609, + "grad_norm": 0.49720707535743713, + "learning_rate": 8.626407018877837e-05, + "loss": 1.5515, + "step": 4565 + }, + { + "epoch": 0.25450086394292404, + "grad_norm": 0.5493996739387512, + "learning_rate": 8.625793946976026e-05, + "loss": 1.7666, + "step": 4566 + }, + { + "epoch": 0.25455660219608717, + "grad_norm": 0.5458593368530273, + "learning_rate": 8.625180760085167e-05, + "loss": 1.9701, + "step": 4567 + }, + { + "epoch": 0.2546123404492503, + "grad_norm": 0.5866237878799438, + "learning_rate": 8.624567458224713e-05, + "loss": 1.7123, + "step": 4568 + }, + { + "epoch": 0.2546680787024135, + "grad_norm": 0.5610763430595398, + "learning_rate": 8.62395404141411e-05, + "loss": 1.8511, + "step": 4569 + }, + { + "epoch": 0.2547238169555766, + "grad_norm": 0.5264028906822205, + "learning_rate": 8.623340509672817e-05, + "loss": 1.6913, + "step": 4570 + }, + { + "epoch": 0.25477955520873974, + "grad_norm": 0.5024250745773315, + "learning_rate": 8.622726863020285e-05, + "loss": 1.6337, + "step": 4571 + }, + { + "epoch": 0.2548352934619029, + "grad_norm": 0.6130850315093994, + "learning_rate": 8.622113101475982e-05, + "loss": 1.8858, + "step": 4572 + }, + { + "epoch": 0.25489103171506605, + "grad_norm": 0.5543071627616882, + "learning_rate": 8.621499225059369e-05, + "loss": 1.6353, + "step": 4573 + }, + { + "epoch": 0.2549467699682292, + "grad_norm": 0.5286437273025513, + "learning_rate": 8.620885233789914e-05, + "loss": 1.4418, + "step": 4574 + }, + { + "epoch": 0.25500250822139237, + "grad_norm": 0.5485914349555969, + "learning_rate": 8.620271127687092e-05, + "loss": 1.7161, + "step": 4575 + }, + { + "epoch": 0.2550582464745555, + "grad_norm": 0.612994909286499, + "learning_rate": 8.619656906770377e-05, + "loss": 1.8467, + "step": 4576 + }, + { + "epoch": 0.2551139847277186, + "grad_norm": 0.5447350740432739, + "learning_rate": 8.619042571059248e-05, + "loss": 1.7528, + "step": 4577 + }, + { + "epoch": 0.25516972298088175, + "grad_norm": 0.5236079096794128, + "learning_rate": 8.61842812057319e-05, + "loss": 1.5648, + "step": 4578 + }, + { + "epoch": 0.25522546123404494, + "grad_norm": 0.534354567527771, + "learning_rate": 8.617813555331689e-05, + "loss": 1.5093, + "step": 4579 + }, + { + "epoch": 0.25528119948720807, + "grad_norm": 0.5146899819374084, + "learning_rate": 8.617198875354235e-05, + "loss": 1.6445, + "step": 4580 + }, + { + "epoch": 0.2553369377403712, + "grad_norm": 0.5606057047843933, + "learning_rate": 8.616584080660323e-05, + "loss": 1.6225, + "step": 4581 + }, + { + "epoch": 0.2553926759935344, + "grad_norm": 0.557131290435791, + "learning_rate": 8.615969171269449e-05, + "loss": 1.8017, + "step": 4582 + }, + { + "epoch": 0.2554484142466975, + "grad_norm": 0.5046922564506531, + "learning_rate": 8.615354147201116e-05, + "loss": 1.6034, + "step": 4583 + }, + { + "epoch": 0.25550415249986064, + "grad_norm": 0.5313592553138733, + "learning_rate": 8.614739008474829e-05, + "loss": 1.481, + "step": 4584 + }, + { + "epoch": 0.2555598907530238, + "grad_norm": 0.5347174406051636, + "learning_rate": 8.614123755110096e-05, + "loss": 1.6323, + "step": 4585 + }, + { + "epoch": 0.25561562900618695, + "grad_norm": 0.5261495113372803, + "learning_rate": 8.61350838712643e-05, + "loss": 1.4896, + "step": 4586 + }, + { + "epoch": 0.2556713672593501, + "grad_norm": 0.5374502539634705, + "learning_rate": 8.612892904543344e-05, + "loss": 1.6488, + "step": 4587 + }, + { + "epoch": 0.25572710551251326, + "grad_norm": 0.5835258960723877, + "learning_rate": 8.612277307380361e-05, + "loss": 1.7467, + "step": 4588 + }, + { + "epoch": 0.2557828437656764, + "grad_norm": 0.519822359085083, + "learning_rate": 8.611661595657004e-05, + "loss": 1.4627, + "step": 4589 + }, + { + "epoch": 0.2558385820188395, + "grad_norm": 0.5837191343307495, + "learning_rate": 8.611045769392796e-05, + "loss": 1.654, + "step": 4590 + }, + { + "epoch": 0.25589432027200265, + "grad_norm": 0.5844641327857971, + "learning_rate": 8.610429828607271e-05, + "loss": 1.6177, + "step": 4591 + }, + { + "epoch": 0.25595005852516584, + "grad_norm": 0.5927681922912598, + "learning_rate": 8.609813773319963e-05, + "loss": 1.9184, + "step": 4592 + }, + { + "epoch": 0.25600579677832896, + "grad_norm": 0.6149387955665588, + "learning_rate": 8.609197603550409e-05, + "loss": 1.6321, + "step": 4593 + }, + { + "epoch": 0.2560615350314921, + "grad_norm": 0.5619008541107178, + "learning_rate": 8.608581319318148e-05, + "loss": 1.6094, + "step": 4594 + }, + { + "epoch": 0.2561172732846553, + "grad_norm": 0.5645739436149597, + "learning_rate": 8.607964920642728e-05, + "loss": 1.7111, + "step": 4595 + }, + { + "epoch": 0.2561730115378184, + "grad_norm": 0.5264320373535156, + "learning_rate": 8.607348407543699e-05, + "loss": 1.5206, + "step": 4596 + }, + { + "epoch": 0.25622874979098154, + "grad_norm": 0.5533236861228943, + "learning_rate": 8.606731780040608e-05, + "loss": 1.9129, + "step": 4597 + }, + { + "epoch": 0.2562844880441447, + "grad_norm": 0.5276892781257629, + "learning_rate": 8.606115038153015e-05, + "loss": 1.7739, + "step": 4598 + }, + { + "epoch": 0.25634022629730785, + "grad_norm": 0.5314942598342896, + "learning_rate": 8.605498181900477e-05, + "loss": 1.6853, + "step": 4599 + }, + { + "epoch": 0.256395964550471, + "grad_norm": 0.540059506893158, + "learning_rate": 8.604881211302559e-05, + "loss": 1.8345, + "step": 4600 + }, + { + "epoch": 0.2564517028036341, + "grad_norm": 0.5306822657585144, + "learning_rate": 8.604264126378827e-05, + "loss": 1.9012, + "step": 4601 + }, + { + "epoch": 0.2565074410567973, + "grad_norm": 0.5294952988624573, + "learning_rate": 8.603646927148849e-05, + "loss": 1.5109, + "step": 4602 + }, + { + "epoch": 0.2565631793099604, + "grad_norm": 0.5673249959945679, + "learning_rate": 8.603029613632205e-05, + "loss": 1.758, + "step": 4603 + }, + { + "epoch": 0.25661891756312355, + "grad_norm": 0.5006965398788452, + "learning_rate": 8.602412185848466e-05, + "loss": 1.6211, + "step": 4604 + }, + { + "epoch": 0.25667465581628673, + "grad_norm": 0.5873995423316956, + "learning_rate": 8.601794643817216e-05, + "loss": 1.8896, + "step": 4605 + }, + { + "epoch": 0.25673039406944986, + "grad_norm": 0.56819748878479, + "learning_rate": 8.601176987558041e-05, + "loss": 1.6733, + "step": 4606 + }, + { + "epoch": 0.256786132322613, + "grad_norm": 0.5610432624816895, + "learning_rate": 8.600559217090529e-05, + "loss": 1.824, + "step": 4607 + }, + { + "epoch": 0.2568418705757762, + "grad_norm": 0.5451894998550415, + "learning_rate": 8.599941332434269e-05, + "loss": 1.7229, + "step": 4608 + }, + { + "epoch": 0.2568976088289393, + "grad_norm": 0.9107519388198853, + "learning_rate": 8.599323333608861e-05, + "loss": 1.846, + "step": 4609 + }, + { + "epoch": 0.25695334708210243, + "grad_norm": 0.5975711941719055, + "learning_rate": 8.598705220633903e-05, + "loss": 1.7334, + "step": 4610 + }, + { + "epoch": 0.2570090853352656, + "grad_norm": 0.5969035625457764, + "learning_rate": 8.598086993528996e-05, + "loss": 1.9449, + "step": 4611 + }, + { + "epoch": 0.25706482358842875, + "grad_norm": 0.6146485805511475, + "learning_rate": 8.597468652313747e-05, + "loss": 1.8884, + "step": 4612 + }, + { + "epoch": 0.2571205618415919, + "grad_norm": 0.5359372496604919, + "learning_rate": 8.596850197007767e-05, + "loss": 1.6199, + "step": 4613 + }, + { + "epoch": 0.257176300094755, + "grad_norm": 0.5491176247596741, + "learning_rate": 8.596231627630671e-05, + "loss": 1.5702, + "step": 4614 + }, + { + "epoch": 0.2572320383479182, + "grad_norm": 0.5316644310951233, + "learning_rate": 8.595612944202076e-05, + "loss": 1.6538, + "step": 4615 + }, + { + "epoch": 0.2572877766010813, + "grad_norm": 0.5944792032241821, + "learning_rate": 8.5949941467416e-05, + "loss": 1.79, + "step": 4616 + }, + { + "epoch": 0.25734351485424445, + "grad_norm": 0.5629575848579407, + "learning_rate": 8.594375235268872e-05, + "loss": 2.0629, + "step": 4617 + }, + { + "epoch": 0.25739925310740763, + "grad_norm": 0.5681300163269043, + "learning_rate": 8.593756209803518e-05, + "loss": 1.7105, + "step": 4618 + }, + { + "epoch": 0.25745499136057076, + "grad_norm": 0.5259959697723389, + "learning_rate": 8.59313707036517e-05, + "loss": 1.7797, + "step": 4619 + }, + { + "epoch": 0.2575107296137339, + "grad_norm": 0.5173026323318481, + "learning_rate": 8.592517816973462e-05, + "loss": 1.6879, + "step": 4620 + }, + { + "epoch": 0.2575664678668971, + "grad_norm": 0.5310641527175903, + "learning_rate": 8.591898449648035e-05, + "loss": 1.6947, + "step": 4621 + }, + { + "epoch": 0.2576222061200602, + "grad_norm": 0.5746062397956848, + "learning_rate": 8.591278968408532e-05, + "loss": 1.8276, + "step": 4622 + }, + { + "epoch": 0.25767794437322333, + "grad_norm": 0.5601612329483032, + "learning_rate": 8.590659373274599e-05, + "loss": 1.6054, + "step": 4623 + }, + { + "epoch": 0.25773368262638646, + "grad_norm": 0.5777058601379395, + "learning_rate": 8.590039664265885e-05, + "loss": 1.612, + "step": 4624 + }, + { + "epoch": 0.25778942087954965, + "grad_norm": 0.6337921023368835, + "learning_rate": 8.589419841402047e-05, + "loss": 2.1569, + "step": 4625 + }, + { + "epoch": 0.2578451591327128, + "grad_norm": 0.5203370451927185, + "learning_rate": 8.588799904702736e-05, + "loss": 1.4849, + "step": 4626 + }, + { + "epoch": 0.2579008973858759, + "grad_norm": 0.55791175365448, + "learning_rate": 8.588179854187616e-05, + "loss": 1.882, + "step": 4627 + }, + { + "epoch": 0.2579566356390391, + "grad_norm": 0.581343948841095, + "learning_rate": 8.587559689876354e-05, + "loss": 1.7811, + "step": 4628 + }, + { + "epoch": 0.2580123738922022, + "grad_norm": 0.6163395047187805, + "learning_rate": 8.586939411788615e-05, + "loss": 1.8589, + "step": 4629 + }, + { + "epoch": 0.25806811214536535, + "grad_norm": 0.5277383327484131, + "learning_rate": 8.586319019944071e-05, + "loss": 1.5817, + "step": 4630 + }, + { + "epoch": 0.25812385039852853, + "grad_norm": 0.5042583346366882, + "learning_rate": 8.585698514362397e-05, + "loss": 1.4472, + "step": 4631 + }, + { + "epoch": 0.25817958865169166, + "grad_norm": 0.5802309513092041, + "learning_rate": 8.585077895063271e-05, + "loss": 1.9396, + "step": 4632 + }, + { + "epoch": 0.2582353269048548, + "grad_norm": 0.5798273682594299, + "learning_rate": 8.58445716206638e-05, + "loss": 1.6806, + "step": 4633 + }, + { + "epoch": 0.258291065158018, + "grad_norm": 0.5102317333221436, + "learning_rate": 8.583836315391403e-05, + "loss": 1.5884, + "step": 4634 + }, + { + "epoch": 0.2583468034111811, + "grad_norm": 0.6215993165969849, + "learning_rate": 8.583215355058035e-05, + "loss": 2.001, + "step": 4635 + }, + { + "epoch": 0.25840254166434423, + "grad_norm": 0.5116714835166931, + "learning_rate": 8.582594281085967e-05, + "loss": 1.6639, + "step": 4636 + }, + { + "epoch": 0.25845827991750736, + "grad_norm": 0.5677070617675781, + "learning_rate": 8.581973093494897e-05, + "loss": 1.841, + "step": 4637 + }, + { + "epoch": 0.25851401817067055, + "grad_norm": 0.5552488565444946, + "learning_rate": 8.581351792304524e-05, + "loss": 1.6623, + "step": 4638 + }, + { + "epoch": 0.2585697564238337, + "grad_norm": 0.5567041635513306, + "learning_rate": 8.580730377534554e-05, + "loss": 1.5144, + "step": 4639 + }, + { + "epoch": 0.2586254946769968, + "grad_norm": 0.5067396759986877, + "learning_rate": 8.580108849204693e-05, + "loss": 1.4875, + "step": 4640 + }, + { + "epoch": 0.25868123293016, + "grad_norm": 0.5226799845695496, + "learning_rate": 8.579487207334653e-05, + "loss": 1.7197, + "step": 4641 + }, + { + "epoch": 0.2587369711833231, + "grad_norm": 0.5152204036712646, + "learning_rate": 8.578865451944148e-05, + "loss": 1.4488, + "step": 4642 + }, + { + "epoch": 0.25879270943648625, + "grad_norm": 0.5446513295173645, + "learning_rate": 8.578243583052897e-05, + "loss": 1.7116, + "step": 4643 + }, + { + "epoch": 0.25884844768964943, + "grad_norm": 0.5753796696662903, + "learning_rate": 8.577621600680623e-05, + "loss": 1.5765, + "step": 4644 + }, + { + "epoch": 0.25890418594281256, + "grad_norm": 0.53980952501297, + "learning_rate": 8.57699950484705e-05, + "loss": 1.7881, + "step": 4645 + }, + { + "epoch": 0.2589599241959757, + "grad_norm": 0.5444200038909912, + "learning_rate": 8.57637729557191e-05, + "loss": 1.8373, + "step": 4646 + }, + { + "epoch": 0.2590156624491388, + "grad_norm": 0.5415917634963989, + "learning_rate": 8.575754972874931e-05, + "loss": 1.6772, + "step": 4647 + }, + { + "epoch": 0.259071400702302, + "grad_norm": 0.5910305380821228, + "learning_rate": 8.575132536775853e-05, + "loss": 1.8558, + "step": 4648 + }, + { + "epoch": 0.25912713895546513, + "grad_norm": 0.5802417397499084, + "learning_rate": 8.574509987294417e-05, + "loss": 1.9364, + "step": 4649 + }, + { + "epoch": 0.25918287720862826, + "grad_norm": 0.573726236820221, + "learning_rate": 8.573887324450364e-05, + "loss": 1.8956, + "step": 4650 + }, + { + "epoch": 0.25923861546179144, + "grad_norm": 0.5909465551376343, + "learning_rate": 8.573264548263442e-05, + "loss": 1.7338, + "step": 4651 + }, + { + "epoch": 0.2592943537149546, + "grad_norm": 0.6169442534446716, + "learning_rate": 8.572641658753404e-05, + "loss": 1.5941, + "step": 4652 + }, + { + "epoch": 0.2593500919681177, + "grad_norm": 0.5135464668273926, + "learning_rate": 8.572018655940001e-05, + "loss": 1.7035, + "step": 4653 + }, + { + "epoch": 0.2594058302212809, + "grad_norm": 0.5379095077514648, + "learning_rate": 8.571395539842992e-05, + "loss": 1.7387, + "step": 4654 + }, + { + "epoch": 0.259461568474444, + "grad_norm": 0.5439580678939819, + "learning_rate": 8.570772310482141e-05, + "loss": 1.7089, + "step": 4655 + }, + { + "epoch": 0.25951730672760714, + "grad_norm": 0.5132806301116943, + "learning_rate": 8.57014896787721e-05, + "loss": 1.5298, + "step": 4656 + }, + { + "epoch": 0.25957304498077033, + "grad_norm": 0.5612521171569824, + "learning_rate": 8.569525512047969e-05, + "loss": 1.7676, + "step": 4657 + }, + { + "epoch": 0.25962878323393346, + "grad_norm": 0.5397217273712158, + "learning_rate": 8.56890194301419e-05, + "loss": 1.636, + "step": 4658 + }, + { + "epoch": 0.2596845214870966, + "grad_norm": 0.6334729194641113, + "learning_rate": 8.56827826079565e-05, + "loss": 1.8281, + "step": 4659 + }, + { + "epoch": 0.2597402597402597, + "grad_norm": 0.5931346416473389, + "learning_rate": 8.56765446541213e-05, + "loss": 1.7335, + "step": 4660 + }, + { + "epoch": 0.2597959979934229, + "grad_norm": 0.5085331201553345, + "learning_rate": 8.567030556883408e-05, + "loss": 1.8524, + "step": 4661 + }, + { + "epoch": 0.25985173624658603, + "grad_norm": 0.5508363246917725, + "learning_rate": 8.566406535229276e-05, + "loss": 1.7883, + "step": 4662 + }, + { + "epoch": 0.25990747449974916, + "grad_norm": 0.5742567181587219, + "learning_rate": 8.565782400469522e-05, + "loss": 1.7011, + "step": 4663 + }, + { + "epoch": 0.25996321275291234, + "grad_norm": 0.4922592043876648, + "learning_rate": 8.56515815262394e-05, + "loss": 1.4828, + "step": 4664 + }, + { + "epoch": 0.26001895100607547, + "grad_norm": 0.5450266003608704, + "learning_rate": 8.564533791712328e-05, + "loss": 1.7885, + "step": 4665 + }, + { + "epoch": 0.2600746892592386, + "grad_norm": 0.5942632555961609, + "learning_rate": 8.563909317754487e-05, + "loss": 1.9297, + "step": 4666 + }, + { + "epoch": 0.2601304275124018, + "grad_norm": 0.5638509392738342, + "learning_rate": 8.563284730770221e-05, + "loss": 1.9536, + "step": 4667 + }, + { + "epoch": 0.2601861657655649, + "grad_norm": 0.5848171710968018, + "learning_rate": 8.56266003077934e-05, + "loss": 2.003, + "step": 4668 + }, + { + "epoch": 0.26024190401872804, + "grad_norm": 0.5629677176475525, + "learning_rate": 8.562035217801652e-05, + "loss": 2.0024, + "step": 4669 + }, + { + "epoch": 0.26029764227189117, + "grad_norm": 0.5268816351890564, + "learning_rate": 8.561410291856977e-05, + "loss": 1.5865, + "step": 4670 + }, + { + "epoch": 0.26035338052505436, + "grad_norm": 0.545254647731781, + "learning_rate": 8.560785252965131e-05, + "loss": 1.7586, + "step": 4671 + }, + { + "epoch": 0.2604091187782175, + "grad_norm": 0.5406084060668945, + "learning_rate": 8.560160101145937e-05, + "loss": 1.9274, + "step": 4672 + }, + { + "epoch": 0.2604648570313806, + "grad_norm": 0.5519586801528931, + "learning_rate": 8.559534836419224e-05, + "loss": 1.7652, + "step": 4673 + }, + { + "epoch": 0.2605205952845438, + "grad_norm": 0.5398983955383301, + "learning_rate": 8.558909458804818e-05, + "loss": 1.9096, + "step": 4674 + }, + { + "epoch": 0.26057633353770693, + "grad_norm": 0.5414653420448303, + "learning_rate": 8.558283968322555e-05, + "loss": 1.6586, + "step": 4675 + }, + { + "epoch": 0.26063207179087006, + "grad_norm": 0.5628217458724976, + "learning_rate": 8.55765836499227e-05, + "loss": 1.606, + "step": 4676 + }, + { + "epoch": 0.26068781004403324, + "grad_norm": 0.5232682228088379, + "learning_rate": 8.557032648833804e-05, + "loss": 1.698, + "step": 4677 + }, + { + "epoch": 0.26074354829719637, + "grad_norm": 0.588845431804657, + "learning_rate": 8.556406819867001e-05, + "loss": 1.9568, + "step": 4678 + }, + { + "epoch": 0.2607992865503595, + "grad_norm": 0.5363548994064331, + "learning_rate": 8.55578087811171e-05, + "loss": 1.6827, + "step": 4679 + }, + { + "epoch": 0.2608550248035227, + "grad_norm": 0.514584481716156, + "learning_rate": 8.55515482358778e-05, + "loss": 1.631, + "step": 4680 + }, + { + "epoch": 0.2609107630566858, + "grad_norm": 0.5446624159812927, + "learning_rate": 8.554528656315069e-05, + "loss": 1.7978, + "step": 4681 + }, + { + "epoch": 0.26096650130984894, + "grad_norm": 0.5160642266273499, + "learning_rate": 8.55390237631343e-05, + "loss": 1.4935, + "step": 4682 + }, + { + "epoch": 0.26102223956301207, + "grad_norm": 0.5020194053649902, + "learning_rate": 8.553275983602732e-05, + "loss": 1.3459, + "step": 4683 + }, + { + "epoch": 0.26107797781617526, + "grad_norm": 0.5197760462760925, + "learning_rate": 8.552649478202834e-05, + "loss": 1.8008, + "step": 4684 + }, + { + "epoch": 0.2611337160693384, + "grad_norm": 0.5080288648605347, + "learning_rate": 8.55202286013361e-05, + "loss": 1.5853, + "step": 4685 + }, + { + "epoch": 0.2611894543225015, + "grad_norm": 0.5232203602790833, + "learning_rate": 8.551396129414928e-05, + "loss": 1.7352, + "step": 4686 + }, + { + "epoch": 0.2612451925756647, + "grad_norm": 0.5843389630317688, + "learning_rate": 8.550769286066669e-05, + "loss": 1.5833, + "step": 4687 + }, + { + "epoch": 0.2613009308288278, + "grad_norm": 0.5756316184997559, + "learning_rate": 8.55014233010871e-05, + "loss": 1.8692, + "step": 4688 + }, + { + "epoch": 0.26135666908199096, + "grad_norm": 0.5456770658493042, + "learning_rate": 8.549515261560937e-05, + "loss": 1.6987, + "step": 4689 + }, + { + "epoch": 0.26141240733515414, + "grad_norm": 0.5343070030212402, + "learning_rate": 8.548888080443231e-05, + "loss": 1.4492, + "step": 4690 + }, + { + "epoch": 0.26146814558831727, + "grad_norm": 0.546418309211731, + "learning_rate": 8.54826078677549e-05, + "loss": 1.7292, + "step": 4691 + }, + { + "epoch": 0.2615238838414804, + "grad_norm": 0.5571802258491516, + "learning_rate": 8.547633380577604e-05, + "loss": 1.9054, + "step": 4692 + }, + { + "epoch": 0.2615796220946435, + "grad_norm": 0.5529661774635315, + "learning_rate": 8.54700586186947e-05, + "loss": 1.8537, + "step": 4693 + }, + { + "epoch": 0.2616353603478067, + "grad_norm": 0.5503031611442566, + "learning_rate": 8.546378230670992e-05, + "loss": 1.7507, + "step": 4694 + }, + { + "epoch": 0.26169109860096984, + "grad_norm": 0.5290326476097107, + "learning_rate": 8.545750487002073e-05, + "loss": 1.5895, + "step": 4695 + }, + { + "epoch": 0.26174683685413297, + "grad_norm": 0.5247073769569397, + "learning_rate": 8.54512263088262e-05, + "loss": 1.5736, + "step": 4696 + }, + { + "epoch": 0.26180257510729615, + "grad_norm": 0.575093686580658, + "learning_rate": 8.544494662332548e-05, + "loss": 1.5192, + "step": 4697 + }, + { + "epoch": 0.2618583133604593, + "grad_norm": 0.5360473990440369, + "learning_rate": 8.543866581371771e-05, + "loss": 1.7796, + "step": 4698 + }, + { + "epoch": 0.2619140516136224, + "grad_norm": 0.5478860139846802, + "learning_rate": 8.54323838802021e-05, + "loss": 1.756, + "step": 4699 + }, + { + "epoch": 0.2619697898667856, + "grad_norm": 0.5454539060592651, + "learning_rate": 8.542610082297783e-05, + "loss": 1.7589, + "step": 4700 + }, + { + "epoch": 0.2620255281199487, + "grad_norm": 0.5187868475914001, + "learning_rate": 8.541981664224421e-05, + "loss": 1.5043, + "step": 4701 + }, + { + "epoch": 0.26208126637311185, + "grad_norm": 0.5362755060195923, + "learning_rate": 8.54135313382005e-05, + "loss": 1.731, + "step": 4702 + }, + { + "epoch": 0.26213700462627504, + "grad_norm": 0.5599364638328552, + "learning_rate": 8.540724491104606e-05, + "loss": 1.6976, + "step": 4703 + }, + { + "epoch": 0.26219274287943817, + "grad_norm": 0.5924205183982849, + "learning_rate": 8.540095736098026e-05, + "loss": 1.8049, + "step": 4704 + }, + { + "epoch": 0.2622484811326013, + "grad_norm": 0.5288107395172119, + "learning_rate": 8.539466868820247e-05, + "loss": 1.5834, + "step": 4705 + }, + { + "epoch": 0.2623042193857644, + "grad_norm": 0.5498400330543518, + "learning_rate": 8.538837889291218e-05, + "loss": 1.6546, + "step": 4706 + }, + { + "epoch": 0.2623599576389276, + "grad_norm": 0.5080811381340027, + "learning_rate": 8.538208797530883e-05, + "loss": 1.434, + "step": 4707 + }, + { + "epoch": 0.26241569589209074, + "grad_norm": 0.5125556588172913, + "learning_rate": 8.537579593559195e-05, + "loss": 1.6628, + "step": 4708 + }, + { + "epoch": 0.26247143414525387, + "grad_norm": 0.5489838123321533, + "learning_rate": 8.536950277396106e-05, + "loss": 1.5702, + "step": 4709 + }, + { + "epoch": 0.26252717239841705, + "grad_norm": 0.5346508622169495, + "learning_rate": 8.536320849061577e-05, + "loss": 1.7829, + "step": 4710 + }, + { + "epoch": 0.2625829106515802, + "grad_norm": 0.5648466944694519, + "learning_rate": 8.535691308575569e-05, + "loss": 1.8271, + "step": 4711 + }, + { + "epoch": 0.2626386489047433, + "grad_norm": 0.5875536203384399, + "learning_rate": 8.535061655958048e-05, + "loss": 1.888, + "step": 4712 + }, + { + "epoch": 0.2626943871579065, + "grad_norm": 0.5403586626052856, + "learning_rate": 8.534431891228981e-05, + "loss": 1.5633, + "step": 4713 + }, + { + "epoch": 0.2627501254110696, + "grad_norm": 0.5541427135467529, + "learning_rate": 8.533802014408341e-05, + "loss": 1.7778, + "step": 4714 + }, + { + "epoch": 0.26280586366423275, + "grad_norm": 0.5390727519989014, + "learning_rate": 8.533172025516106e-05, + "loss": 1.6732, + "step": 4715 + }, + { + "epoch": 0.2628616019173959, + "grad_norm": 0.5591700077056885, + "learning_rate": 8.532541924572254e-05, + "loss": 1.7714, + "step": 4716 + }, + { + "epoch": 0.26291734017055907, + "grad_norm": 0.5306904911994934, + "learning_rate": 8.531911711596767e-05, + "loss": 1.7311, + "step": 4717 + }, + { + "epoch": 0.2629730784237222, + "grad_norm": 0.5665531158447266, + "learning_rate": 8.531281386609633e-05, + "loss": 1.684, + "step": 4718 + }, + { + "epoch": 0.2630288166768853, + "grad_norm": 0.5404395461082458, + "learning_rate": 8.530650949630844e-05, + "loss": 1.7727, + "step": 4719 + }, + { + "epoch": 0.2630845549300485, + "grad_norm": 0.5549681782722473, + "learning_rate": 8.530020400680392e-05, + "loss": 1.6802, + "step": 4720 + }, + { + "epoch": 0.26314029318321164, + "grad_norm": 0.5529362559318542, + "learning_rate": 8.529389739778272e-05, + "loss": 1.6691, + "step": 4721 + }, + { + "epoch": 0.26319603143637477, + "grad_norm": 0.5257294178009033, + "learning_rate": 8.528758966944489e-05, + "loss": 1.6649, + "step": 4722 + }, + { + "epoch": 0.26325176968953795, + "grad_norm": 0.5499683022499084, + "learning_rate": 8.528128082199046e-05, + "loss": 1.8637, + "step": 4723 + }, + { + "epoch": 0.2633075079427011, + "grad_norm": 0.5676036477088928, + "learning_rate": 8.527497085561949e-05, + "loss": 1.6409, + "step": 4724 + }, + { + "epoch": 0.2633632461958642, + "grad_norm": 0.5784804821014404, + "learning_rate": 8.526865977053211e-05, + "loss": 1.8414, + "step": 4725 + }, + { + "epoch": 0.2634189844490274, + "grad_norm": 0.592461884021759, + "learning_rate": 8.52623475669285e-05, + "loss": 1.725, + "step": 4726 + }, + { + "epoch": 0.2634747227021905, + "grad_norm": 0.5251427888870239, + "learning_rate": 8.52560342450088e-05, + "loss": 1.5888, + "step": 4727 + }, + { + "epoch": 0.26353046095535365, + "grad_norm": 0.5062176585197449, + "learning_rate": 8.524971980497325e-05, + "loss": 1.5588, + "step": 4728 + }, + { + "epoch": 0.2635861992085168, + "grad_norm": 0.5686171054840088, + "learning_rate": 8.524340424702211e-05, + "loss": 1.6186, + "step": 4729 + }, + { + "epoch": 0.26364193746167996, + "grad_norm": 0.5521769523620605, + "learning_rate": 8.523708757135567e-05, + "loss": 1.6917, + "step": 4730 + }, + { + "epoch": 0.2636976757148431, + "grad_norm": 0.5489006042480469, + "learning_rate": 8.523076977817426e-05, + "loss": 1.8079, + "step": 4731 + }, + { + "epoch": 0.2637534139680062, + "grad_norm": 0.5295306444168091, + "learning_rate": 8.522445086767826e-05, + "loss": 1.6814, + "step": 4732 + }, + { + "epoch": 0.2638091522211694, + "grad_norm": 0.5596312284469604, + "learning_rate": 8.521813084006802e-05, + "loss": 1.7971, + "step": 4733 + }, + { + "epoch": 0.26386489047433254, + "grad_norm": 0.535030722618103, + "learning_rate": 8.5211809695544e-05, + "loss": 1.6389, + "step": 4734 + }, + { + "epoch": 0.26392062872749567, + "grad_norm": 0.5560666918754578, + "learning_rate": 8.520548743430673e-05, + "loss": 1.8107, + "step": 4735 + }, + { + "epoch": 0.26397636698065885, + "grad_norm": 0.5749865770339966, + "learning_rate": 8.51991640565566e-05, + "loss": 1.7698, + "step": 4736 + }, + { + "epoch": 0.264032105233822, + "grad_norm": 0.603252649307251, + "learning_rate": 8.519283956249424e-05, + "loss": 1.9701, + "step": 4737 + }, + { + "epoch": 0.2640878434869851, + "grad_norm": 0.562053918838501, + "learning_rate": 8.51865139523202e-05, + "loss": 1.7033, + "step": 4738 + }, + { + "epoch": 0.26414358174014824, + "grad_norm": 0.5553662776947021, + "learning_rate": 8.518018722623509e-05, + "loss": 1.6353, + "step": 4739 + }, + { + "epoch": 0.2641993199933114, + "grad_norm": 0.5916672945022583, + "learning_rate": 8.517385938443955e-05, + "loss": 1.8496, + "step": 4740 + }, + { + "epoch": 0.26425505824647455, + "grad_norm": 0.549395740032196, + "learning_rate": 8.516753042713426e-05, + "loss": 1.612, + "step": 4741 + }, + { + "epoch": 0.2643107964996377, + "grad_norm": 0.5560966730117798, + "learning_rate": 8.516120035451996e-05, + "loss": 1.5978, + "step": 4742 + }, + { + "epoch": 0.26436653475280086, + "grad_norm": 0.5934261679649353, + "learning_rate": 8.515486916679738e-05, + "loss": 1.9667, + "step": 4743 + }, + { + "epoch": 0.264422273005964, + "grad_norm": 0.5441667437553406, + "learning_rate": 8.514853686416732e-05, + "loss": 1.639, + "step": 4744 + }, + { + "epoch": 0.2644780112591271, + "grad_norm": 0.5780582427978516, + "learning_rate": 8.51422034468306e-05, + "loss": 1.6839, + "step": 4745 + }, + { + "epoch": 0.2645337495122903, + "grad_norm": 0.5739880204200745, + "learning_rate": 8.513586891498809e-05, + "loss": 1.6927, + "step": 4746 + }, + { + "epoch": 0.26458948776545343, + "grad_norm": 0.5097702145576477, + "learning_rate": 8.512953326884066e-05, + "loss": 1.5131, + "step": 4747 + }, + { + "epoch": 0.26464522601861656, + "grad_norm": 0.5593822598457336, + "learning_rate": 8.512319650858926e-05, + "loss": 1.8373, + "step": 4748 + }, + { + "epoch": 0.26470096427177975, + "grad_norm": 0.546627938747406, + "learning_rate": 8.511685863443484e-05, + "loss": 1.723, + "step": 4749 + }, + { + "epoch": 0.2647567025249429, + "grad_norm": 0.5196560621261597, + "learning_rate": 8.511051964657842e-05, + "loss": 1.6108, + "step": 4750 + }, + { + "epoch": 0.264812440778106, + "grad_norm": 0.548095166683197, + "learning_rate": 8.510417954522102e-05, + "loss": 1.6268, + "step": 4751 + }, + { + "epoch": 0.26486817903126914, + "grad_norm": 0.5570634007453918, + "learning_rate": 8.509783833056373e-05, + "loss": 1.828, + "step": 4752 + }, + { + "epoch": 0.2649239172844323, + "grad_norm": 0.5177022814750671, + "learning_rate": 8.509149600280762e-05, + "loss": 1.6537, + "step": 4753 + }, + { + "epoch": 0.26497965553759545, + "grad_norm": 0.5529354810714722, + "learning_rate": 8.508515256215389e-05, + "loss": 1.6702, + "step": 4754 + }, + { + "epoch": 0.2650353937907586, + "grad_norm": 0.6287319660186768, + "learning_rate": 8.507880800880364e-05, + "loss": 1.7545, + "step": 4755 + }, + { + "epoch": 0.26509113204392176, + "grad_norm": 0.5878986716270447, + "learning_rate": 8.507246234295814e-05, + "loss": 1.9199, + "step": 4756 + }, + { + "epoch": 0.2651468702970849, + "grad_norm": 0.560119092464447, + "learning_rate": 8.506611556481862e-05, + "loss": 1.645, + "step": 4757 + }, + { + "epoch": 0.265202608550248, + "grad_norm": 0.5107282996177673, + "learning_rate": 8.505976767458636e-05, + "loss": 1.8503, + "step": 4758 + }, + { + "epoch": 0.2652583468034112, + "grad_norm": 0.5514339208602905, + "learning_rate": 8.50534186724627e-05, + "loss": 1.6562, + "step": 4759 + }, + { + "epoch": 0.26531408505657433, + "grad_norm": 0.541807234287262, + "learning_rate": 8.504706855864897e-05, + "loss": 1.7167, + "step": 4760 + }, + { + "epoch": 0.26536982330973746, + "grad_norm": 0.5748420357704163, + "learning_rate": 8.504071733334656e-05, + "loss": 1.955, + "step": 4761 + }, + { + "epoch": 0.2654255615629006, + "grad_norm": 0.5451623201370239, + "learning_rate": 8.503436499675687e-05, + "loss": 1.7336, + "step": 4762 + }, + { + "epoch": 0.2654812998160638, + "grad_norm": 0.5036576986312866, + "learning_rate": 8.502801154908142e-05, + "loss": 1.7619, + "step": 4763 + }, + { + "epoch": 0.2655370380692269, + "grad_norm": 0.5252074003219604, + "learning_rate": 8.502165699052168e-05, + "loss": 1.6425, + "step": 4764 + }, + { + "epoch": 0.26559277632239003, + "grad_norm": 0.5452297925949097, + "learning_rate": 8.501530132127915e-05, + "loss": 1.5942, + "step": 4765 + }, + { + "epoch": 0.2656485145755532, + "grad_norm": 0.5282885432243347, + "learning_rate": 8.500894454155541e-05, + "loss": 1.4847, + "step": 4766 + }, + { + "epoch": 0.26570425282871635, + "grad_norm": 0.6032153367996216, + "learning_rate": 8.500258665155207e-05, + "loss": 1.8069, + "step": 4767 + }, + { + "epoch": 0.2657599910818795, + "grad_norm": 0.6232243776321411, + "learning_rate": 8.499622765147078e-05, + "loss": 1.9243, + "step": 4768 + }, + { + "epoch": 0.26581572933504266, + "grad_norm": 0.5226832032203674, + "learning_rate": 8.498986754151316e-05, + "loss": 1.5832, + "step": 4769 + }, + { + "epoch": 0.2658714675882058, + "grad_norm": 0.653657853603363, + "learning_rate": 8.498350632188097e-05, + "loss": 1.7387, + "step": 4770 + }, + { + "epoch": 0.2659272058413689, + "grad_norm": 0.6087796688079834, + "learning_rate": 8.497714399277592e-05, + "loss": 1.7853, + "step": 4771 + }, + { + "epoch": 0.2659829440945321, + "grad_norm": 0.5050531029701233, + "learning_rate": 8.49707805543998e-05, + "loss": 1.4848, + "step": 4772 + }, + { + "epoch": 0.26603868234769523, + "grad_norm": 0.5245751738548279, + "learning_rate": 8.496441600695441e-05, + "loss": 1.615, + "step": 4773 + }, + { + "epoch": 0.26609442060085836, + "grad_norm": 0.5427295565605164, + "learning_rate": 8.495805035064159e-05, + "loss": 1.8508, + "step": 4774 + }, + { + "epoch": 0.2661501588540215, + "grad_norm": 0.5052759647369385, + "learning_rate": 8.495168358566325e-05, + "loss": 1.6307, + "step": 4775 + }, + { + "epoch": 0.2662058971071847, + "grad_norm": 0.5618288516998291, + "learning_rate": 8.494531571222128e-05, + "loss": 1.7516, + "step": 4776 + }, + { + "epoch": 0.2662616353603478, + "grad_norm": 0.5743941068649292, + "learning_rate": 8.493894673051765e-05, + "loss": 1.9439, + "step": 4777 + }, + { + "epoch": 0.26631737361351093, + "grad_norm": 0.5246620178222656, + "learning_rate": 8.493257664075433e-05, + "loss": 1.7159, + "step": 4778 + }, + { + "epoch": 0.2663731118666741, + "grad_norm": 0.5409666895866394, + "learning_rate": 8.492620544313335e-05, + "loss": 1.6972, + "step": 4779 + }, + { + "epoch": 0.26642885011983725, + "grad_norm": 0.5137554407119751, + "learning_rate": 8.491983313785676e-05, + "loss": 1.6285, + "step": 4780 + }, + { + "epoch": 0.2664845883730004, + "grad_norm": 0.6102763414382935, + "learning_rate": 8.491345972512668e-05, + "loss": 1.7433, + "step": 4781 + }, + { + "epoch": 0.26654032662616356, + "grad_norm": 0.6035791039466858, + "learning_rate": 8.490708520514519e-05, + "loss": 1.8665, + "step": 4782 + }, + { + "epoch": 0.2665960648793267, + "grad_norm": 0.5769240856170654, + "learning_rate": 8.490070957811449e-05, + "loss": 1.7147, + "step": 4783 + }, + { + "epoch": 0.2666518031324898, + "grad_norm": 0.5191882252693176, + "learning_rate": 8.489433284423678e-05, + "loss": 1.5935, + "step": 4784 + }, + { + "epoch": 0.26670754138565295, + "grad_norm": 0.575363039970398, + "learning_rate": 8.488795500371427e-05, + "loss": 1.8616, + "step": 4785 + }, + { + "epoch": 0.26676327963881613, + "grad_norm": 0.5380163788795471, + "learning_rate": 8.488157605674925e-05, + "loss": 1.5693, + "step": 4786 + }, + { + "epoch": 0.26681901789197926, + "grad_norm": 0.5527309775352478, + "learning_rate": 8.487519600354399e-05, + "loss": 1.797, + "step": 4787 + }, + { + "epoch": 0.2668747561451424, + "grad_norm": 0.5432277321815491, + "learning_rate": 8.486881484430085e-05, + "loss": 1.7024, + "step": 4788 + }, + { + "epoch": 0.2669304943983056, + "grad_norm": 0.5643296837806702, + "learning_rate": 8.486243257922221e-05, + "loss": 1.6602, + "step": 4789 + }, + { + "epoch": 0.2669862326514687, + "grad_norm": 0.5539331436157227, + "learning_rate": 8.485604920851049e-05, + "loss": 1.7195, + "step": 4790 + }, + { + "epoch": 0.26704197090463183, + "grad_norm": 0.5279936790466309, + "learning_rate": 8.48496647323681e-05, + "loss": 1.6503, + "step": 4791 + }, + { + "epoch": 0.267097709157795, + "grad_norm": 0.5447912812232971, + "learning_rate": 8.484327915099752e-05, + "loss": 1.7975, + "step": 4792 + }, + { + "epoch": 0.26715344741095814, + "grad_norm": 0.6047879457473755, + "learning_rate": 8.48368924646013e-05, + "loss": 1.8362, + "step": 4793 + }, + { + "epoch": 0.2672091856641213, + "grad_norm": 0.5555823445320129, + "learning_rate": 8.483050467338194e-05, + "loss": 1.7033, + "step": 4794 + }, + { + "epoch": 0.26726492391728446, + "grad_norm": 0.5324097871780396, + "learning_rate": 8.482411577754205e-05, + "loss": 1.828, + "step": 4795 + }, + { + "epoch": 0.2673206621704476, + "grad_norm": 0.5133151412010193, + "learning_rate": 8.481772577728426e-05, + "loss": 1.6922, + "step": 4796 + }, + { + "epoch": 0.2673764004236107, + "grad_norm": 0.5466338396072388, + "learning_rate": 8.48113346728112e-05, + "loss": 1.7228, + "step": 4797 + }, + { + "epoch": 0.26743213867677385, + "grad_norm": 0.5190402269363403, + "learning_rate": 8.480494246432557e-05, + "loss": 1.7192, + "step": 4798 + }, + { + "epoch": 0.26748787692993703, + "grad_norm": 0.4959962069988251, + "learning_rate": 8.47985491520301e-05, + "loss": 1.5593, + "step": 4799 + }, + { + "epoch": 0.26754361518310016, + "grad_norm": 0.5530042052268982, + "learning_rate": 8.479215473612754e-05, + "loss": 1.7545, + "step": 4800 + }, + { + "epoch": 0.2675993534362633, + "grad_norm": 0.6360591650009155, + "learning_rate": 8.478575921682066e-05, + "loss": 1.9369, + "step": 4801 + }, + { + "epoch": 0.26765509168942647, + "grad_norm": 0.5604984164237976, + "learning_rate": 8.477936259431235e-05, + "loss": 1.6485, + "step": 4802 + }, + { + "epoch": 0.2677108299425896, + "grad_norm": 0.568709671497345, + "learning_rate": 8.477296486880541e-05, + "loss": 1.6459, + "step": 4803 + }, + { + "epoch": 0.26776656819575273, + "grad_norm": 0.6228764653205872, + "learning_rate": 8.476656604050277e-05, + "loss": 1.8825, + "step": 4804 + }, + { + "epoch": 0.2678223064489159, + "grad_norm": 0.5803889036178589, + "learning_rate": 8.476016610960736e-05, + "loss": 1.8011, + "step": 4805 + }, + { + "epoch": 0.26787804470207904, + "grad_norm": 0.5778336524963379, + "learning_rate": 8.475376507632215e-05, + "loss": 1.726, + "step": 4806 + }, + { + "epoch": 0.2679337829552422, + "grad_norm": 0.5755890011787415, + "learning_rate": 8.474736294085014e-05, + "loss": 1.6394, + "step": 4807 + }, + { + "epoch": 0.2679895212084053, + "grad_norm": 0.5545676350593567, + "learning_rate": 8.474095970339436e-05, + "loss": 1.7973, + "step": 4808 + }, + { + "epoch": 0.2680452594615685, + "grad_norm": 0.5003368854522705, + "learning_rate": 8.473455536415789e-05, + "loss": 1.6653, + "step": 4809 + }, + { + "epoch": 0.2681009977147316, + "grad_norm": 0.5292695164680481, + "learning_rate": 8.472814992334386e-05, + "loss": 1.7463, + "step": 4810 + }, + { + "epoch": 0.26815673596789474, + "grad_norm": 0.604960560798645, + "learning_rate": 8.472174338115537e-05, + "loss": 1.9016, + "step": 4811 + }, + { + "epoch": 0.26821247422105793, + "grad_norm": 0.5484800338745117, + "learning_rate": 8.471533573779564e-05, + "loss": 1.6117, + "step": 4812 + }, + { + "epoch": 0.26826821247422106, + "grad_norm": 0.5383596420288086, + "learning_rate": 8.470892699346786e-05, + "loss": 1.6871, + "step": 4813 + }, + { + "epoch": 0.2683239507273842, + "grad_norm": 0.5479928851127625, + "learning_rate": 8.470251714837529e-05, + "loss": 1.7255, + "step": 4814 + }, + { + "epoch": 0.26837968898054737, + "grad_norm": 0.5112576484680176, + "learning_rate": 8.46961062027212e-05, + "loss": 1.414, + "step": 4815 + }, + { + "epoch": 0.2684354272337105, + "grad_norm": 0.547825038433075, + "learning_rate": 8.46896941567089e-05, + "loss": 1.835, + "step": 4816 + }, + { + "epoch": 0.26849116548687363, + "grad_norm": 0.5121808648109436, + "learning_rate": 8.468328101054177e-05, + "loss": 1.5269, + "step": 4817 + }, + { + "epoch": 0.2685469037400368, + "grad_norm": 0.5761928558349609, + "learning_rate": 8.467686676442318e-05, + "loss": 1.7195, + "step": 4818 + }, + { + "epoch": 0.26860264199319994, + "grad_norm": 0.547089159488678, + "learning_rate": 8.467045141855656e-05, + "loss": 1.6714, + "step": 4819 + }, + { + "epoch": 0.26865838024636307, + "grad_norm": 0.5228059887886047, + "learning_rate": 8.466403497314537e-05, + "loss": 1.6444, + "step": 4820 + }, + { + "epoch": 0.2687141184995262, + "grad_norm": 0.5589326620101929, + "learning_rate": 8.465761742839307e-05, + "loss": 1.9121, + "step": 4821 + }, + { + "epoch": 0.2687698567526894, + "grad_norm": 0.5607814192771912, + "learning_rate": 8.465119878450324e-05, + "loss": 1.8351, + "step": 4822 + }, + { + "epoch": 0.2688255950058525, + "grad_norm": 0.591454029083252, + "learning_rate": 8.46447790416794e-05, + "loss": 1.8308, + "step": 4823 + }, + { + "epoch": 0.26888133325901564, + "grad_norm": 0.5167153477668762, + "learning_rate": 8.463835820012517e-05, + "loss": 1.6928, + "step": 4824 + }, + { + "epoch": 0.2689370715121788, + "grad_norm": 0.5741368532180786, + "learning_rate": 8.463193626004418e-05, + "loss": 1.8407, + "step": 4825 + }, + { + "epoch": 0.26899280976534196, + "grad_norm": 0.563448965549469, + "learning_rate": 8.462551322164007e-05, + "loss": 1.7246, + "step": 4826 + }, + { + "epoch": 0.2690485480185051, + "grad_norm": 0.5690648555755615, + "learning_rate": 8.461908908511657e-05, + "loss": 1.7408, + "step": 4827 + }, + { + "epoch": 0.26910428627166827, + "grad_norm": 0.5448554754257202, + "learning_rate": 8.461266385067741e-05, + "loss": 1.6012, + "step": 4828 + }, + { + "epoch": 0.2691600245248314, + "grad_norm": 0.5054116249084473, + "learning_rate": 8.460623751852637e-05, + "loss": 1.6175, + "step": 4829 + }, + { + "epoch": 0.2692157627779945, + "grad_norm": 0.5798751711845398, + "learning_rate": 8.459981008886721e-05, + "loss": 1.7742, + "step": 4830 + }, + { + "epoch": 0.26927150103115766, + "grad_norm": 0.5339779257774353, + "learning_rate": 8.459338156190384e-05, + "loss": 1.6737, + "step": 4831 + }, + { + "epoch": 0.26932723928432084, + "grad_norm": 0.5387359261512756, + "learning_rate": 8.45869519378401e-05, + "loss": 1.6606, + "step": 4832 + }, + { + "epoch": 0.26938297753748397, + "grad_norm": 0.646202802658081, + "learning_rate": 8.458052121687987e-05, + "loss": 1.9741, + "step": 4833 + }, + { + "epoch": 0.2694387157906471, + "grad_norm": 0.5640881061553955, + "learning_rate": 8.457408939922715e-05, + "loss": 1.7103, + "step": 4834 + }, + { + "epoch": 0.2694944540438103, + "grad_norm": 0.567292332649231, + "learning_rate": 8.456765648508589e-05, + "loss": 1.7605, + "step": 4835 + }, + { + "epoch": 0.2695501922969734, + "grad_norm": 0.6057398319244385, + "learning_rate": 8.456122247466009e-05, + "loss": 1.6074, + "step": 4836 + }, + { + "epoch": 0.26960593055013654, + "grad_norm": 0.6216564178466797, + "learning_rate": 8.455478736815385e-05, + "loss": 1.6341, + "step": 4837 + }, + { + "epoch": 0.2696616688032997, + "grad_norm": 0.53920978307724, + "learning_rate": 8.454835116577122e-05, + "loss": 1.792, + "step": 4838 + }, + { + "epoch": 0.26971740705646285, + "grad_norm": 0.5827376842498779, + "learning_rate": 8.45419138677163e-05, + "loss": 1.5826, + "step": 4839 + }, + { + "epoch": 0.269773145309626, + "grad_norm": 0.5303118228912354, + "learning_rate": 8.453547547419329e-05, + "loss": 1.7387, + "step": 4840 + }, + { + "epoch": 0.26982888356278917, + "grad_norm": 0.5183376669883728, + "learning_rate": 8.452903598540634e-05, + "loss": 1.532, + "step": 4841 + }, + { + "epoch": 0.2698846218159523, + "grad_norm": 0.5537537336349487, + "learning_rate": 8.452259540155968e-05, + "loss": 1.7955, + "step": 4842 + }, + { + "epoch": 0.2699403600691154, + "grad_norm": 0.5679836273193359, + "learning_rate": 8.451615372285758e-05, + "loss": 1.7329, + "step": 4843 + }, + { + "epoch": 0.26999609832227855, + "grad_norm": 0.5696743726730347, + "learning_rate": 8.450971094950433e-05, + "loss": 1.7294, + "step": 4844 + }, + { + "epoch": 0.27005183657544174, + "grad_norm": 0.5818564295768738, + "learning_rate": 8.450326708170426e-05, + "loss": 2.0301, + "step": 4845 + }, + { + "epoch": 0.27010757482860487, + "grad_norm": 0.5044540762901306, + "learning_rate": 8.449682211966172e-05, + "loss": 1.5171, + "step": 4846 + }, + { + "epoch": 0.270163313081768, + "grad_norm": 0.5692309141159058, + "learning_rate": 8.449037606358111e-05, + "loss": 1.776, + "step": 4847 + }, + { + "epoch": 0.2702190513349312, + "grad_norm": 0.5652437210083008, + "learning_rate": 8.448392891366688e-05, + "loss": 1.8956, + "step": 4848 + }, + { + "epoch": 0.2702747895880943, + "grad_norm": 0.5531434416770935, + "learning_rate": 8.447748067012345e-05, + "loss": 1.7156, + "step": 4849 + }, + { + "epoch": 0.27033052784125744, + "grad_norm": 0.5418469309806824, + "learning_rate": 8.447103133315537e-05, + "loss": 1.6983, + "step": 4850 + }, + { + "epoch": 0.2703862660944206, + "grad_norm": 0.5276792049407959, + "learning_rate": 8.446458090296716e-05, + "loss": 1.6147, + "step": 4851 + }, + { + "epoch": 0.27044200434758375, + "grad_norm": 0.5772181749343872, + "learning_rate": 8.445812937976338e-05, + "loss": 1.677, + "step": 4852 + }, + { + "epoch": 0.2704977426007469, + "grad_norm": 0.5323836803436279, + "learning_rate": 8.445167676374865e-05, + "loss": 1.4833, + "step": 4853 + }, + { + "epoch": 0.27055348085391, + "grad_norm": 0.5478299260139465, + "learning_rate": 8.444522305512757e-05, + "loss": 1.5832, + "step": 4854 + }, + { + "epoch": 0.2706092191070732, + "grad_norm": 0.5325939655303955, + "learning_rate": 8.443876825410488e-05, + "loss": 1.4971, + "step": 4855 + }, + { + "epoch": 0.2706649573602363, + "grad_norm": 0.5912976861000061, + "learning_rate": 8.443231236088524e-05, + "loss": 1.7624, + "step": 4856 + }, + { + "epoch": 0.27072069561339945, + "grad_norm": 0.5368456244468689, + "learning_rate": 8.44258553756734e-05, + "loss": 1.5509, + "step": 4857 + }, + { + "epoch": 0.27077643386656264, + "grad_norm": 0.5713909864425659, + "learning_rate": 8.441939729867415e-05, + "loss": 1.8286, + "step": 4858 + }, + { + "epoch": 0.27083217211972577, + "grad_norm": 0.5259481072425842, + "learning_rate": 8.44129381300923e-05, + "loss": 1.7291, + "step": 4859 + }, + { + "epoch": 0.2708879103728889, + "grad_norm": 0.5365427136421204, + "learning_rate": 8.440647787013268e-05, + "loss": 1.6051, + "step": 4860 + }, + { + "epoch": 0.2709436486260521, + "grad_norm": 0.5223046541213989, + "learning_rate": 8.44000165190002e-05, + "loss": 1.5241, + "step": 4861 + }, + { + "epoch": 0.2709993868792152, + "grad_norm": 0.5721556544303894, + "learning_rate": 8.439355407689975e-05, + "loss": 1.8138, + "step": 4862 + }, + { + "epoch": 0.27105512513237834, + "grad_norm": 0.527158260345459, + "learning_rate": 8.43870905440363e-05, + "loss": 1.5114, + "step": 4863 + }, + { + "epoch": 0.2711108633855415, + "grad_norm": 0.5364054441452026, + "learning_rate": 8.438062592061485e-05, + "loss": 1.5331, + "step": 4864 + }, + { + "epoch": 0.27116660163870465, + "grad_norm": 0.5465856790542603, + "learning_rate": 8.437416020684036e-05, + "loss": 1.5122, + "step": 4865 + }, + { + "epoch": 0.2712223398918678, + "grad_norm": 0.5655773282051086, + "learning_rate": 8.436769340291794e-05, + "loss": 1.8776, + "step": 4866 + }, + { + "epoch": 0.2712780781450309, + "grad_norm": 0.5278435349464417, + "learning_rate": 8.436122550905266e-05, + "loss": 1.6388, + "step": 4867 + }, + { + "epoch": 0.2713338163981941, + "grad_norm": 0.5141345262527466, + "learning_rate": 8.435475652544967e-05, + "loss": 1.5203, + "step": 4868 + }, + { + "epoch": 0.2713895546513572, + "grad_norm": 0.5731988549232483, + "learning_rate": 8.434828645231407e-05, + "loss": 1.8796, + "step": 4869 + }, + { + "epoch": 0.27144529290452035, + "grad_norm": 0.5262272357940674, + "learning_rate": 8.434181528985112e-05, + "loss": 1.711, + "step": 4870 + }, + { + "epoch": 0.27150103115768354, + "grad_norm": 0.5410183668136597, + "learning_rate": 8.4335343038266e-05, + "loss": 1.5739, + "step": 4871 + }, + { + "epoch": 0.27155676941084667, + "grad_norm": 0.5376774072647095, + "learning_rate": 8.432886969776398e-05, + "loss": 1.7037, + "step": 4872 + }, + { + "epoch": 0.2716125076640098, + "grad_norm": 0.4998942017555237, + "learning_rate": 8.432239526855036e-05, + "loss": 1.566, + "step": 4873 + }, + { + "epoch": 0.271668245917173, + "grad_norm": 0.562468945980072, + "learning_rate": 8.431591975083049e-05, + "loss": 1.7742, + "step": 4874 + }, + { + "epoch": 0.2717239841703361, + "grad_norm": 0.5608972907066345, + "learning_rate": 8.430944314480973e-05, + "loss": 1.7467, + "step": 4875 + }, + { + "epoch": 0.27177972242349924, + "grad_norm": 0.6075250506401062, + "learning_rate": 8.430296545069345e-05, + "loss": 1.5414, + "step": 4876 + }, + { + "epoch": 0.27183546067666237, + "grad_norm": 0.5488311052322388, + "learning_rate": 8.429648666868713e-05, + "loss": 1.7401, + "step": 4877 + }, + { + "epoch": 0.27189119892982555, + "grad_norm": 0.5740364193916321, + "learning_rate": 8.429000679899619e-05, + "loss": 1.6739, + "step": 4878 + }, + { + "epoch": 0.2719469371829887, + "grad_norm": 0.5271220207214355, + "learning_rate": 8.428352584182617e-05, + "loss": 1.6982, + "step": 4879 + }, + { + "epoch": 0.2720026754361518, + "grad_norm": 0.5354405045509338, + "learning_rate": 8.42770437973826e-05, + "loss": 1.6927, + "step": 4880 + }, + { + "epoch": 0.272058413689315, + "grad_norm": 0.569052517414093, + "learning_rate": 8.427056066587105e-05, + "loss": 1.6674, + "step": 4881 + }, + { + "epoch": 0.2721141519424781, + "grad_norm": 0.5651227831840515, + "learning_rate": 8.426407644749711e-05, + "loss": 1.8356, + "step": 4882 + }, + { + "epoch": 0.27216989019564125, + "grad_norm": 0.5364747643470764, + "learning_rate": 8.425759114246647e-05, + "loss": 1.749, + "step": 4883 + }, + { + "epoch": 0.27222562844880444, + "grad_norm": 0.48416903614997864, + "learning_rate": 8.425110475098476e-05, + "loss": 1.4771, + "step": 4884 + }, + { + "epoch": 0.27228136670196756, + "grad_norm": 0.5686883926391602, + "learning_rate": 8.42446172732577e-05, + "loss": 1.6603, + "step": 4885 + }, + { + "epoch": 0.2723371049551307, + "grad_norm": 0.5875502824783325, + "learning_rate": 8.423812870949104e-05, + "loss": 1.8797, + "step": 4886 + }, + { + "epoch": 0.2723928432082939, + "grad_norm": 0.5201019644737244, + "learning_rate": 8.423163905989055e-05, + "loss": 1.649, + "step": 4887 + }, + { + "epoch": 0.272448581461457, + "grad_norm": 0.566376268863678, + "learning_rate": 8.422514832466206e-05, + "loss": 1.7182, + "step": 4888 + }, + { + "epoch": 0.27250431971462014, + "grad_norm": 0.5158393979072571, + "learning_rate": 8.421865650401143e-05, + "loss": 1.6317, + "step": 4889 + }, + { + "epoch": 0.27256005796778326, + "grad_norm": 0.5439308881759644, + "learning_rate": 8.421216359814451e-05, + "loss": 1.7071, + "step": 4890 + }, + { + "epoch": 0.27261579622094645, + "grad_norm": 0.5321268439292908, + "learning_rate": 8.420566960726723e-05, + "loss": 1.6561, + "step": 4891 + }, + { + "epoch": 0.2726715344741096, + "grad_norm": 0.4758521616458893, + "learning_rate": 8.419917453158554e-05, + "loss": 1.5538, + "step": 4892 + }, + { + "epoch": 0.2727272727272727, + "grad_norm": 0.4964730441570282, + "learning_rate": 8.419267837130544e-05, + "loss": 1.5957, + "step": 4893 + }, + { + "epoch": 0.2727830109804359, + "grad_norm": 0.555168628692627, + "learning_rate": 8.418618112663292e-05, + "loss": 1.6552, + "step": 4894 + }, + { + "epoch": 0.272838749233599, + "grad_norm": 0.5903061032295227, + "learning_rate": 8.417968279777409e-05, + "loss": 1.8649, + "step": 4895 + }, + { + "epoch": 0.27289448748676215, + "grad_norm": 0.584933876991272, + "learning_rate": 8.417318338493497e-05, + "loss": 1.8317, + "step": 4896 + }, + { + "epoch": 0.27295022573992533, + "grad_norm": 0.6088751554489136, + "learning_rate": 8.416668288832173e-05, + "loss": 1.775, + "step": 4897 + }, + { + "epoch": 0.27300596399308846, + "grad_norm": 0.6300697326660156, + "learning_rate": 8.41601813081405e-05, + "loss": 1.9256, + "step": 4898 + }, + { + "epoch": 0.2730617022462516, + "grad_norm": 0.5516534447669983, + "learning_rate": 8.415367864459751e-05, + "loss": 1.6553, + "step": 4899 + }, + { + "epoch": 0.2731174404994147, + "grad_norm": 0.5985352993011475, + "learning_rate": 8.414717489789894e-05, + "loss": 1.8121, + "step": 4900 + }, + { + "epoch": 0.2731731787525779, + "grad_norm": 0.5280508399009705, + "learning_rate": 8.414067006825108e-05, + "loss": 1.657, + "step": 4901 + }, + { + "epoch": 0.27322891700574103, + "grad_norm": 0.6586048007011414, + "learning_rate": 8.413416415586024e-05, + "loss": 2.2447, + "step": 4902 + }, + { + "epoch": 0.27328465525890416, + "grad_norm": 0.5527061223983765, + "learning_rate": 8.412765716093272e-05, + "loss": 1.5666, + "step": 4903 + }, + { + "epoch": 0.27334039351206735, + "grad_norm": 0.5549877882003784, + "learning_rate": 8.412114908367488e-05, + "loss": 1.5972, + "step": 4904 + }, + { + "epoch": 0.2733961317652305, + "grad_norm": 0.5879062414169312, + "learning_rate": 8.411463992429314e-05, + "loss": 1.8609, + "step": 4905 + }, + { + "epoch": 0.2734518700183936, + "grad_norm": 0.5397518873214722, + "learning_rate": 8.41081296829939e-05, + "loss": 1.8211, + "step": 4906 + }, + { + "epoch": 0.2735076082715568, + "grad_norm": 0.5364968776702881, + "learning_rate": 8.410161835998369e-05, + "loss": 1.7879, + "step": 4907 + }, + { + "epoch": 0.2735633465247199, + "grad_norm": 0.5714520215988159, + "learning_rate": 8.409510595546894e-05, + "loss": 1.9543, + "step": 4908 + }, + { + "epoch": 0.27361908477788305, + "grad_norm": 0.5671858787536621, + "learning_rate": 8.408859246965623e-05, + "loss": 1.8165, + "step": 4909 + }, + { + "epoch": 0.27367482303104623, + "grad_norm": 0.6034393906593323, + "learning_rate": 8.408207790275213e-05, + "loss": 1.8084, + "step": 4910 + }, + { + "epoch": 0.27373056128420936, + "grad_norm": 0.5954535007476807, + "learning_rate": 8.407556225496322e-05, + "loss": 1.782, + "step": 4911 + }, + { + "epoch": 0.2737862995373725, + "grad_norm": 0.5597085952758789, + "learning_rate": 8.406904552649614e-05, + "loss": 1.7673, + "step": 4912 + }, + { + "epoch": 0.2738420377905356, + "grad_norm": 0.7730258107185364, + "learning_rate": 8.406252771755758e-05, + "loss": 1.9742, + "step": 4913 + }, + { + "epoch": 0.2738977760436988, + "grad_norm": 0.5349806547164917, + "learning_rate": 8.405600882835425e-05, + "loss": 1.6226, + "step": 4914 + }, + { + "epoch": 0.27395351429686193, + "grad_norm": 0.5271722674369812, + "learning_rate": 8.404948885909288e-05, + "loss": 1.7948, + "step": 4915 + }, + { + "epoch": 0.27400925255002506, + "grad_norm": 0.6604454517364502, + "learning_rate": 8.404296780998022e-05, + "loss": 1.5653, + "step": 4916 + }, + { + "epoch": 0.27406499080318825, + "grad_norm": 0.5219733119010925, + "learning_rate": 8.403644568122313e-05, + "loss": 1.6596, + "step": 4917 + }, + { + "epoch": 0.2741207290563514, + "grad_norm": 0.5320934653282166, + "learning_rate": 8.402992247302842e-05, + "loss": 1.7119, + "step": 4918 + }, + { + "epoch": 0.2741764673095145, + "grad_norm": 0.5232207179069519, + "learning_rate": 8.402339818560296e-05, + "loss": 1.7161, + "step": 4919 + }, + { + "epoch": 0.2742322055626777, + "grad_norm": 0.5363631844520569, + "learning_rate": 8.401687281915371e-05, + "loss": 1.7174, + "step": 4920 + }, + { + "epoch": 0.2742879438158408, + "grad_norm": 0.5237067937850952, + "learning_rate": 8.401034637388758e-05, + "loss": 1.5517, + "step": 4921 + }, + { + "epoch": 0.27434368206900395, + "grad_norm": 0.5529504418373108, + "learning_rate": 8.400381885001155e-05, + "loss": 1.7067, + "step": 4922 + }, + { + "epoch": 0.2743994203221671, + "grad_norm": 0.5712334513664246, + "learning_rate": 8.399729024773264e-05, + "loss": 1.7333, + "step": 4923 + }, + { + "epoch": 0.27445515857533026, + "grad_norm": 0.5530427098274231, + "learning_rate": 8.39907605672579e-05, + "loss": 1.7721, + "step": 4924 + }, + { + "epoch": 0.2745108968284934, + "grad_norm": 0.5096892714500427, + "learning_rate": 8.398422980879442e-05, + "loss": 1.5788, + "step": 4925 + }, + { + "epoch": 0.2745666350816565, + "grad_norm": 0.5875157713890076, + "learning_rate": 8.39776979725493e-05, + "loss": 1.7782, + "step": 4926 + }, + { + "epoch": 0.2746223733348197, + "grad_norm": 0.5620753169059753, + "learning_rate": 8.397116505872973e-05, + "loss": 1.6911, + "step": 4927 + }, + { + "epoch": 0.27467811158798283, + "grad_norm": 0.5037546157836914, + "learning_rate": 8.396463106754285e-05, + "loss": 1.7944, + "step": 4928 + }, + { + "epoch": 0.27473384984114596, + "grad_norm": 0.5311979055404663, + "learning_rate": 8.395809599919591e-05, + "loss": 1.8542, + "step": 4929 + }, + { + "epoch": 0.27478958809430915, + "grad_norm": 0.5294662714004517, + "learning_rate": 8.395155985389615e-05, + "loss": 1.582, + "step": 4930 + }, + { + "epoch": 0.2748453263474723, + "grad_norm": 0.5880303382873535, + "learning_rate": 8.394502263185087e-05, + "loss": 1.8807, + "step": 4931 + }, + { + "epoch": 0.2749010646006354, + "grad_norm": 0.5946251153945923, + "learning_rate": 8.393848433326736e-05, + "loss": 1.8139, + "step": 4932 + }, + { + "epoch": 0.2749568028537986, + "grad_norm": 0.5572118759155273, + "learning_rate": 8.393194495835304e-05, + "loss": 1.9141, + "step": 4933 + }, + { + "epoch": 0.2750125411069617, + "grad_norm": 0.5573039054870605, + "learning_rate": 8.392540450731522e-05, + "loss": 1.7951, + "step": 4934 + }, + { + "epoch": 0.27506827936012485, + "grad_norm": 0.540758490562439, + "learning_rate": 8.39188629803614e-05, + "loss": 1.7804, + "step": 4935 + }, + { + "epoch": 0.275124017613288, + "grad_norm": 0.5271297693252563, + "learning_rate": 8.3912320377699e-05, + "loss": 1.82, + "step": 4936 + }, + { + "epoch": 0.27517975586645116, + "grad_norm": 0.5359855890274048, + "learning_rate": 8.390577669953552e-05, + "loss": 1.7678, + "step": 4937 + }, + { + "epoch": 0.2752354941196143, + "grad_norm": 0.5025729537010193, + "learning_rate": 8.389923194607849e-05, + "loss": 1.5144, + "step": 4938 + }, + { + "epoch": 0.2752912323727774, + "grad_norm": 0.5402054190635681, + "learning_rate": 8.389268611753546e-05, + "loss": 1.6204, + "step": 4939 + }, + { + "epoch": 0.2753469706259406, + "grad_norm": 0.5499907732009888, + "learning_rate": 8.388613921411404e-05, + "loss": 1.6948, + "step": 4940 + }, + { + "epoch": 0.27540270887910373, + "grad_norm": 0.6044038534164429, + "learning_rate": 8.387959123602185e-05, + "loss": 1.5522, + "step": 4941 + }, + { + "epoch": 0.27545844713226686, + "grad_norm": 0.5463374853134155, + "learning_rate": 8.387304218346656e-05, + "loss": 1.6392, + "step": 4942 + }, + { + "epoch": 0.27551418538543004, + "grad_norm": 0.5164476633071899, + "learning_rate": 8.386649205665586e-05, + "loss": 1.674, + "step": 4943 + }, + { + "epoch": 0.2755699236385932, + "grad_norm": 0.6093559861183167, + "learning_rate": 8.385994085579751e-05, + "loss": 2.0767, + "step": 4944 + }, + { + "epoch": 0.2756256618917563, + "grad_norm": 0.5542387366294861, + "learning_rate": 8.385338858109922e-05, + "loss": 1.8275, + "step": 4945 + }, + { + "epoch": 0.27568140014491943, + "grad_norm": 0.5787892937660217, + "learning_rate": 8.384683523276885e-05, + "loss": 1.5918, + "step": 4946 + }, + { + "epoch": 0.2757371383980826, + "grad_norm": 0.5294553637504578, + "learning_rate": 8.38402808110142e-05, + "loss": 1.6857, + "step": 4947 + }, + { + "epoch": 0.27579287665124574, + "grad_norm": 0.5397957563400269, + "learning_rate": 8.383372531604314e-05, + "loss": 1.6894, + "step": 4948 + }, + { + "epoch": 0.2758486149044089, + "grad_norm": 0.5266357660293579, + "learning_rate": 8.382716874806357e-05, + "loss": 1.7214, + "step": 4949 + }, + { + "epoch": 0.27590435315757206, + "grad_norm": 0.5046342611312866, + "learning_rate": 8.382061110728345e-05, + "loss": 1.4341, + "step": 4950 + }, + { + "epoch": 0.2759600914107352, + "grad_norm": 0.5609323382377625, + "learning_rate": 8.381405239391074e-05, + "loss": 1.7528, + "step": 4951 + }, + { + "epoch": 0.2760158296638983, + "grad_norm": 0.5804145336151123, + "learning_rate": 8.38074926081534e-05, + "loss": 1.8709, + "step": 4952 + }, + { + "epoch": 0.2760715679170615, + "grad_norm": 0.5542110204696655, + "learning_rate": 8.380093175021953e-05, + "loss": 1.8472, + "step": 4953 + }, + { + "epoch": 0.27612730617022463, + "grad_norm": 0.5371457934379578, + "learning_rate": 8.379436982031718e-05, + "loss": 1.5508, + "step": 4954 + }, + { + "epoch": 0.27618304442338776, + "grad_norm": 0.6307567358016968, + "learning_rate": 8.378780681865445e-05, + "loss": 1.7762, + "step": 4955 + }, + { + "epoch": 0.27623878267655094, + "grad_norm": 0.6115426421165466, + "learning_rate": 8.37812427454395e-05, + "loss": 1.8666, + "step": 4956 + }, + { + "epoch": 0.27629452092971407, + "grad_norm": 0.5419024229049683, + "learning_rate": 8.377467760088046e-05, + "loss": 1.6681, + "step": 4957 + }, + { + "epoch": 0.2763502591828772, + "grad_norm": 0.5587498545646667, + "learning_rate": 8.376811138518558e-05, + "loss": 1.8999, + "step": 4958 + }, + { + "epoch": 0.27640599743604033, + "grad_norm": 0.6416218876838684, + "learning_rate": 8.376154409856309e-05, + "loss": 2.1091, + "step": 4959 + }, + { + "epoch": 0.2764617356892035, + "grad_norm": 0.5992975234985352, + "learning_rate": 8.375497574122127e-05, + "loss": 1.837, + "step": 4960 + }, + { + "epoch": 0.27651747394236664, + "grad_norm": 0.5807574987411499, + "learning_rate": 8.374840631336842e-05, + "loss": 1.643, + "step": 4961 + }, + { + "epoch": 0.27657321219552977, + "grad_norm": 0.5473943948745728, + "learning_rate": 8.374183581521288e-05, + "loss": 1.6044, + "step": 4962 + }, + { + "epoch": 0.27662895044869296, + "grad_norm": 0.5294444561004639, + "learning_rate": 8.373526424696305e-05, + "loss": 1.7088, + "step": 4963 + }, + { + "epoch": 0.2766846887018561, + "grad_norm": 0.5424871444702148, + "learning_rate": 8.372869160882733e-05, + "loss": 1.5888, + "step": 4964 + }, + { + "epoch": 0.2767404269550192, + "grad_norm": 0.5405928492546082, + "learning_rate": 8.372211790101414e-05, + "loss": 1.6905, + "step": 4965 + }, + { + "epoch": 0.2767961652081824, + "grad_norm": 0.5668782591819763, + "learning_rate": 8.3715543123732e-05, + "loss": 1.7584, + "step": 4966 + }, + { + "epoch": 0.2768519034613455, + "grad_norm": 0.586342990398407, + "learning_rate": 8.370896727718942e-05, + "loss": 1.7863, + "step": 4967 + }, + { + "epoch": 0.27690764171450866, + "grad_norm": 0.6017349362373352, + "learning_rate": 8.370239036159493e-05, + "loss": 1.8825, + "step": 4968 + }, + { + "epoch": 0.2769633799676718, + "grad_norm": 0.5821561813354492, + "learning_rate": 8.36958123771571e-05, + "loss": 1.9587, + "step": 4969 + }, + { + "epoch": 0.27701911822083497, + "grad_norm": 0.5764045119285583, + "learning_rate": 8.368923332408459e-05, + "loss": 1.8635, + "step": 4970 + }, + { + "epoch": 0.2770748564739981, + "grad_norm": 0.595043957233429, + "learning_rate": 8.368265320258598e-05, + "loss": 1.7843, + "step": 4971 + }, + { + "epoch": 0.27713059472716123, + "grad_norm": 0.5718355774879456, + "learning_rate": 8.367607201287002e-05, + "loss": 1.6231, + "step": 4972 + }, + { + "epoch": 0.2771863329803244, + "grad_norm": 0.5044475793838501, + "learning_rate": 8.366948975514539e-05, + "loss": 1.5014, + "step": 4973 + }, + { + "epoch": 0.27724207123348754, + "grad_norm": 0.5001023411750793, + "learning_rate": 8.366290642962087e-05, + "loss": 1.522, + "step": 4974 + }, + { + "epoch": 0.27729780948665067, + "grad_norm": 0.7615741491317749, + "learning_rate": 8.36563220365052e-05, + "loss": 1.5344, + "step": 4975 + }, + { + "epoch": 0.27735354773981385, + "grad_norm": 0.47964903712272644, + "learning_rate": 8.364973657600724e-05, + "loss": 1.4201, + "step": 4976 + }, + { + "epoch": 0.277409285992977, + "grad_norm": 0.5713698863983154, + "learning_rate": 8.364315004833583e-05, + "loss": 1.7664, + "step": 4977 + }, + { + "epoch": 0.2774650242461401, + "grad_norm": 0.5541187524795532, + "learning_rate": 8.363656245369984e-05, + "loss": 1.75, + "step": 4978 + }, + { + "epoch": 0.2775207624993033, + "grad_norm": 0.543755054473877, + "learning_rate": 8.362997379230822e-05, + "loss": 1.6432, + "step": 4979 + }, + { + "epoch": 0.2775765007524664, + "grad_norm": 0.5810009241104126, + "learning_rate": 8.36233840643699e-05, + "loss": 1.948, + "step": 4980 + }, + { + "epoch": 0.27763223900562956, + "grad_norm": 0.5693858861923218, + "learning_rate": 8.361679327009388e-05, + "loss": 1.8148, + "step": 4981 + }, + { + "epoch": 0.2776879772587927, + "grad_norm": 0.5942829251289368, + "learning_rate": 8.361020140968919e-05, + "loss": 1.9087, + "step": 4982 + }, + { + "epoch": 0.27774371551195587, + "grad_norm": 0.548213541507721, + "learning_rate": 8.360360848336484e-05, + "loss": 1.7628, + "step": 4983 + }, + { + "epoch": 0.277799453765119, + "grad_norm": 0.5708996057510376, + "learning_rate": 8.359701449132998e-05, + "loss": 1.8127, + "step": 4984 + }, + { + "epoch": 0.2778551920182821, + "grad_norm": 0.5608772039413452, + "learning_rate": 8.359041943379369e-05, + "loss": 1.5508, + "step": 4985 + }, + { + "epoch": 0.2779109302714453, + "grad_norm": 0.5337716937065125, + "learning_rate": 8.358382331096514e-05, + "loss": 1.6666, + "step": 4986 + }, + { + "epoch": 0.27796666852460844, + "grad_norm": 0.5663906335830688, + "learning_rate": 8.357722612305353e-05, + "loss": 1.8808, + "step": 4987 + }, + { + "epoch": 0.27802240677777157, + "grad_norm": 0.5678949952125549, + "learning_rate": 8.357062787026805e-05, + "loss": 1.7122, + "step": 4988 + }, + { + "epoch": 0.27807814503093475, + "grad_norm": 0.5173599720001221, + "learning_rate": 8.356402855281802e-05, + "loss": 1.6552, + "step": 4989 + }, + { + "epoch": 0.2781338832840979, + "grad_norm": 0.5319927334785461, + "learning_rate": 8.355742817091268e-05, + "loss": 1.4913, + "step": 4990 + }, + { + "epoch": 0.278189621537261, + "grad_norm": 0.5666325092315674, + "learning_rate": 8.355082672476136e-05, + "loss": 1.7334, + "step": 4991 + }, + { + "epoch": 0.27824535979042414, + "grad_norm": 0.6288278698921204, + "learning_rate": 8.354422421457346e-05, + "loss": 2.005, + "step": 4992 + }, + { + "epoch": 0.2783010980435873, + "grad_norm": 0.4918287992477417, + "learning_rate": 8.353762064055833e-05, + "loss": 1.6484, + "step": 4993 + }, + { + "epoch": 0.27835683629675045, + "grad_norm": 0.6033855676651001, + "learning_rate": 8.353101600292541e-05, + "loss": 1.7403, + "step": 4994 + }, + { + "epoch": 0.2784125745499136, + "grad_norm": 0.5309021472930908, + "learning_rate": 8.352441030188417e-05, + "loss": 1.6779, + "step": 4995 + }, + { + "epoch": 0.27846831280307677, + "grad_norm": 0.5141871571540833, + "learning_rate": 8.351780353764408e-05, + "loss": 1.7298, + "step": 4996 + }, + { + "epoch": 0.2785240510562399, + "grad_norm": 0.5200504064559937, + "learning_rate": 8.351119571041468e-05, + "loss": 1.594, + "step": 4997 + }, + { + "epoch": 0.278579789309403, + "grad_norm": 0.5325762033462524, + "learning_rate": 8.350458682040556e-05, + "loss": 1.7623, + "step": 4998 + }, + { + "epoch": 0.2786355275625662, + "grad_norm": 0.539318859577179, + "learning_rate": 8.349797686782627e-05, + "loss": 1.6779, + "step": 4999 + }, + { + "epoch": 0.27869126581572934, + "grad_norm": 0.5733089447021484, + "learning_rate": 8.349136585288648e-05, + "loss": 1.8159, + "step": 5000 + }, + { + "epoch": 0.27874700406889247, + "grad_norm": 0.5516615509986877, + "learning_rate": 8.348475377579583e-05, + "loss": 1.6049, + "step": 5001 + }, + { + "epoch": 0.27880274232205565, + "grad_norm": 0.5449507236480713, + "learning_rate": 8.3478140636764e-05, + "loss": 1.661, + "step": 5002 + }, + { + "epoch": 0.2788584805752188, + "grad_norm": 0.5257706642150879, + "learning_rate": 8.347152643600076e-05, + "loss": 1.6633, + "step": 5003 + }, + { + "epoch": 0.2789142188283819, + "grad_norm": 0.5481857657432556, + "learning_rate": 8.346491117371584e-05, + "loss": 1.7599, + "step": 5004 + }, + { + "epoch": 0.27896995708154504, + "grad_norm": 0.5461267232894897, + "learning_rate": 8.345829485011906e-05, + "loss": 1.6645, + "step": 5005 + }, + { + "epoch": 0.2790256953347082, + "grad_norm": 0.5450317859649658, + "learning_rate": 8.345167746542024e-05, + "loss": 1.7965, + "step": 5006 + }, + { + "epoch": 0.27908143358787135, + "grad_norm": 0.5598206520080566, + "learning_rate": 8.344505901982926e-05, + "loss": 1.8171, + "step": 5007 + }, + { + "epoch": 0.2791371718410345, + "grad_norm": 0.5036829113960266, + "learning_rate": 8.343843951355599e-05, + "loss": 1.5853, + "step": 5008 + }, + { + "epoch": 0.27919291009419767, + "grad_norm": 0.5530052185058594, + "learning_rate": 8.34318189468104e-05, + "loss": 1.8362, + "step": 5009 + }, + { + "epoch": 0.2792486483473608, + "grad_norm": 0.5920783877372742, + "learning_rate": 8.34251973198024e-05, + "loss": 1.7712, + "step": 5010 + }, + { + "epoch": 0.2793043866005239, + "grad_norm": 0.5592779517173767, + "learning_rate": 8.341857463274204e-05, + "loss": 1.729, + "step": 5011 + }, + { + "epoch": 0.2793601248536871, + "grad_norm": 0.5464910864830017, + "learning_rate": 8.341195088583934e-05, + "loss": 1.9075, + "step": 5012 + }, + { + "epoch": 0.27941586310685024, + "grad_norm": 0.5421869158744812, + "learning_rate": 8.340532607930435e-05, + "loss": 1.6845, + "step": 5013 + }, + { + "epoch": 0.27947160136001337, + "grad_norm": 0.6448494791984558, + "learning_rate": 8.339870021334721e-05, + "loss": 1.677, + "step": 5014 + }, + { + "epoch": 0.2795273396131765, + "grad_norm": 0.551950991153717, + "learning_rate": 8.339207328817801e-05, + "loss": 1.7604, + "step": 5015 + }, + { + "epoch": 0.2795830778663397, + "grad_norm": 0.5297108292579651, + "learning_rate": 8.338544530400694e-05, + "loss": 1.8327, + "step": 5016 + }, + { + "epoch": 0.2796388161195028, + "grad_norm": 0.5589694976806641, + "learning_rate": 8.337881626104418e-05, + "loss": 1.8363, + "step": 5017 + }, + { + "epoch": 0.27969455437266594, + "grad_norm": 0.5295442342758179, + "learning_rate": 8.337218615949999e-05, + "loss": 1.5949, + "step": 5018 + }, + { + "epoch": 0.2797502926258291, + "grad_norm": 0.5680721998214722, + "learning_rate": 8.336555499958463e-05, + "loss": 1.7101, + "step": 5019 + }, + { + "epoch": 0.27980603087899225, + "grad_norm": 0.5222816467285156, + "learning_rate": 8.33589227815084e-05, + "loss": 1.6419, + "step": 5020 + }, + { + "epoch": 0.2798617691321554, + "grad_norm": 0.5572875142097473, + "learning_rate": 8.335228950548164e-05, + "loss": 1.5752, + "step": 5021 + }, + { + "epoch": 0.27991750738531856, + "grad_norm": 0.5234338641166687, + "learning_rate": 8.334565517171471e-05, + "loss": 1.608, + "step": 5022 + }, + { + "epoch": 0.2799732456384817, + "grad_norm": 0.5773409008979797, + "learning_rate": 8.333901978041801e-05, + "loss": 1.8295, + "step": 5023 + }, + { + "epoch": 0.2800289838916448, + "grad_norm": 0.6236357092857361, + "learning_rate": 8.3332383331802e-05, + "loss": 2.1082, + "step": 5024 + }, + { + "epoch": 0.280084722144808, + "grad_norm": 0.5226585865020752, + "learning_rate": 8.332574582607712e-05, + "loss": 1.5637, + "step": 5025 + }, + { + "epoch": 0.28014046039797114, + "grad_norm": 0.5552464723587036, + "learning_rate": 8.331910726345389e-05, + "loss": 1.565, + "step": 5026 + }, + { + "epoch": 0.28019619865113427, + "grad_norm": 0.5889436602592468, + "learning_rate": 8.331246764414282e-05, + "loss": 1.6853, + "step": 5027 + }, + { + "epoch": 0.2802519369042974, + "grad_norm": 0.5935594439506531, + "learning_rate": 8.330582696835453e-05, + "loss": 1.8281, + "step": 5028 + }, + { + "epoch": 0.2803076751574606, + "grad_norm": 0.5328096747398376, + "learning_rate": 8.329918523629958e-05, + "loss": 1.5658, + "step": 5029 + }, + { + "epoch": 0.2803634134106237, + "grad_norm": 0.5282544493675232, + "learning_rate": 8.329254244818862e-05, + "loss": 1.5369, + "step": 5030 + }, + { + "epoch": 0.28041915166378684, + "grad_norm": 0.5771158337593079, + "learning_rate": 8.328589860423234e-05, + "loss": 1.718, + "step": 5031 + }, + { + "epoch": 0.28047488991695, + "grad_norm": 0.5074672698974609, + "learning_rate": 8.327925370464142e-05, + "loss": 1.5096, + "step": 5032 + }, + { + "epoch": 0.28053062817011315, + "grad_norm": 0.5818241834640503, + "learning_rate": 8.32726077496266e-05, + "loss": 1.8082, + "step": 5033 + }, + { + "epoch": 0.2805863664232763, + "grad_norm": 0.5617592930793762, + "learning_rate": 8.326596073939865e-05, + "loss": 1.885, + "step": 5034 + }, + { + "epoch": 0.28064210467643946, + "grad_norm": 0.5317988991737366, + "learning_rate": 8.325931267416837e-05, + "loss": 1.6933, + "step": 5035 + }, + { + "epoch": 0.2806978429296026, + "grad_norm": 0.5429521799087524, + "learning_rate": 8.325266355414663e-05, + "loss": 1.7869, + "step": 5036 + }, + { + "epoch": 0.2807535811827657, + "grad_norm": 0.5846121311187744, + "learning_rate": 8.324601337954427e-05, + "loss": 1.8213, + "step": 5037 + }, + { + "epoch": 0.28080931943592885, + "grad_norm": 0.5202860236167908, + "learning_rate": 8.323936215057219e-05, + "loss": 1.5685, + "step": 5038 + }, + { + "epoch": 0.28086505768909203, + "grad_norm": 0.5208321213722229, + "learning_rate": 8.323270986744136e-05, + "loss": 1.6801, + "step": 5039 + }, + { + "epoch": 0.28092079594225516, + "grad_norm": 0.5601228475570679, + "learning_rate": 8.322605653036273e-05, + "loss": 1.7527, + "step": 5040 + }, + { + "epoch": 0.2809765341954183, + "grad_norm": 0.5703938603401184, + "learning_rate": 8.32194021395473e-05, + "loss": 1.7583, + "step": 5041 + }, + { + "epoch": 0.2810322724485815, + "grad_norm": 0.5135952234268188, + "learning_rate": 8.321274669520613e-05, + "loss": 1.6603, + "step": 5042 + }, + { + "epoch": 0.2810880107017446, + "grad_norm": 0.5345764756202698, + "learning_rate": 8.320609019755025e-05, + "loss": 1.8041, + "step": 5043 + }, + { + "epoch": 0.28114374895490774, + "grad_norm": 0.5866489410400391, + "learning_rate": 8.319943264679082e-05, + "loss": 1.8187, + "step": 5044 + }, + { + "epoch": 0.2811994872080709, + "grad_norm": 0.5317565202713013, + "learning_rate": 8.319277404313895e-05, + "loss": 1.627, + "step": 5045 + }, + { + "epoch": 0.28125522546123405, + "grad_norm": 0.5532716512680054, + "learning_rate": 8.318611438680581e-05, + "loss": 1.7922, + "step": 5046 + }, + { + "epoch": 0.2813109637143972, + "grad_norm": 0.5880955457687378, + "learning_rate": 8.317945367800262e-05, + "loss": 1.9276, + "step": 5047 + }, + { + "epoch": 0.28136670196756036, + "grad_norm": 0.5237969160079956, + "learning_rate": 8.31727919169406e-05, + "loss": 1.6415, + "step": 5048 + }, + { + "epoch": 0.2814224402207235, + "grad_norm": 0.5675956010818481, + "learning_rate": 8.316612910383104e-05, + "loss": 1.7371, + "step": 5049 + }, + { + "epoch": 0.2814781784738866, + "grad_norm": 0.5321084260940552, + "learning_rate": 8.315946523888523e-05, + "loss": 1.5045, + "step": 5050 + }, + { + "epoch": 0.28153391672704975, + "grad_norm": 0.5198732614517212, + "learning_rate": 8.31528003223145e-05, + "loss": 1.7094, + "step": 5051 + }, + { + "epoch": 0.28158965498021293, + "grad_norm": 0.5548423528671265, + "learning_rate": 8.314613435433025e-05, + "loss": 1.7824, + "step": 5052 + }, + { + "epoch": 0.28164539323337606, + "grad_norm": 0.5975722074508667, + "learning_rate": 8.313946733514388e-05, + "loss": 1.6823, + "step": 5053 + }, + { + "epoch": 0.2817011314865392, + "grad_norm": 0.5505688190460205, + "learning_rate": 8.313279926496682e-05, + "loss": 1.6891, + "step": 5054 + }, + { + "epoch": 0.2817568697397024, + "grad_norm": 0.535331666469574, + "learning_rate": 8.312613014401053e-05, + "loss": 1.6879, + "step": 5055 + }, + { + "epoch": 0.2818126079928655, + "grad_norm": 0.5429748296737671, + "learning_rate": 8.311945997248656e-05, + "loss": 1.7741, + "step": 5056 + }, + { + "epoch": 0.28186834624602863, + "grad_norm": 0.5404984354972839, + "learning_rate": 8.31127887506064e-05, + "loss": 1.5888, + "step": 5057 + }, + { + "epoch": 0.2819240844991918, + "grad_norm": 0.6144102811813354, + "learning_rate": 8.310611647858164e-05, + "loss": 1.8173, + "step": 5058 + }, + { + "epoch": 0.28197982275235495, + "grad_norm": 0.5709677934646606, + "learning_rate": 8.30994431566239e-05, + "loss": 1.6492, + "step": 5059 + }, + { + "epoch": 0.2820355610055181, + "grad_norm": 0.5943745374679565, + "learning_rate": 8.309276878494481e-05, + "loss": 1.9265, + "step": 5060 + }, + { + "epoch": 0.28209129925868126, + "grad_norm": 0.5663633942604065, + "learning_rate": 8.308609336375601e-05, + "loss": 1.5966, + "step": 5061 + }, + { + "epoch": 0.2821470375118444, + "grad_norm": 0.5235463380813599, + "learning_rate": 8.307941689326926e-05, + "loss": 1.6598, + "step": 5062 + }, + { + "epoch": 0.2822027757650075, + "grad_norm": 0.5473840832710266, + "learning_rate": 8.307273937369627e-05, + "loss": 1.3741, + "step": 5063 + }, + { + "epoch": 0.28225851401817065, + "grad_norm": 0.6380063891410828, + "learning_rate": 8.30660608052488e-05, + "loss": 1.7855, + "step": 5064 + }, + { + "epoch": 0.28231425227133383, + "grad_norm": 0.5315070748329163, + "learning_rate": 8.305938118813868e-05, + "loss": 1.6285, + "step": 5065 + }, + { + "epoch": 0.28236999052449696, + "grad_norm": 0.571528971195221, + "learning_rate": 8.305270052257773e-05, + "loss": 1.8315, + "step": 5066 + }, + { + "epoch": 0.2824257287776601, + "grad_norm": 0.5939456820487976, + "learning_rate": 8.304601880877784e-05, + "loss": 1.8598, + "step": 5067 + }, + { + "epoch": 0.2824814670308233, + "grad_norm": 0.5018705129623413, + "learning_rate": 8.30393360469509e-05, + "loss": 1.5472, + "step": 5068 + }, + { + "epoch": 0.2825372052839864, + "grad_norm": 0.5844521522521973, + "learning_rate": 8.303265223730885e-05, + "loss": 1.8186, + "step": 5069 + }, + { + "epoch": 0.28259294353714953, + "grad_norm": 0.5360279083251953, + "learning_rate": 8.302596738006367e-05, + "loss": 1.7101, + "step": 5070 + }, + { + "epoch": 0.2826486817903127, + "grad_norm": 0.5614787340164185, + "learning_rate": 8.301928147542736e-05, + "loss": 1.6207, + "step": 5071 + }, + { + "epoch": 0.28270442004347585, + "grad_norm": 0.5616874098777771, + "learning_rate": 8.301259452361197e-05, + "loss": 1.7829, + "step": 5072 + }, + { + "epoch": 0.282760158296639, + "grad_norm": 0.6129429340362549, + "learning_rate": 8.300590652482954e-05, + "loss": 1.844, + "step": 5073 + }, + { + "epoch": 0.2828158965498021, + "grad_norm": 0.5966079831123352, + "learning_rate": 8.29992174792922e-05, + "loss": 1.9242, + "step": 5074 + }, + { + "epoch": 0.2828716348029653, + "grad_norm": 0.5461622476577759, + "learning_rate": 8.299252738721206e-05, + "loss": 1.7337, + "step": 5075 + }, + { + "epoch": 0.2829273730561284, + "grad_norm": 0.5274501442909241, + "learning_rate": 8.298583624880135e-05, + "loss": 1.6531, + "step": 5076 + }, + { + "epoch": 0.28298311130929155, + "grad_norm": 0.6280329823493958, + "learning_rate": 8.29791440642722e-05, + "loss": 1.6198, + "step": 5077 + }, + { + "epoch": 0.28303884956245473, + "grad_norm": 0.5429005026817322, + "learning_rate": 8.297245083383689e-05, + "loss": 1.7574, + "step": 5078 + }, + { + "epoch": 0.28309458781561786, + "grad_norm": 0.586188018321991, + "learning_rate": 8.296575655770768e-05, + "loss": 1.7325, + "step": 5079 + }, + { + "epoch": 0.283150326068781, + "grad_norm": 0.48814016580581665, + "learning_rate": 8.295906123609688e-05, + "loss": 1.6964, + "step": 5080 + }, + { + "epoch": 0.2832060643219442, + "grad_norm": 0.518273651599884, + "learning_rate": 8.295236486921685e-05, + "loss": 1.6128, + "step": 5081 + }, + { + "epoch": 0.2832618025751073, + "grad_norm": 0.5701366066932678, + "learning_rate": 8.29456674572799e-05, + "loss": 1.8898, + "step": 5082 + }, + { + "epoch": 0.28331754082827043, + "grad_norm": 0.522463858127594, + "learning_rate": 8.293896900049846e-05, + "loss": 1.513, + "step": 5083 + }, + { + "epoch": 0.2833732790814336, + "grad_norm": 0.5641170144081116, + "learning_rate": 8.293226949908499e-05, + "loss": 1.658, + "step": 5084 + }, + { + "epoch": 0.28342901733459674, + "grad_norm": 0.5498567223548889, + "learning_rate": 8.292556895325194e-05, + "loss": 1.6148, + "step": 5085 + }, + { + "epoch": 0.2834847555877599, + "grad_norm": 0.5941603183746338, + "learning_rate": 8.29188673632118e-05, + "loss": 1.7469, + "step": 5086 + }, + { + "epoch": 0.283540493840923, + "grad_norm": 0.5746224522590637, + "learning_rate": 8.291216472917714e-05, + "loss": 1.6819, + "step": 5087 + }, + { + "epoch": 0.2835962320940862, + "grad_norm": 0.6701369285583496, + "learning_rate": 8.290546105136048e-05, + "loss": 1.3384, + "step": 5088 + }, + { + "epoch": 0.2836519703472493, + "grad_norm": 0.5807752013206482, + "learning_rate": 8.289875632997446e-05, + "loss": 1.6534, + "step": 5089 + }, + { + "epoch": 0.28370770860041244, + "grad_norm": 0.5432621240615845, + "learning_rate": 8.289205056523168e-05, + "loss": 1.6963, + "step": 5090 + }, + { + "epoch": 0.28376344685357563, + "grad_norm": 0.5509108901023865, + "learning_rate": 8.288534375734486e-05, + "loss": 1.6027, + "step": 5091 + }, + { + "epoch": 0.28381918510673876, + "grad_norm": 0.5456513166427612, + "learning_rate": 8.287863590652666e-05, + "loss": 1.6362, + "step": 5092 + }, + { + "epoch": 0.2838749233599019, + "grad_norm": 0.5441727042198181, + "learning_rate": 8.287192701298982e-05, + "loss": 1.5781, + "step": 5093 + }, + { + "epoch": 0.28393066161306507, + "grad_norm": 0.5558503866195679, + "learning_rate": 8.286521707694712e-05, + "loss": 1.8077, + "step": 5094 + }, + { + "epoch": 0.2839863998662282, + "grad_norm": 0.5933700799942017, + "learning_rate": 8.285850609861134e-05, + "loss": 1.8407, + "step": 5095 + }, + { + "epoch": 0.28404213811939133, + "grad_norm": 0.557685375213623, + "learning_rate": 8.285179407819534e-05, + "loss": 1.579, + "step": 5096 + }, + { + "epoch": 0.28409787637255446, + "grad_norm": 0.5183169841766357, + "learning_rate": 8.284508101591198e-05, + "loss": 1.3955, + "step": 5097 + }, + { + "epoch": 0.28415361462571764, + "grad_norm": 0.5807473659515381, + "learning_rate": 8.283836691197413e-05, + "loss": 1.8429, + "step": 5098 + }, + { + "epoch": 0.28420935287888077, + "grad_norm": 0.6236990690231323, + "learning_rate": 8.283165176659474e-05, + "loss": 1.8281, + "step": 5099 + }, + { + "epoch": 0.2842650911320439, + "grad_norm": 0.5581399202346802, + "learning_rate": 8.282493557998678e-05, + "loss": 1.764, + "step": 5100 + }, + { + "epoch": 0.2843208293852071, + "grad_norm": 0.5508102774620056, + "learning_rate": 8.281821835236325e-05, + "loss": 1.8694, + "step": 5101 + }, + { + "epoch": 0.2843765676383702, + "grad_norm": 0.6012663841247559, + "learning_rate": 8.281150008393718e-05, + "loss": 1.8829, + "step": 5102 + }, + { + "epoch": 0.28443230589153334, + "grad_norm": 0.5453019738197327, + "learning_rate": 8.280478077492163e-05, + "loss": 1.8996, + "step": 5103 + }, + { + "epoch": 0.28448804414469653, + "grad_norm": 0.5334420204162598, + "learning_rate": 8.27980604255297e-05, + "loss": 1.7342, + "step": 5104 + }, + { + "epoch": 0.28454378239785966, + "grad_norm": 0.5454635620117188, + "learning_rate": 8.279133903597451e-05, + "loss": 1.7496, + "step": 5105 + }, + { + "epoch": 0.2845995206510228, + "grad_norm": 0.5557402968406677, + "learning_rate": 8.278461660646925e-05, + "loss": 1.63, + "step": 5106 + }, + { + "epoch": 0.28465525890418597, + "grad_norm": 0.5542622208595276, + "learning_rate": 8.27778931372271e-05, + "loss": 1.6639, + "step": 5107 + }, + { + "epoch": 0.2847109971573491, + "grad_norm": 0.565591037273407, + "learning_rate": 8.277116862846126e-05, + "loss": 1.9303, + "step": 5108 + }, + { + "epoch": 0.28476673541051223, + "grad_norm": 0.6099279522895813, + "learning_rate": 8.276444308038504e-05, + "loss": 1.7833, + "step": 5109 + }, + { + "epoch": 0.28482247366367536, + "grad_norm": 0.6192046999931335, + "learning_rate": 8.27577164932117e-05, + "loss": 1.9167, + "step": 5110 + }, + { + "epoch": 0.28487821191683854, + "grad_norm": 0.5659559965133667, + "learning_rate": 8.275098886715462e-05, + "loss": 1.7716, + "step": 5111 + }, + { + "epoch": 0.28493395017000167, + "grad_norm": 0.6038410067558289, + "learning_rate": 8.274426020242709e-05, + "loss": 1.9078, + "step": 5112 + }, + { + "epoch": 0.2849896884231648, + "grad_norm": 0.5924156904220581, + "learning_rate": 8.273753049924256e-05, + "loss": 1.7014, + "step": 5113 + }, + { + "epoch": 0.285045426676328, + "grad_norm": 0.5436737537384033, + "learning_rate": 8.273079975781442e-05, + "loss": 1.6482, + "step": 5114 + }, + { + "epoch": 0.2851011649294911, + "grad_norm": 0.5460022687911987, + "learning_rate": 8.272406797835614e-05, + "loss": 1.7304, + "step": 5115 + }, + { + "epoch": 0.28515690318265424, + "grad_norm": 0.5954405069351196, + "learning_rate": 8.271733516108125e-05, + "loss": 1.6698, + "step": 5116 + }, + { + "epoch": 0.2852126414358174, + "grad_norm": 0.638888418674469, + "learning_rate": 8.27106013062032e-05, + "loss": 2.0553, + "step": 5117 + }, + { + "epoch": 0.28526837968898056, + "grad_norm": 0.5477131605148315, + "learning_rate": 8.270386641393564e-05, + "loss": 1.5031, + "step": 5118 + }, + { + "epoch": 0.2853241179421437, + "grad_norm": 0.5998544692993164, + "learning_rate": 8.269713048449208e-05, + "loss": 1.9087, + "step": 5119 + }, + { + "epoch": 0.2853798561953068, + "grad_norm": 0.5584544539451599, + "learning_rate": 8.26903935180862e-05, + "loss": 1.8125, + "step": 5120 + }, + { + "epoch": 0.28543559444847, + "grad_norm": 0.5390369892120361, + "learning_rate": 8.268365551493161e-05, + "loss": 1.6459, + "step": 5121 + }, + { + "epoch": 0.2854913327016331, + "grad_norm": 0.5171942710876465, + "learning_rate": 8.267691647524206e-05, + "loss": 1.6801, + "step": 5122 + }, + { + "epoch": 0.28554707095479626, + "grad_norm": 0.4894436299800873, + "learning_rate": 8.26701763992312e-05, + "loss": 1.4172, + "step": 5123 + }, + { + "epoch": 0.28560280920795944, + "grad_norm": 0.5318630337715149, + "learning_rate": 8.266343528711285e-05, + "loss": 1.6956, + "step": 5124 + }, + { + "epoch": 0.28565854746112257, + "grad_norm": 0.513378918170929, + "learning_rate": 8.265669313910077e-05, + "loss": 1.5235, + "step": 5125 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.6027741432189941, + "learning_rate": 8.264994995540878e-05, + "loss": 1.9089, + "step": 5126 + }, + { + "epoch": 0.2857700239674489, + "grad_norm": 0.5300361514091492, + "learning_rate": 8.264320573625075e-05, + "loss": 1.6013, + "step": 5127 + }, + { + "epoch": 0.285825762220612, + "grad_norm": 0.5484519600868225, + "learning_rate": 8.263646048184055e-05, + "loss": 1.6596, + "step": 5128 + }, + { + "epoch": 0.28588150047377514, + "grad_norm": 0.6186813116073608, + "learning_rate": 8.26297141923921e-05, + "loss": 1.7786, + "step": 5129 + }, + { + "epoch": 0.2859372387269383, + "grad_norm": 0.5475611686706543, + "learning_rate": 8.262296686811936e-05, + "loss": 1.6151, + "step": 5130 + }, + { + "epoch": 0.28599297698010145, + "grad_norm": 0.612417995929718, + "learning_rate": 8.261621850923634e-05, + "loss": 1.587, + "step": 5131 + }, + { + "epoch": 0.2860487152332646, + "grad_norm": 0.5619268417358398, + "learning_rate": 8.260946911595701e-05, + "loss": 1.6915, + "step": 5132 + }, + { + "epoch": 0.2861044534864277, + "grad_norm": 0.5510770678520203, + "learning_rate": 8.260271868849547e-05, + "loss": 1.9188, + "step": 5133 + }, + { + "epoch": 0.2861601917395909, + "grad_norm": 0.5569331049919128, + "learning_rate": 8.259596722706575e-05, + "loss": 1.7657, + "step": 5134 + }, + { + "epoch": 0.286215929992754, + "grad_norm": 0.48364466428756714, + "learning_rate": 8.258921473188202e-05, + "loss": 1.3247, + "step": 5135 + }, + { + "epoch": 0.28627166824591715, + "grad_norm": 0.5114015936851501, + "learning_rate": 8.25824612031584e-05, + "loss": 1.6025, + "step": 5136 + }, + { + "epoch": 0.28632740649908034, + "grad_norm": 0.5254806876182556, + "learning_rate": 8.257570664110907e-05, + "loss": 1.7264, + "step": 5137 + }, + { + "epoch": 0.28638314475224347, + "grad_norm": 0.5384583473205566, + "learning_rate": 8.256895104594828e-05, + "loss": 1.802, + "step": 5138 + }, + { + "epoch": 0.2864388830054066, + "grad_norm": 0.5924034118652344, + "learning_rate": 8.256219441789022e-05, + "loss": 1.9493, + "step": 5139 + }, + { + "epoch": 0.2864946212585698, + "grad_norm": 0.5453627705574036, + "learning_rate": 8.255543675714923e-05, + "loss": 1.5655, + "step": 5140 + }, + { + "epoch": 0.2865503595117329, + "grad_norm": 0.535179853439331, + "learning_rate": 8.254867806393957e-05, + "loss": 1.5492, + "step": 5141 + }, + { + "epoch": 0.28660609776489604, + "grad_norm": 0.5418823957443237, + "learning_rate": 8.254191833847564e-05, + "loss": 1.7343, + "step": 5142 + }, + { + "epoch": 0.28666183601805917, + "grad_norm": 0.5330826044082642, + "learning_rate": 8.253515758097179e-05, + "loss": 1.6551, + "step": 5143 + }, + { + "epoch": 0.28671757427122235, + "grad_norm": 0.6033239960670471, + "learning_rate": 8.252839579164243e-05, + "loss": 1.8227, + "step": 5144 + }, + { + "epoch": 0.2867733125243855, + "grad_norm": 0.5882185697555542, + "learning_rate": 8.252163297070201e-05, + "loss": 1.9731, + "step": 5145 + }, + { + "epoch": 0.2868290507775486, + "grad_norm": 0.537185788154602, + "learning_rate": 8.251486911836501e-05, + "loss": 1.5992, + "step": 5146 + }, + { + "epoch": 0.2868847890307118, + "grad_norm": 0.5307870507240295, + "learning_rate": 8.250810423484592e-05, + "loss": 1.5641, + "step": 5147 + }, + { + "epoch": 0.2869405272838749, + "grad_norm": 0.5483027696609497, + "learning_rate": 8.25013383203593e-05, + "loss": 1.759, + "step": 5148 + }, + { + "epoch": 0.28699626553703805, + "grad_norm": 0.5503141283988953, + "learning_rate": 8.249457137511976e-05, + "loss": 1.7229, + "step": 5149 + }, + { + "epoch": 0.28705200379020124, + "grad_norm": 0.5450831651687622, + "learning_rate": 8.248780339934183e-05, + "loss": 1.6758, + "step": 5150 + }, + { + "epoch": 0.28710774204336437, + "grad_norm": 0.5555149912834167, + "learning_rate": 8.248103439324022e-05, + "loss": 1.7173, + "step": 5151 + }, + { + "epoch": 0.2871634802965275, + "grad_norm": 0.5960267186164856, + "learning_rate": 8.247426435702956e-05, + "loss": 1.8327, + "step": 5152 + }, + { + "epoch": 0.2872192185496907, + "grad_norm": 0.5497944951057434, + "learning_rate": 8.246749329092458e-05, + "loss": 1.6373, + "step": 5153 + }, + { + "epoch": 0.2872749568028538, + "grad_norm": 0.6035077571868896, + "learning_rate": 8.246072119514e-05, + "loss": 2.0384, + "step": 5154 + }, + { + "epoch": 0.28733069505601694, + "grad_norm": 0.5685641765594482, + "learning_rate": 8.245394806989062e-05, + "loss": 1.9093, + "step": 5155 + }, + { + "epoch": 0.28738643330918007, + "grad_norm": 0.5542479753494263, + "learning_rate": 8.244717391539124e-05, + "loss": 1.6794, + "step": 5156 + }, + { + "epoch": 0.28744217156234325, + "grad_norm": 0.5434539318084717, + "learning_rate": 8.244039873185664e-05, + "loss": 1.6624, + "step": 5157 + }, + { + "epoch": 0.2874979098155064, + "grad_norm": 0.5240741968154907, + "learning_rate": 8.243362251950177e-05, + "loss": 1.7119, + "step": 5158 + }, + { + "epoch": 0.2875536480686695, + "grad_norm": 0.5400795340538025, + "learning_rate": 8.242684527854148e-05, + "loss": 1.7379, + "step": 5159 + }, + { + "epoch": 0.2876093863218327, + "grad_norm": 0.5450997352600098, + "learning_rate": 8.242006700919072e-05, + "loss": 1.648, + "step": 5160 + }, + { + "epoch": 0.2876651245749958, + "grad_norm": 0.5497955679893494, + "learning_rate": 8.241328771166446e-05, + "loss": 1.8969, + "step": 5161 + }, + { + "epoch": 0.28772086282815895, + "grad_norm": 0.556607186794281, + "learning_rate": 8.24065073861777e-05, + "loss": 1.7941, + "step": 5162 + }, + { + "epoch": 0.28777660108132214, + "grad_norm": 0.5775546431541443, + "learning_rate": 8.239972603294546e-05, + "loss": 1.7996, + "step": 5163 + }, + { + "epoch": 0.28783233933448527, + "grad_norm": 0.5500494241714478, + "learning_rate": 8.239294365218282e-05, + "loss": 1.486, + "step": 5164 + }, + { + "epoch": 0.2878880775876484, + "grad_norm": 0.5263432860374451, + "learning_rate": 8.238616024410486e-05, + "loss": 1.8011, + "step": 5165 + }, + { + "epoch": 0.2879438158408115, + "grad_norm": 0.580796480178833, + "learning_rate": 8.237937580892674e-05, + "loss": 1.7308, + "step": 5166 + }, + { + "epoch": 0.2879995540939747, + "grad_norm": 0.5561580657958984, + "learning_rate": 8.237259034686359e-05, + "loss": 1.7732, + "step": 5167 + }, + { + "epoch": 0.28805529234713784, + "grad_norm": 0.5456521511077881, + "learning_rate": 8.236580385813062e-05, + "loss": 1.6932, + "step": 5168 + }, + { + "epoch": 0.28811103060030097, + "grad_norm": 0.5676544904708862, + "learning_rate": 8.235901634294306e-05, + "loss": 1.8033, + "step": 5169 + }, + { + "epoch": 0.28816676885346415, + "grad_norm": 0.5046932697296143, + "learning_rate": 8.235222780151616e-05, + "loss": 1.5637, + "step": 5170 + }, + { + "epoch": 0.2882225071066273, + "grad_norm": 0.5261063575744629, + "learning_rate": 8.234543823406525e-05, + "loss": 1.5763, + "step": 5171 + }, + { + "epoch": 0.2882782453597904, + "grad_norm": 0.5619118809700012, + "learning_rate": 8.23386476408056e-05, + "loss": 1.7251, + "step": 5172 + }, + { + "epoch": 0.2883339836129536, + "grad_norm": 0.5556089282035828, + "learning_rate": 8.233185602195259e-05, + "loss": 1.7168, + "step": 5173 + }, + { + "epoch": 0.2883897218661167, + "grad_norm": 0.5449663400650024, + "learning_rate": 8.232506337772163e-05, + "loss": 1.7282, + "step": 5174 + }, + { + "epoch": 0.28844546011927985, + "grad_norm": 0.5821020007133484, + "learning_rate": 8.231826970832812e-05, + "loss": 2.0267, + "step": 5175 + }, + { + "epoch": 0.28850119837244304, + "grad_norm": 0.5104268193244934, + "learning_rate": 8.231147501398753e-05, + "loss": 1.4387, + "step": 5176 + }, + { + "epoch": 0.28855693662560616, + "grad_norm": 0.548219621181488, + "learning_rate": 8.230467929491534e-05, + "loss": 1.7042, + "step": 5177 + }, + { + "epoch": 0.2886126748787693, + "grad_norm": 0.5711565017700195, + "learning_rate": 8.229788255132706e-05, + "loss": 1.6752, + "step": 5178 + }, + { + "epoch": 0.2886684131319324, + "grad_norm": 0.526942789554596, + "learning_rate": 8.229108478343827e-05, + "loss": 1.5905, + "step": 5179 + }, + { + "epoch": 0.2887241513850956, + "grad_norm": 0.5535737872123718, + "learning_rate": 8.228428599146453e-05, + "loss": 1.6857, + "step": 5180 + }, + { + "epoch": 0.28877988963825874, + "grad_norm": 0.5093039870262146, + "learning_rate": 8.227748617562147e-05, + "loss": 1.6489, + "step": 5181 + }, + { + "epoch": 0.28883562789142186, + "grad_norm": 0.5642322301864624, + "learning_rate": 8.227068533612475e-05, + "loss": 1.8709, + "step": 5182 + }, + { + "epoch": 0.28889136614458505, + "grad_norm": 0.5547685623168945, + "learning_rate": 8.226388347319004e-05, + "loss": 1.7088, + "step": 5183 + }, + { + "epoch": 0.2889471043977482, + "grad_norm": 0.5316441059112549, + "learning_rate": 8.225708058703305e-05, + "loss": 1.59, + "step": 5184 + }, + { + "epoch": 0.2890028426509113, + "grad_norm": 0.5305221080780029, + "learning_rate": 8.225027667786955e-05, + "loss": 1.4301, + "step": 5185 + }, + { + "epoch": 0.2890585809040745, + "grad_norm": 0.5498524904251099, + "learning_rate": 8.224347174591529e-05, + "loss": 1.533, + "step": 5186 + }, + { + "epoch": 0.2891143191572376, + "grad_norm": 0.5519589781761169, + "learning_rate": 8.22366657913861e-05, + "loss": 1.7171, + "step": 5187 + }, + { + "epoch": 0.28917005741040075, + "grad_norm": 0.5893858075141907, + "learning_rate": 8.222985881449783e-05, + "loss": 1.7751, + "step": 5188 + }, + { + "epoch": 0.2892257956635639, + "grad_norm": 0.5334852933883667, + "learning_rate": 8.222305081546635e-05, + "loss": 1.6905, + "step": 5189 + }, + { + "epoch": 0.28928153391672706, + "grad_norm": 0.5692505836486816, + "learning_rate": 8.221624179450757e-05, + "loss": 1.6461, + "step": 5190 + }, + { + "epoch": 0.2893372721698902, + "grad_norm": 0.5988993644714355, + "learning_rate": 8.220943175183743e-05, + "loss": 2.0131, + "step": 5191 + }, + { + "epoch": 0.2893930104230533, + "grad_norm": 0.6873819231987, + "learning_rate": 8.220262068767191e-05, + "loss": 1.977, + "step": 5192 + }, + { + "epoch": 0.2894487486762165, + "grad_norm": 0.5408362746238708, + "learning_rate": 8.219580860222701e-05, + "loss": 1.6866, + "step": 5193 + }, + { + "epoch": 0.28950448692937963, + "grad_norm": 0.8928006291389465, + "learning_rate": 8.218899549571878e-05, + "loss": 1.6639, + "step": 5194 + }, + { + "epoch": 0.28956022518254276, + "grad_norm": 0.5256812572479248, + "learning_rate": 8.218218136836331e-05, + "loss": 1.435, + "step": 5195 + }, + { + "epoch": 0.28961596343570595, + "grad_norm": 0.5350750684738159, + "learning_rate": 8.217536622037667e-05, + "loss": 1.6317, + "step": 5196 + }, + { + "epoch": 0.2896717016888691, + "grad_norm": 0.5534375309944153, + "learning_rate": 8.2168550051975e-05, + "loss": 1.7473, + "step": 5197 + }, + { + "epoch": 0.2897274399420322, + "grad_norm": 0.5433312058448792, + "learning_rate": 8.216173286337448e-05, + "loss": 1.8094, + "step": 5198 + }, + { + "epoch": 0.2897831781951954, + "grad_norm": 0.5386417508125305, + "learning_rate": 8.215491465479133e-05, + "loss": 1.5757, + "step": 5199 + }, + { + "epoch": 0.2898389164483585, + "grad_norm": 0.6519530415534973, + "learning_rate": 8.214809542644173e-05, + "loss": 1.9404, + "step": 5200 + }, + { + "epoch": 0.28989465470152165, + "grad_norm": 0.6092321872711182, + "learning_rate": 8.214127517854199e-05, + "loss": 1.8751, + "step": 5201 + }, + { + "epoch": 0.2899503929546848, + "grad_norm": 0.5904344320297241, + "learning_rate": 8.213445391130841e-05, + "loss": 1.8278, + "step": 5202 + }, + { + "epoch": 0.29000613120784796, + "grad_norm": 0.6538552045822144, + "learning_rate": 8.212763162495729e-05, + "loss": 1.683, + "step": 5203 + }, + { + "epoch": 0.2900618694610111, + "grad_norm": 0.5683111548423767, + "learning_rate": 8.212080831970503e-05, + "loss": 1.6758, + "step": 5204 + }, + { + "epoch": 0.2901176077141742, + "grad_norm": 0.5633412599563599, + "learning_rate": 8.2113983995768e-05, + "loss": 1.7229, + "step": 5205 + }, + { + "epoch": 0.2901733459673374, + "grad_norm": 0.5722443461418152, + "learning_rate": 8.210715865336263e-05, + "loss": 1.8076, + "step": 5206 + }, + { + "epoch": 0.29022908422050053, + "grad_norm": 0.562892496585846, + "learning_rate": 8.21003322927054e-05, + "loss": 1.672, + "step": 5207 + }, + { + "epoch": 0.29028482247366366, + "grad_norm": 0.5266914367675781, + "learning_rate": 8.209350491401277e-05, + "loss": 1.6009, + "step": 5208 + }, + { + "epoch": 0.29034056072682685, + "grad_norm": 0.576404869556427, + "learning_rate": 8.20866765175013e-05, + "loss": 1.8675, + "step": 5209 + }, + { + "epoch": 0.29039629897999, + "grad_norm": 0.6091673374176025, + "learning_rate": 8.207984710338752e-05, + "loss": 1.7122, + "step": 5210 + }, + { + "epoch": 0.2904520372331531, + "grad_norm": 0.590103030204773, + "learning_rate": 8.207301667188803e-05, + "loss": 1.5629, + "step": 5211 + }, + { + "epoch": 0.29050777548631623, + "grad_norm": 0.5491459369659424, + "learning_rate": 8.206618522321945e-05, + "loss": 1.6373, + "step": 5212 + }, + { + "epoch": 0.2905635137394794, + "grad_norm": 0.5361247062683105, + "learning_rate": 8.205935275759842e-05, + "loss": 1.7587, + "step": 5213 + }, + { + "epoch": 0.29061925199264255, + "grad_norm": 0.5602622628211975, + "learning_rate": 8.205251927524164e-05, + "loss": 1.6596, + "step": 5214 + }, + { + "epoch": 0.2906749902458057, + "grad_norm": 0.5763882994651794, + "learning_rate": 8.204568477636585e-05, + "loss": 1.7195, + "step": 5215 + }, + { + "epoch": 0.29073072849896886, + "grad_norm": 0.5280525088310242, + "learning_rate": 8.203884926118777e-05, + "loss": 1.6929, + "step": 5216 + }, + { + "epoch": 0.290786466752132, + "grad_norm": 0.5279143452644348, + "learning_rate": 8.203201272992419e-05, + "loss": 1.4884, + "step": 5217 + }, + { + "epoch": 0.2908422050052951, + "grad_norm": 0.5360000729560852, + "learning_rate": 8.202517518279193e-05, + "loss": 1.6383, + "step": 5218 + }, + { + "epoch": 0.2908979432584583, + "grad_norm": 0.5178120732307434, + "learning_rate": 8.201833662000781e-05, + "loss": 1.3916, + "step": 5219 + }, + { + "epoch": 0.29095368151162143, + "grad_norm": 0.5441476702690125, + "learning_rate": 8.201149704178875e-05, + "loss": 1.8316, + "step": 5220 + }, + { + "epoch": 0.29100941976478456, + "grad_norm": 0.5272539854049683, + "learning_rate": 8.200465644835165e-05, + "loss": 1.479, + "step": 5221 + }, + { + "epoch": 0.29106515801794774, + "grad_norm": 0.5858429074287415, + "learning_rate": 8.199781483991345e-05, + "loss": 1.8735, + "step": 5222 + }, + { + "epoch": 0.2911208962711109, + "grad_norm": 0.5939355492591858, + "learning_rate": 8.19909722166911e-05, + "loss": 1.8911, + "step": 5223 + }, + { + "epoch": 0.291176634524274, + "grad_norm": 0.6942164301872253, + "learning_rate": 8.198412857890166e-05, + "loss": 1.5865, + "step": 5224 + }, + { + "epoch": 0.29123237277743713, + "grad_norm": 0.5283763408660889, + "learning_rate": 8.197728392676211e-05, + "loss": 1.518, + "step": 5225 + }, + { + "epoch": 0.2912881110306003, + "grad_norm": 0.5898897051811218, + "learning_rate": 8.197043826048957e-05, + "loss": 1.4729, + "step": 5226 + }, + { + "epoch": 0.29134384928376345, + "grad_norm": 0.6161963939666748, + "learning_rate": 8.196359158030113e-05, + "loss": 1.7724, + "step": 5227 + }, + { + "epoch": 0.2913995875369266, + "grad_norm": 0.5693463683128357, + "learning_rate": 8.195674388641393e-05, + "loss": 1.7379, + "step": 5228 + }, + { + "epoch": 0.29145532579008976, + "grad_norm": 0.5397728681564331, + "learning_rate": 8.194989517904513e-05, + "loss": 1.694, + "step": 5229 + }, + { + "epoch": 0.2915110640432529, + "grad_norm": 0.5856531858444214, + "learning_rate": 8.194304545841193e-05, + "loss": 1.7607, + "step": 5230 + }, + { + "epoch": 0.291566802296416, + "grad_norm": 0.5777943730354309, + "learning_rate": 8.19361947247316e-05, + "loss": 1.7321, + "step": 5231 + }, + { + "epoch": 0.2916225405495792, + "grad_norm": 0.5896830558776855, + "learning_rate": 8.192934297822133e-05, + "loss": 1.7183, + "step": 5232 + }, + { + "epoch": 0.29167827880274233, + "grad_norm": 0.6119521260261536, + "learning_rate": 8.192249021909847e-05, + "loss": 1.9229, + "step": 5233 + }, + { + "epoch": 0.29173401705590546, + "grad_norm": 0.5776544213294983, + "learning_rate": 8.191563644758037e-05, + "loss": 1.8151, + "step": 5234 + }, + { + "epoch": 0.2917897553090686, + "grad_norm": 0.510097086429596, + "learning_rate": 8.190878166388435e-05, + "loss": 1.6619, + "step": 5235 + }, + { + "epoch": 0.2918454935622318, + "grad_norm": 0.5378518104553223, + "learning_rate": 8.19019258682278e-05, + "loss": 1.8347, + "step": 5236 + }, + { + "epoch": 0.2919012318153949, + "grad_norm": 0.5934120416641235, + "learning_rate": 8.189506906082818e-05, + "loss": 1.7583, + "step": 5237 + }, + { + "epoch": 0.29195697006855803, + "grad_norm": 0.49861982464790344, + "learning_rate": 8.188821124190293e-05, + "loss": 1.4644, + "step": 5238 + }, + { + "epoch": 0.2920127083217212, + "grad_norm": 0.5318624377250671, + "learning_rate": 8.188135241166953e-05, + "loss": 1.6562, + "step": 5239 + }, + { + "epoch": 0.29206844657488434, + "grad_norm": 0.5517171621322632, + "learning_rate": 8.187449257034552e-05, + "loss": 1.6493, + "step": 5240 + }, + { + "epoch": 0.2921241848280475, + "grad_norm": 0.5400835275650024, + "learning_rate": 8.186763171814845e-05, + "loss": 1.5672, + "step": 5241 + }, + { + "epoch": 0.29217992308121066, + "grad_norm": 0.5250990986824036, + "learning_rate": 8.186076985529589e-05, + "loss": 1.6091, + "step": 5242 + }, + { + "epoch": 0.2922356613343738, + "grad_norm": 0.5855765342712402, + "learning_rate": 8.18539069820055e-05, + "loss": 1.8457, + "step": 5243 + }, + { + "epoch": 0.2922913995875369, + "grad_norm": 0.6245700716972351, + "learning_rate": 8.184704309849487e-05, + "loss": 1.5562, + "step": 5244 + }, + { + "epoch": 0.2923471378407001, + "grad_norm": 0.583342432975769, + "learning_rate": 8.184017820498173e-05, + "loss": 1.8421, + "step": 5245 + }, + { + "epoch": 0.29240287609386323, + "grad_norm": 0.576387345790863, + "learning_rate": 8.183331230168377e-05, + "loss": 1.7761, + "step": 5246 + }, + { + "epoch": 0.29245861434702636, + "grad_norm": 0.5464752316474915, + "learning_rate": 8.182644538881873e-05, + "loss": 1.6677, + "step": 5247 + }, + { + "epoch": 0.2925143526001895, + "grad_norm": 0.602606475353241, + "learning_rate": 8.181957746660445e-05, + "loss": 2.0468, + "step": 5248 + }, + { + "epoch": 0.29257009085335267, + "grad_norm": 0.535839855670929, + "learning_rate": 8.181270853525866e-05, + "loss": 1.5903, + "step": 5249 + }, + { + "epoch": 0.2926258291065158, + "grad_norm": 0.5617656707763672, + "learning_rate": 8.180583859499923e-05, + "loss": 1.6818, + "step": 5250 + }, + { + "epoch": 0.29268156735967893, + "grad_norm": 0.5979596972465515, + "learning_rate": 8.179896764604407e-05, + "loss": 1.7915, + "step": 5251 + }, + { + "epoch": 0.2927373056128421, + "grad_norm": 0.5312914848327637, + "learning_rate": 8.179209568861104e-05, + "loss": 1.4523, + "step": 5252 + }, + { + "epoch": 0.29279304386600524, + "grad_norm": 0.5243698358535767, + "learning_rate": 8.178522272291809e-05, + "loss": 1.5611, + "step": 5253 + }, + { + "epoch": 0.29284878211916837, + "grad_norm": 0.5564961433410645, + "learning_rate": 8.17783487491832e-05, + "loss": 1.7228, + "step": 5254 + }, + { + "epoch": 0.29290452037233156, + "grad_norm": 0.5704841613769531, + "learning_rate": 8.177147376762437e-05, + "loss": 1.8324, + "step": 5255 + }, + { + "epoch": 0.2929602586254947, + "grad_norm": 0.5011201500892639, + "learning_rate": 8.176459777845964e-05, + "loss": 1.6782, + "step": 5256 + }, + { + "epoch": 0.2930159968786578, + "grad_norm": 0.4964855909347534, + "learning_rate": 8.175772078190707e-05, + "loss": 1.4567, + "step": 5257 + }, + { + "epoch": 0.29307173513182094, + "grad_norm": 0.547637403011322, + "learning_rate": 8.175084277818472e-05, + "loss": 1.6129, + "step": 5258 + }, + { + "epoch": 0.2931274733849841, + "grad_norm": 0.5082324743270874, + "learning_rate": 8.174396376751079e-05, + "loss": 1.5253, + "step": 5259 + }, + { + "epoch": 0.29318321163814726, + "grad_norm": 0.535663366317749, + "learning_rate": 8.173708375010342e-05, + "loss": 1.574, + "step": 5260 + }, + { + "epoch": 0.2932389498913104, + "grad_norm": 0.5733945965766907, + "learning_rate": 8.173020272618078e-05, + "loss": 1.8022, + "step": 5261 + }, + { + "epoch": 0.29329468814447357, + "grad_norm": 0.5937253832817078, + "learning_rate": 8.172332069596111e-05, + "loss": 1.952, + "step": 5262 + }, + { + "epoch": 0.2933504263976367, + "grad_norm": 0.5622910261154175, + "learning_rate": 8.171643765966266e-05, + "loss": 1.6838, + "step": 5263 + }, + { + "epoch": 0.29340616465079983, + "grad_norm": 0.5633754730224609, + "learning_rate": 8.170955361750373e-05, + "loss": 1.8205, + "step": 5264 + }, + { + "epoch": 0.293461902903963, + "grad_norm": 0.5639583468437195, + "learning_rate": 8.170266856970264e-05, + "loss": 1.6995, + "step": 5265 + }, + { + "epoch": 0.29351764115712614, + "grad_norm": 0.5767412781715393, + "learning_rate": 8.169578251647775e-05, + "loss": 1.8193, + "step": 5266 + }, + { + "epoch": 0.29357337941028927, + "grad_norm": 0.5323848128318787, + "learning_rate": 8.168889545804743e-05, + "loss": 1.6137, + "step": 5267 + }, + { + "epoch": 0.29362911766345245, + "grad_norm": 0.5105542540550232, + "learning_rate": 8.16820073946301e-05, + "loss": 1.3883, + "step": 5268 + }, + { + "epoch": 0.2936848559166156, + "grad_norm": 0.5348597168922424, + "learning_rate": 8.167511832644423e-05, + "loss": 1.7465, + "step": 5269 + }, + { + "epoch": 0.2937405941697787, + "grad_norm": 0.5634239315986633, + "learning_rate": 8.166822825370828e-05, + "loss": 1.8121, + "step": 5270 + }, + { + "epoch": 0.29379633242294184, + "grad_norm": 0.5704219937324524, + "learning_rate": 8.166133717664075e-05, + "loss": 1.8007, + "step": 5271 + }, + { + "epoch": 0.293852070676105, + "grad_norm": 0.5514686703681946, + "learning_rate": 8.165444509546023e-05, + "loss": 1.7627, + "step": 5272 + }, + { + "epoch": 0.29390780892926816, + "grad_norm": 0.5763065218925476, + "learning_rate": 8.164755201038525e-05, + "loss": 1.8668, + "step": 5273 + }, + { + "epoch": 0.2939635471824313, + "grad_norm": 0.5290045738220215, + "learning_rate": 8.164065792163445e-05, + "loss": 1.6992, + "step": 5274 + }, + { + "epoch": 0.29401928543559447, + "grad_norm": 0.5327118039131165, + "learning_rate": 8.163376282942645e-05, + "loss": 1.6882, + "step": 5275 + }, + { + "epoch": 0.2940750236887576, + "grad_norm": 0.5230002403259277, + "learning_rate": 8.162686673397995e-05, + "loss": 1.6314, + "step": 5276 + }, + { + "epoch": 0.2941307619419207, + "grad_norm": 0.5596842765808105, + "learning_rate": 8.161996963551361e-05, + "loss": 1.8543, + "step": 5277 + }, + { + "epoch": 0.2941865001950839, + "grad_norm": 0.4837280809879303, + "learning_rate": 8.16130715342462e-05, + "loss": 1.407, + "step": 5278 + }, + { + "epoch": 0.29424223844824704, + "grad_norm": 0.5188647508621216, + "learning_rate": 8.160617243039648e-05, + "loss": 1.6469, + "step": 5279 + }, + { + "epoch": 0.29429797670141017, + "grad_norm": 0.5345882177352905, + "learning_rate": 8.159927232418325e-05, + "loss": 1.762, + "step": 5280 + }, + { + "epoch": 0.2943537149545733, + "grad_norm": 0.6385248303413391, + "learning_rate": 8.159237121582532e-05, + "loss": 1.725, + "step": 5281 + }, + { + "epoch": 0.2944094532077365, + "grad_norm": 0.532394289970398, + "learning_rate": 8.158546910554159e-05, + "loss": 1.59, + "step": 5282 + }, + { + "epoch": 0.2944651914608996, + "grad_norm": 0.5918634533882141, + "learning_rate": 8.157856599355093e-05, + "loss": 1.8722, + "step": 5283 + }, + { + "epoch": 0.29452092971406274, + "grad_norm": 0.5643036365509033, + "learning_rate": 8.157166188007228e-05, + "loss": 1.6608, + "step": 5284 + }, + { + "epoch": 0.2945766679672259, + "grad_norm": 0.5480226874351501, + "learning_rate": 8.156475676532458e-05, + "loss": 1.6745, + "step": 5285 + }, + { + "epoch": 0.29463240622038905, + "grad_norm": 0.5562642216682434, + "learning_rate": 8.155785064952683e-05, + "loss": 1.9036, + "step": 5286 + }, + { + "epoch": 0.2946881444735522, + "grad_norm": 0.5737085938453674, + "learning_rate": 8.155094353289807e-05, + "loss": 1.6749, + "step": 5287 + }, + { + "epoch": 0.29474388272671537, + "grad_norm": 0.537407398223877, + "learning_rate": 8.154403541565732e-05, + "loss": 1.5855, + "step": 5288 + }, + { + "epoch": 0.2947996209798785, + "grad_norm": 0.5637186169624329, + "learning_rate": 8.153712629802369e-05, + "loss": 1.6667, + "step": 5289 + }, + { + "epoch": 0.2948553592330416, + "grad_norm": 0.587086021900177, + "learning_rate": 8.153021618021628e-05, + "loss": 1.709, + "step": 5290 + }, + { + "epoch": 0.2949110974862048, + "grad_norm": 0.5255305767059326, + "learning_rate": 8.152330506245425e-05, + "loss": 1.4982, + "step": 5291 + }, + { + "epoch": 0.29496683573936794, + "grad_norm": 0.5582296848297119, + "learning_rate": 8.151639294495678e-05, + "loss": 1.6915, + "step": 5292 + }, + { + "epoch": 0.29502257399253107, + "grad_norm": 0.5476033687591553, + "learning_rate": 8.150947982794307e-05, + "loss": 1.4827, + "step": 5293 + }, + { + "epoch": 0.2950783122456942, + "grad_norm": 0.548763632774353, + "learning_rate": 8.150256571163238e-05, + "loss": 1.805, + "step": 5294 + }, + { + "epoch": 0.2951340504988574, + "grad_norm": 0.58586585521698, + "learning_rate": 8.149565059624398e-05, + "loss": 1.7433, + "step": 5295 + }, + { + "epoch": 0.2951897887520205, + "grad_norm": 0.5618621110916138, + "learning_rate": 8.148873448199717e-05, + "loss": 1.7681, + "step": 5296 + }, + { + "epoch": 0.29524552700518364, + "grad_norm": 0.5388831496238708, + "learning_rate": 8.148181736911129e-05, + "loss": 1.582, + "step": 5297 + }, + { + "epoch": 0.2953012652583468, + "grad_norm": 0.5742696523666382, + "learning_rate": 8.147489925780572e-05, + "loss": 1.8182, + "step": 5298 + }, + { + "epoch": 0.29535700351150995, + "grad_norm": 0.5271889567375183, + "learning_rate": 8.146798014829986e-05, + "loss": 1.4823, + "step": 5299 + }, + { + "epoch": 0.2954127417646731, + "grad_norm": 0.5565046072006226, + "learning_rate": 8.146106004081315e-05, + "loss": 1.6328, + "step": 5300 + }, + { + "epoch": 0.29546848001783627, + "grad_norm": 0.5434616804122925, + "learning_rate": 8.145413893556503e-05, + "loss": 1.5871, + "step": 5301 + }, + { + "epoch": 0.2955242182709994, + "grad_norm": 0.5343239903450012, + "learning_rate": 8.144721683277504e-05, + "loss": 1.6328, + "step": 5302 + }, + { + "epoch": 0.2955799565241625, + "grad_norm": 0.5372942686080933, + "learning_rate": 8.144029373266264e-05, + "loss": 1.6885, + "step": 5303 + }, + { + "epoch": 0.29563569477732565, + "grad_norm": 0.5881915092468262, + "learning_rate": 8.143336963544746e-05, + "loss": 1.8579, + "step": 5304 + }, + { + "epoch": 0.29569143303048884, + "grad_norm": 0.5892425179481506, + "learning_rate": 8.142644454134905e-05, + "loss": 1.8771, + "step": 5305 + }, + { + "epoch": 0.29574717128365197, + "grad_norm": 0.5286465287208557, + "learning_rate": 8.141951845058707e-05, + "loss": 1.6766, + "step": 5306 + }, + { + "epoch": 0.2958029095368151, + "grad_norm": 0.5843679904937744, + "learning_rate": 8.141259136338113e-05, + "loss": 1.7359, + "step": 5307 + }, + { + "epoch": 0.2958586477899783, + "grad_norm": 0.6178736090660095, + "learning_rate": 8.140566327995094e-05, + "loss": 1.9672, + "step": 5308 + }, + { + "epoch": 0.2959143860431414, + "grad_norm": 0.5524381399154663, + "learning_rate": 8.139873420051623e-05, + "loss": 1.5947, + "step": 5309 + }, + { + "epoch": 0.29597012429630454, + "grad_norm": 0.5591756105422974, + "learning_rate": 8.139180412529674e-05, + "loss": 1.7245, + "step": 5310 + }, + { + "epoch": 0.2960258625494677, + "grad_norm": 0.5642113089561462, + "learning_rate": 8.138487305451224e-05, + "loss": 1.7156, + "step": 5311 + }, + { + "epoch": 0.29608160080263085, + "grad_norm": 0.5767959356307983, + "learning_rate": 8.137794098838257e-05, + "loss": 1.78, + "step": 5312 + }, + { + "epoch": 0.296137339055794, + "grad_norm": 0.5422171950340271, + "learning_rate": 8.137100792712755e-05, + "loss": 1.9258, + "step": 5313 + }, + { + "epoch": 0.29619307730895716, + "grad_norm": 0.5860824584960938, + "learning_rate": 8.136407387096704e-05, + "loss": 1.7132, + "step": 5314 + }, + { + "epoch": 0.2962488155621203, + "grad_norm": 0.6460077166557312, + "learning_rate": 8.135713882012102e-05, + "loss": 1.8024, + "step": 5315 + }, + { + "epoch": 0.2963045538152834, + "grad_norm": 0.5744182467460632, + "learning_rate": 8.135020277480934e-05, + "loss": 1.7025, + "step": 5316 + }, + { + "epoch": 0.29636029206844655, + "grad_norm": 0.560867965221405, + "learning_rate": 8.134326573525202e-05, + "loss": 1.7402, + "step": 5317 + }, + { + "epoch": 0.29641603032160974, + "grad_norm": 0.5005339980125427, + "learning_rate": 8.133632770166907e-05, + "loss": 1.585, + "step": 5318 + }, + { + "epoch": 0.29647176857477286, + "grad_norm": 0.5216720700263977, + "learning_rate": 8.13293886742805e-05, + "loss": 1.7313, + "step": 5319 + }, + { + "epoch": 0.296527506827936, + "grad_norm": 0.5353510975837708, + "learning_rate": 8.132244865330638e-05, + "loss": 1.7854, + "step": 5320 + }, + { + "epoch": 0.2965832450810992, + "grad_norm": 0.5222895741462708, + "learning_rate": 8.131550763896682e-05, + "loss": 1.6821, + "step": 5321 + }, + { + "epoch": 0.2966389833342623, + "grad_norm": 0.5571734309196472, + "learning_rate": 8.130856563148193e-05, + "loss": 1.6151, + "step": 5322 + }, + { + "epoch": 0.29669472158742544, + "grad_norm": 0.5494416952133179, + "learning_rate": 8.130162263107189e-05, + "loss": 1.7497, + "step": 5323 + }, + { + "epoch": 0.2967504598405886, + "grad_norm": 0.5263827443122864, + "learning_rate": 8.129467863795688e-05, + "loss": 1.7157, + "step": 5324 + }, + { + "epoch": 0.29680619809375175, + "grad_norm": 0.5756681561470032, + "learning_rate": 8.128773365235711e-05, + "loss": 1.6488, + "step": 5325 + }, + { + "epoch": 0.2968619363469149, + "grad_norm": 0.5204091668128967, + "learning_rate": 8.128078767449287e-05, + "loss": 1.6868, + "step": 5326 + }, + { + "epoch": 0.296917674600078, + "grad_norm": 0.5748211145401001, + "learning_rate": 8.127384070458442e-05, + "loss": 1.9352, + "step": 5327 + }, + { + "epoch": 0.2969734128532412, + "grad_norm": 0.5648884773254395, + "learning_rate": 8.126689274285207e-05, + "loss": 1.9085, + "step": 5328 + }, + { + "epoch": 0.2970291511064043, + "grad_norm": 0.5396182537078857, + "learning_rate": 8.125994378951619e-05, + "loss": 1.715, + "step": 5329 + }, + { + "epoch": 0.29708488935956745, + "grad_norm": 0.5755982398986816, + "learning_rate": 8.125299384479714e-05, + "loss": 1.7472, + "step": 5330 + }, + { + "epoch": 0.29714062761273063, + "grad_norm": 0.5721607804298401, + "learning_rate": 8.124604290891535e-05, + "loss": 1.8646, + "step": 5331 + }, + { + "epoch": 0.29719636586589376, + "grad_norm": 0.5612310171127319, + "learning_rate": 8.123909098209126e-05, + "loss": 1.6506, + "step": 5332 + }, + { + "epoch": 0.2972521041190569, + "grad_norm": 0.5630115866661072, + "learning_rate": 8.123213806454535e-05, + "loss": 1.805, + "step": 5333 + }, + { + "epoch": 0.2973078423722201, + "grad_norm": 0.5319987535476685, + "learning_rate": 8.122518415649808e-05, + "loss": 1.6501, + "step": 5334 + }, + { + "epoch": 0.2973635806253832, + "grad_norm": 0.5346727967262268, + "learning_rate": 8.121822925817006e-05, + "loss": 1.7944, + "step": 5335 + }, + { + "epoch": 0.29741931887854633, + "grad_norm": 0.5356037616729736, + "learning_rate": 8.121127336978183e-05, + "loss": 1.5578, + "step": 5336 + }, + { + "epoch": 0.2974750571317095, + "grad_norm": 0.5593723058700562, + "learning_rate": 8.120431649155396e-05, + "loss": 1.7118, + "step": 5337 + }, + { + "epoch": 0.29753079538487265, + "grad_norm": 0.5361452102661133, + "learning_rate": 8.11973586237071e-05, + "loss": 1.7363, + "step": 5338 + }, + { + "epoch": 0.2975865336380358, + "grad_norm": 0.5503700971603394, + "learning_rate": 8.119039976646192e-05, + "loss": 1.74, + "step": 5339 + }, + { + "epoch": 0.2976422718911989, + "grad_norm": 0.5040326714515686, + "learning_rate": 8.118343992003913e-05, + "loss": 1.5712, + "step": 5340 + }, + { + "epoch": 0.2976980101443621, + "grad_norm": 0.5251342058181763, + "learning_rate": 8.117647908465942e-05, + "loss": 1.5346, + "step": 5341 + }, + { + "epoch": 0.2977537483975252, + "grad_norm": 0.5664347410202026, + "learning_rate": 8.116951726054358e-05, + "loss": 2.0871, + "step": 5342 + }, + { + "epoch": 0.29780948665068835, + "grad_norm": 0.5798686742782593, + "learning_rate": 8.116255444791237e-05, + "loss": 1.5362, + "step": 5343 + }, + { + "epoch": 0.29786522490385153, + "grad_norm": 0.5248550772666931, + "learning_rate": 8.115559064698662e-05, + "loss": 1.5788, + "step": 5344 + }, + { + "epoch": 0.29792096315701466, + "grad_norm": 0.6149808764457703, + "learning_rate": 8.11486258579872e-05, + "loss": 1.7055, + "step": 5345 + }, + { + "epoch": 0.2979767014101778, + "grad_norm": 0.6035127639770508, + "learning_rate": 8.114166008113498e-05, + "loss": 1.8135, + "step": 5346 + }, + { + "epoch": 0.298032439663341, + "grad_norm": 0.5967592000961304, + "learning_rate": 8.113469331665085e-05, + "loss": 1.655, + "step": 5347 + }, + { + "epoch": 0.2980881779165041, + "grad_norm": 0.5948666334152222, + "learning_rate": 8.112772556475579e-05, + "loss": 2.0929, + "step": 5348 + }, + { + "epoch": 0.29814391616966723, + "grad_norm": 0.5955588221549988, + "learning_rate": 8.112075682567075e-05, + "loss": 1.6594, + "step": 5349 + }, + { + "epoch": 0.29819965442283036, + "grad_norm": 0.5304718017578125, + "learning_rate": 8.111378709961676e-05, + "loss": 1.7254, + "step": 5350 + }, + { + "epoch": 0.29825539267599355, + "grad_norm": 0.5426492691040039, + "learning_rate": 8.110681638681485e-05, + "loss": 1.7559, + "step": 5351 + }, + { + "epoch": 0.2983111309291567, + "grad_norm": 0.6616886258125305, + "learning_rate": 8.109984468748608e-05, + "loss": 1.6271, + "step": 5352 + }, + { + "epoch": 0.2983668691823198, + "grad_norm": 0.537685751914978, + "learning_rate": 8.109287200185157e-05, + "loss": 1.6231, + "step": 5353 + }, + { + "epoch": 0.298422607435483, + "grad_norm": 0.5190281867980957, + "learning_rate": 8.108589833013245e-05, + "loss": 1.5838, + "step": 5354 + }, + { + "epoch": 0.2984783456886461, + "grad_norm": 0.5232527852058411, + "learning_rate": 8.107892367254986e-05, + "loss": 1.5132, + "step": 5355 + }, + { + "epoch": 0.29853408394180925, + "grad_norm": 0.5797703266143799, + "learning_rate": 8.107194802932503e-05, + "loss": 1.811, + "step": 5356 + }, + { + "epoch": 0.29858982219497243, + "grad_norm": 0.5324226021766663, + "learning_rate": 8.106497140067916e-05, + "loss": 1.8477, + "step": 5357 + }, + { + "epoch": 0.29864556044813556, + "grad_norm": 0.5274566411972046, + "learning_rate": 8.105799378683353e-05, + "loss": 1.5521, + "step": 5358 + }, + { + "epoch": 0.2987012987012987, + "grad_norm": 0.5862823128700256, + "learning_rate": 8.10510151880094e-05, + "loss": 1.6123, + "step": 5359 + }, + { + "epoch": 0.2987570369544619, + "grad_norm": 0.5503446459770203, + "learning_rate": 8.104403560442813e-05, + "loss": 1.6369, + "step": 5360 + }, + { + "epoch": 0.298812775207625, + "grad_norm": 0.5560075044631958, + "learning_rate": 8.103705503631104e-05, + "loss": 1.762, + "step": 5361 + }, + { + "epoch": 0.29886851346078813, + "grad_norm": 0.5699611306190491, + "learning_rate": 8.103007348387952e-05, + "loss": 1.9896, + "step": 5362 + }, + { + "epoch": 0.29892425171395126, + "grad_norm": 0.5774125456809998, + "learning_rate": 8.102309094735498e-05, + "loss": 1.7463, + "step": 5363 + }, + { + "epoch": 0.29897998996711445, + "grad_norm": 0.5046089887619019, + "learning_rate": 8.101610742695889e-05, + "loss": 1.4381, + "step": 5364 + }, + { + "epoch": 0.2990357282202776, + "grad_norm": 0.5611773133277893, + "learning_rate": 8.100912292291269e-05, + "loss": 1.8118, + "step": 5365 + }, + { + "epoch": 0.2990914664734407, + "grad_norm": 0.5826941132545471, + "learning_rate": 8.100213743543793e-05, + "loss": 1.7309, + "step": 5366 + }, + { + "epoch": 0.2991472047266039, + "grad_norm": 0.5598444938659668, + "learning_rate": 8.099515096475611e-05, + "loss": 1.7422, + "step": 5367 + }, + { + "epoch": 0.299202942979767, + "grad_norm": 0.5191280841827393, + "learning_rate": 8.098816351108881e-05, + "loss": 1.5088, + "step": 5368 + }, + { + "epoch": 0.29925868123293015, + "grad_norm": 0.589454174041748, + "learning_rate": 8.098117507465765e-05, + "loss": 1.4643, + "step": 5369 + }, + { + "epoch": 0.29931441948609333, + "grad_norm": 0.5066042542457581, + "learning_rate": 8.097418565568424e-05, + "loss": 1.3811, + "step": 5370 + }, + { + "epoch": 0.29937015773925646, + "grad_norm": 0.5717688798904419, + "learning_rate": 8.096719525439026e-05, + "loss": 1.5929, + "step": 5371 + }, + { + "epoch": 0.2994258959924196, + "grad_norm": 0.5810229778289795, + "learning_rate": 8.096020387099739e-05, + "loss": 1.5428, + "step": 5372 + }, + { + "epoch": 0.2994816342455827, + "grad_norm": 0.5295297503471375, + "learning_rate": 8.095321150572738e-05, + "loss": 1.5148, + "step": 5373 + }, + { + "epoch": 0.2995373724987459, + "grad_norm": 0.6027771234512329, + "learning_rate": 8.094621815880197e-05, + "loss": 1.898, + "step": 5374 + }, + { + "epoch": 0.29959311075190903, + "grad_norm": 0.5107868909835815, + "learning_rate": 8.093922383044293e-05, + "loss": 1.4073, + "step": 5375 + }, + { + "epoch": 0.29964884900507216, + "grad_norm": 0.5989086031913757, + "learning_rate": 8.09322285208721e-05, + "loss": 1.7551, + "step": 5376 + }, + { + "epoch": 0.29970458725823534, + "grad_norm": 0.5706072449684143, + "learning_rate": 8.092523223031134e-05, + "loss": 1.8272, + "step": 5377 + }, + { + "epoch": 0.2997603255113985, + "grad_norm": 0.5593813061714172, + "learning_rate": 8.091823495898251e-05, + "loss": 1.6346, + "step": 5378 + }, + { + "epoch": 0.2998160637645616, + "grad_norm": 0.5510803461074829, + "learning_rate": 8.091123670710754e-05, + "loss": 1.7025, + "step": 5379 + }, + { + "epoch": 0.2998718020177248, + "grad_norm": 0.5860506892204285, + "learning_rate": 8.090423747490836e-05, + "loss": 1.6895, + "step": 5380 + }, + { + "epoch": 0.2999275402708879, + "grad_norm": 0.5655683875083923, + "learning_rate": 8.089723726260696e-05, + "loss": 1.8338, + "step": 5381 + }, + { + "epoch": 0.29998327852405104, + "grad_norm": 0.5369336605072021, + "learning_rate": 8.089023607042534e-05, + "loss": 1.65, + "step": 5382 + }, + { + "epoch": 0.30003901677721423, + "grad_norm": 0.5484170317649841, + "learning_rate": 8.088323389858552e-05, + "loss": 1.433, + "step": 5383 + }, + { + "epoch": 0.30009475503037736, + "grad_norm": 0.5139251947402954, + "learning_rate": 8.08762307473096e-05, + "loss": 1.3703, + "step": 5384 + }, + { + "epoch": 0.3001504932835405, + "grad_norm": 0.6160516142845154, + "learning_rate": 8.086922661681966e-05, + "loss": 2.1215, + "step": 5385 + }, + { + "epoch": 0.3002062315367036, + "grad_norm": 0.5299053192138672, + "learning_rate": 8.086222150733782e-05, + "loss": 1.5703, + "step": 5386 + }, + { + "epoch": 0.3002619697898668, + "grad_norm": 0.5320441722869873, + "learning_rate": 8.085521541908627e-05, + "loss": 1.5785, + "step": 5387 + }, + { + "epoch": 0.30031770804302993, + "grad_norm": 0.5633600354194641, + "learning_rate": 8.084820835228717e-05, + "loss": 1.799, + "step": 5388 + }, + { + "epoch": 0.30037344629619306, + "grad_norm": 0.5468734502792358, + "learning_rate": 8.084120030716275e-05, + "loss": 1.6782, + "step": 5389 + }, + { + "epoch": 0.30042918454935624, + "grad_norm": 0.5711122751235962, + "learning_rate": 8.083419128393528e-05, + "loss": 1.6544, + "step": 5390 + }, + { + "epoch": 0.30048492280251937, + "grad_norm": 0.5407732129096985, + "learning_rate": 8.082718128282705e-05, + "loss": 1.7962, + "step": 5391 + }, + { + "epoch": 0.3005406610556825, + "grad_norm": 0.5521290898323059, + "learning_rate": 8.082017030406037e-05, + "loss": 1.7551, + "step": 5392 + }, + { + "epoch": 0.3005963993088457, + "grad_norm": 0.5816917419433594, + "learning_rate": 8.081315834785756e-05, + "loss": 1.8789, + "step": 5393 + }, + { + "epoch": 0.3006521375620088, + "grad_norm": 0.5271922945976257, + "learning_rate": 8.080614541444103e-05, + "loss": 1.7545, + "step": 5394 + }, + { + "epoch": 0.30070787581517194, + "grad_norm": 0.543911337852478, + "learning_rate": 8.079913150403318e-05, + "loss": 1.6059, + "step": 5395 + }, + { + "epoch": 0.3007636140683351, + "grad_norm": 0.547044038772583, + "learning_rate": 8.079211661685644e-05, + "loss": 2.0125, + "step": 5396 + }, + { + "epoch": 0.30081935232149826, + "grad_norm": 0.6385172605514526, + "learning_rate": 8.07851007531333e-05, + "loss": 1.8713, + "step": 5397 + }, + { + "epoch": 0.3008750905746614, + "grad_norm": 0.5882077813148499, + "learning_rate": 8.077808391308626e-05, + "loss": 1.6547, + "step": 5398 + }, + { + "epoch": 0.3009308288278245, + "grad_norm": 0.5390593409538269, + "learning_rate": 8.077106609693784e-05, + "loss": 1.5186, + "step": 5399 + }, + { + "epoch": 0.3009865670809877, + "grad_norm": 0.5759447813034058, + "learning_rate": 8.076404730491061e-05, + "loss": 1.8402, + "step": 5400 + }, + { + "epoch": 0.30104230533415083, + "grad_norm": 0.5196195244789124, + "learning_rate": 8.075702753722718e-05, + "loss": 1.656, + "step": 5401 + }, + { + "epoch": 0.30109804358731396, + "grad_norm": 0.5357980728149414, + "learning_rate": 8.075000679411014e-05, + "loss": 1.6743, + "step": 5402 + }, + { + "epoch": 0.30115378184047714, + "grad_norm": 0.5370086431503296, + "learning_rate": 8.074298507578218e-05, + "loss": 1.7567, + "step": 5403 + }, + { + "epoch": 0.30120952009364027, + "grad_norm": 0.5173280835151672, + "learning_rate": 8.073596238246599e-05, + "loss": 1.5783, + "step": 5404 + }, + { + "epoch": 0.3012652583468034, + "grad_norm": 0.5284645557403564, + "learning_rate": 8.072893871438428e-05, + "loss": 1.7135, + "step": 5405 + }, + { + "epoch": 0.3013209965999666, + "grad_norm": 0.5838817954063416, + "learning_rate": 8.072191407175976e-05, + "loss": 1.8845, + "step": 5406 + }, + { + "epoch": 0.3013767348531297, + "grad_norm": 0.5520975589752197, + "learning_rate": 8.071488845481528e-05, + "loss": 1.6139, + "step": 5407 + }, + { + "epoch": 0.30143247310629284, + "grad_norm": 0.5155717730522156, + "learning_rate": 8.07078618637736e-05, + "loss": 1.4973, + "step": 5408 + }, + { + "epoch": 0.30148821135945597, + "grad_norm": 0.5581832528114319, + "learning_rate": 8.070083429885758e-05, + "loss": 1.7224, + "step": 5409 + }, + { + "epoch": 0.30154394961261916, + "grad_norm": 0.5734993815422058, + "learning_rate": 8.069380576029011e-05, + "loss": 1.508, + "step": 5410 + }, + { + "epoch": 0.3015996878657823, + "grad_norm": 0.5819764733314514, + "learning_rate": 8.068677624829406e-05, + "loss": 2.0365, + "step": 5411 + }, + { + "epoch": 0.3016554261189454, + "grad_norm": 0.538995623588562, + "learning_rate": 8.067974576309241e-05, + "loss": 1.8489, + "step": 5412 + }, + { + "epoch": 0.3017111643721086, + "grad_norm": 0.5447677373886108, + "learning_rate": 8.067271430490809e-05, + "loss": 1.7361, + "step": 5413 + }, + { + "epoch": 0.3017669026252717, + "grad_norm": 0.5370633602142334, + "learning_rate": 8.066568187396409e-05, + "loss": 1.5648, + "step": 5414 + }, + { + "epoch": 0.30182264087843486, + "grad_norm": 0.5709346532821655, + "learning_rate": 8.065864847048346e-05, + "loss": 1.7308, + "step": 5415 + }, + { + "epoch": 0.30187837913159804, + "grad_norm": 0.5642514824867249, + "learning_rate": 8.065161409468925e-05, + "loss": 1.9456, + "step": 5416 + }, + { + "epoch": 0.30193411738476117, + "grad_norm": 0.5522916316986084, + "learning_rate": 8.064457874680457e-05, + "loss": 1.8213, + "step": 5417 + }, + { + "epoch": 0.3019898556379243, + "grad_norm": 0.5913909077644348, + "learning_rate": 8.06375424270525e-05, + "loss": 1.8837, + "step": 5418 + }, + { + "epoch": 0.3020455938910874, + "grad_norm": 0.596079409122467, + "learning_rate": 8.063050513565624e-05, + "loss": 1.9783, + "step": 5419 + }, + { + "epoch": 0.3021013321442506, + "grad_norm": 0.5493654012680054, + "learning_rate": 8.062346687283892e-05, + "loss": 1.8092, + "step": 5420 + }, + { + "epoch": 0.30215707039741374, + "grad_norm": 0.5493000745773315, + "learning_rate": 8.06164276388238e-05, + "loss": 1.6994, + "step": 5421 + }, + { + "epoch": 0.30221280865057687, + "grad_norm": 0.4986167550086975, + "learning_rate": 8.060938743383408e-05, + "loss": 1.5504, + "step": 5422 + }, + { + "epoch": 0.30226854690374005, + "grad_norm": 0.5836266875267029, + "learning_rate": 8.060234625809306e-05, + "loss": 1.8898, + "step": 5423 + }, + { + "epoch": 0.3023242851569032, + "grad_norm": 0.5557297468185425, + "learning_rate": 8.059530411182406e-05, + "loss": 1.7518, + "step": 5424 + }, + { + "epoch": 0.3023800234100663, + "grad_norm": 0.5643293261528015, + "learning_rate": 8.058826099525039e-05, + "loss": 1.92, + "step": 5425 + }, + { + "epoch": 0.3024357616632295, + "grad_norm": 0.5600275993347168, + "learning_rate": 8.058121690859541e-05, + "loss": 1.7421, + "step": 5426 + }, + { + "epoch": 0.3024914999163926, + "grad_norm": 0.5405864119529724, + "learning_rate": 8.057417185208254e-05, + "loss": 1.7487, + "step": 5427 + }, + { + "epoch": 0.30254723816955575, + "grad_norm": 0.5578258633613586, + "learning_rate": 8.056712582593519e-05, + "loss": 1.7268, + "step": 5428 + }, + { + "epoch": 0.30260297642271894, + "grad_norm": 0.5377827286720276, + "learning_rate": 8.056007883037682e-05, + "loss": 1.8249, + "step": 5429 + }, + { + "epoch": 0.30265871467588207, + "grad_norm": 0.5574936270713806, + "learning_rate": 8.055303086563095e-05, + "loss": 1.8337, + "step": 5430 + }, + { + "epoch": 0.3027144529290452, + "grad_norm": 0.594794511795044, + "learning_rate": 8.054598193192106e-05, + "loss": 2.0531, + "step": 5431 + }, + { + "epoch": 0.3027701911822083, + "grad_norm": 0.509722888469696, + "learning_rate": 8.053893202947074e-05, + "loss": 1.6712, + "step": 5432 + }, + { + "epoch": 0.3028259294353715, + "grad_norm": 0.5056367516517639, + "learning_rate": 8.053188115850354e-05, + "loss": 1.5738, + "step": 5433 + }, + { + "epoch": 0.30288166768853464, + "grad_norm": 0.5353802442550659, + "learning_rate": 8.052482931924308e-05, + "loss": 1.8257, + "step": 5434 + }, + { + "epoch": 0.30293740594169777, + "grad_norm": 0.535033106803894, + "learning_rate": 8.051777651191299e-05, + "loss": 1.7261, + "step": 5435 + }, + { + "epoch": 0.30299314419486095, + "grad_norm": 0.5537331700325012, + "learning_rate": 8.051072273673698e-05, + "loss": 1.7634, + "step": 5436 + }, + { + "epoch": 0.3030488824480241, + "grad_norm": 0.538147509098053, + "learning_rate": 8.050366799393874e-05, + "loss": 1.5592, + "step": 5437 + }, + { + "epoch": 0.3031046207011872, + "grad_norm": 0.5110997557640076, + "learning_rate": 8.049661228374199e-05, + "loss": 1.7104, + "step": 5438 + }, + { + "epoch": 0.3031603589543504, + "grad_norm": 0.5138676166534424, + "learning_rate": 8.04895556063705e-05, + "loss": 1.7344, + "step": 5439 + }, + { + "epoch": 0.3032160972075135, + "grad_norm": 0.5240350961685181, + "learning_rate": 8.048249796204808e-05, + "loss": 1.6345, + "step": 5440 + }, + { + "epoch": 0.30327183546067665, + "grad_norm": 0.5258268713951111, + "learning_rate": 8.047543935099855e-05, + "loss": 1.542, + "step": 5441 + }, + { + "epoch": 0.3033275737138398, + "grad_norm": 0.5549874901771545, + "learning_rate": 8.046837977344577e-05, + "loss": 1.8106, + "step": 5442 + }, + { + "epoch": 0.30338331196700297, + "grad_norm": 0.5787036418914795, + "learning_rate": 8.046131922961362e-05, + "loss": 1.8995, + "step": 5443 + }, + { + "epoch": 0.3034390502201661, + "grad_norm": 0.5319430828094482, + "learning_rate": 8.045425771972603e-05, + "loss": 1.471, + "step": 5444 + }, + { + "epoch": 0.3034947884733292, + "grad_norm": 0.5467014312744141, + "learning_rate": 8.044719524400694e-05, + "loss": 1.6613, + "step": 5445 + }, + { + "epoch": 0.3035505267264924, + "grad_norm": 0.5461364388465881, + "learning_rate": 8.044013180268034e-05, + "loss": 1.7442, + "step": 5446 + }, + { + "epoch": 0.30360626497965554, + "grad_norm": 0.5711673498153687, + "learning_rate": 8.043306739597024e-05, + "loss": 1.7848, + "step": 5447 + }, + { + "epoch": 0.30366200323281867, + "grad_norm": 0.5382382273674011, + "learning_rate": 8.042600202410066e-05, + "loss": 1.5744, + "step": 5448 + }, + { + "epoch": 0.30371774148598185, + "grad_norm": 0.5482212901115417, + "learning_rate": 8.041893568729573e-05, + "loss": 1.6689, + "step": 5449 + }, + { + "epoch": 0.303773479739145, + "grad_norm": 0.5345839262008667, + "learning_rate": 8.041186838577949e-05, + "loss": 1.6285, + "step": 5450 + }, + { + "epoch": 0.3038292179923081, + "grad_norm": 0.5510614514350891, + "learning_rate": 8.04048001197761e-05, + "loss": 1.5176, + "step": 5451 + }, + { + "epoch": 0.3038849562454713, + "grad_norm": 0.5475590825080872, + "learning_rate": 8.039773088950973e-05, + "loss": 1.6778, + "step": 5452 + }, + { + "epoch": 0.3039406944986344, + "grad_norm": 0.5662024021148682, + "learning_rate": 8.039066069520455e-05, + "loss": 1.9253, + "step": 5453 + }, + { + "epoch": 0.30399643275179755, + "grad_norm": 0.6412192583084106, + "learning_rate": 8.038358953708482e-05, + "loss": 1.8921, + "step": 5454 + }, + { + "epoch": 0.3040521710049607, + "grad_norm": 0.5427385568618774, + "learning_rate": 8.037651741537478e-05, + "loss": 1.6157, + "step": 5455 + }, + { + "epoch": 0.30410790925812387, + "grad_norm": 0.5492942333221436, + "learning_rate": 8.03694443302987e-05, + "loss": 1.6204, + "step": 5456 + }, + { + "epoch": 0.304163647511287, + "grad_norm": 0.5571532249450684, + "learning_rate": 8.036237028208092e-05, + "loss": 1.6984, + "step": 5457 + }, + { + "epoch": 0.3042193857644501, + "grad_norm": 0.5320706963539124, + "learning_rate": 8.035529527094578e-05, + "loss": 1.5733, + "step": 5458 + }, + { + "epoch": 0.3042751240176133, + "grad_norm": 0.5525981187820435, + "learning_rate": 8.034821929711767e-05, + "loss": 1.6158, + "step": 5459 + }, + { + "epoch": 0.30433086227077644, + "grad_norm": 0.5780904293060303, + "learning_rate": 8.034114236082098e-05, + "loss": 1.8269, + "step": 5460 + }, + { + "epoch": 0.30438660052393957, + "grad_norm": 0.5405531525611877, + "learning_rate": 8.033406446228014e-05, + "loss": 1.8742, + "step": 5461 + }, + { + "epoch": 0.30444233877710275, + "grad_norm": 0.5742613077163696, + "learning_rate": 8.032698560171964e-05, + "loss": 1.9496, + "step": 5462 + }, + { + "epoch": 0.3044980770302659, + "grad_norm": 0.49316903948783875, + "learning_rate": 8.031990577936398e-05, + "loss": 1.5899, + "step": 5463 + }, + { + "epoch": 0.304553815283429, + "grad_norm": 0.5170844197273254, + "learning_rate": 8.031282499543769e-05, + "loss": 1.6575, + "step": 5464 + }, + { + "epoch": 0.30460955353659214, + "grad_norm": 0.5051673650741577, + "learning_rate": 8.030574325016532e-05, + "loss": 1.5878, + "step": 5465 + }, + { + "epoch": 0.3046652917897553, + "grad_norm": 0.493794709444046, + "learning_rate": 8.029866054377148e-05, + "loss": 1.5681, + "step": 5466 + }, + { + "epoch": 0.30472103004291845, + "grad_norm": 0.5372213125228882, + "learning_rate": 8.029157687648077e-05, + "loss": 1.6819, + "step": 5467 + }, + { + "epoch": 0.3047767682960816, + "grad_norm": 0.559104323387146, + "learning_rate": 8.028449224851785e-05, + "loss": 1.8688, + "step": 5468 + }, + { + "epoch": 0.30483250654924476, + "grad_norm": 0.558225691318512, + "learning_rate": 8.027740666010741e-05, + "loss": 1.7629, + "step": 5469 + }, + { + "epoch": 0.3048882448024079, + "grad_norm": 0.511577844619751, + "learning_rate": 8.027032011147417e-05, + "loss": 1.594, + "step": 5470 + }, + { + "epoch": 0.304943983055571, + "grad_norm": 0.5308223962783813, + "learning_rate": 8.026323260284286e-05, + "loss": 1.6677, + "step": 5471 + }, + { + "epoch": 0.3049997213087342, + "grad_norm": 0.5670995712280273, + "learning_rate": 8.025614413443824e-05, + "loss": 1.5382, + "step": 5472 + }, + { + "epoch": 0.30505545956189734, + "grad_norm": 0.553377091884613, + "learning_rate": 8.024905470648516e-05, + "loss": 1.59, + "step": 5473 + }, + { + "epoch": 0.30511119781506046, + "grad_norm": 0.5147939324378967, + "learning_rate": 8.024196431920841e-05, + "loss": 1.6797, + "step": 5474 + }, + { + "epoch": 0.30516693606822365, + "grad_norm": 0.5732524394989014, + "learning_rate": 8.023487297283289e-05, + "loss": 1.7703, + "step": 5475 + }, + { + "epoch": 0.3052226743213868, + "grad_norm": 0.5088878870010376, + "learning_rate": 8.022778066758348e-05, + "loss": 1.5239, + "step": 5476 + }, + { + "epoch": 0.3052784125745499, + "grad_norm": 0.5896703600883484, + "learning_rate": 8.02206874036851e-05, + "loss": 1.8356, + "step": 5477 + }, + { + "epoch": 0.30533415082771304, + "grad_norm": 0.5752948522567749, + "learning_rate": 8.021359318136273e-05, + "loss": 1.8527, + "step": 5478 + }, + { + "epoch": 0.3053898890808762, + "grad_norm": 0.5507591366767883, + "learning_rate": 8.020649800084133e-05, + "loss": 1.7682, + "step": 5479 + }, + { + "epoch": 0.30544562733403935, + "grad_norm": 0.5891523957252502, + "learning_rate": 8.019940186234591e-05, + "loss": 1.7112, + "step": 5480 + }, + { + "epoch": 0.3055013655872025, + "grad_norm": 0.5745503306388855, + "learning_rate": 8.019230476610155e-05, + "loss": 1.7824, + "step": 5481 + }, + { + "epoch": 0.30555710384036566, + "grad_norm": 0.6154142022132874, + "learning_rate": 8.018520671233333e-05, + "loss": 1.8217, + "step": 5482 + }, + { + "epoch": 0.3056128420935288, + "grad_norm": 0.5336470603942871, + "learning_rate": 8.017810770126633e-05, + "loss": 1.572, + "step": 5483 + }, + { + "epoch": 0.3056685803466919, + "grad_norm": 0.6083388328552246, + "learning_rate": 8.017100773312572e-05, + "loss": 1.8889, + "step": 5484 + }, + { + "epoch": 0.3057243185998551, + "grad_norm": 0.5398688912391663, + "learning_rate": 8.016390680813664e-05, + "loss": 1.8318, + "step": 5485 + }, + { + "epoch": 0.30578005685301823, + "grad_norm": 0.5180187225341797, + "learning_rate": 8.015680492652432e-05, + "loss": 1.4898, + "step": 5486 + }, + { + "epoch": 0.30583579510618136, + "grad_norm": 0.5112860798835754, + "learning_rate": 8.014970208851395e-05, + "loss": 1.622, + "step": 5487 + }, + { + "epoch": 0.3058915333593445, + "grad_norm": 0.5450818538665771, + "learning_rate": 8.014259829433082e-05, + "loss": 1.5932, + "step": 5488 + }, + { + "epoch": 0.3059472716125077, + "grad_norm": 0.5598384737968445, + "learning_rate": 8.013549354420022e-05, + "loss": 1.7663, + "step": 5489 + }, + { + "epoch": 0.3060030098656708, + "grad_norm": 0.574329137802124, + "learning_rate": 8.012838783834749e-05, + "loss": 1.7812, + "step": 5490 + }, + { + "epoch": 0.30605874811883393, + "grad_norm": 0.5636276006698608, + "learning_rate": 8.012128117699793e-05, + "loss": 1.8031, + "step": 5491 + }, + { + "epoch": 0.3061144863719971, + "grad_norm": 0.5229976177215576, + "learning_rate": 8.011417356037697e-05, + "loss": 1.7483, + "step": 5492 + }, + { + "epoch": 0.30617022462516025, + "grad_norm": 0.5263829231262207, + "learning_rate": 8.010706498870997e-05, + "loss": 1.6449, + "step": 5493 + }, + { + "epoch": 0.3062259628783234, + "grad_norm": 0.5461215376853943, + "learning_rate": 8.009995546222242e-05, + "loss": 1.5837, + "step": 5494 + }, + { + "epoch": 0.30628170113148656, + "grad_norm": 0.541483998298645, + "learning_rate": 8.009284498113979e-05, + "loss": 1.7239, + "step": 5495 + }, + { + "epoch": 0.3063374393846497, + "grad_norm": 0.540389358997345, + "learning_rate": 8.008573354568756e-05, + "loss": 1.6928, + "step": 5496 + }, + { + "epoch": 0.3063931776378128, + "grad_norm": 0.550672709941864, + "learning_rate": 8.007862115609129e-05, + "loss": 1.7299, + "step": 5497 + }, + { + "epoch": 0.306448915890976, + "grad_norm": 0.532590389251709, + "learning_rate": 8.007150781257651e-05, + "loss": 1.6299, + "step": 5498 + }, + { + "epoch": 0.30650465414413913, + "grad_norm": 0.5489155650138855, + "learning_rate": 8.006439351536883e-05, + "loss": 1.6814, + "step": 5499 + }, + { + "epoch": 0.30656039239730226, + "grad_norm": 0.5809459090232849, + "learning_rate": 8.005727826469389e-05, + "loss": 1.7617, + "step": 5500 + }, + { + "epoch": 0.3066161306504654, + "grad_norm": 0.5688945055007935, + "learning_rate": 8.005016206077731e-05, + "loss": 1.913, + "step": 5501 + }, + { + "epoch": 0.3066718689036286, + "grad_norm": 0.5430113673210144, + "learning_rate": 8.004304490384482e-05, + "loss": 1.6782, + "step": 5502 + }, + { + "epoch": 0.3067276071567917, + "grad_norm": 0.5550969243049622, + "learning_rate": 8.003592679412208e-05, + "loss": 1.4965, + "step": 5503 + }, + { + "epoch": 0.30678334540995483, + "grad_norm": 0.5173535943031311, + "learning_rate": 8.00288077318349e-05, + "loss": 1.4724, + "step": 5504 + }, + { + "epoch": 0.306839083663118, + "grad_norm": 0.5464041233062744, + "learning_rate": 8.0021687717209e-05, + "loss": 1.6722, + "step": 5505 + }, + { + "epoch": 0.30689482191628115, + "grad_norm": 0.5555015206336975, + "learning_rate": 8.001456675047019e-05, + "loss": 1.8088, + "step": 5506 + }, + { + "epoch": 0.3069505601694443, + "grad_norm": 0.5883082747459412, + "learning_rate": 8.000744483184433e-05, + "loss": 1.5916, + "step": 5507 + }, + { + "epoch": 0.30700629842260746, + "grad_norm": 0.5937238931655884, + "learning_rate": 8.000032196155726e-05, + "loss": 1.8253, + "step": 5508 + }, + { + "epoch": 0.3070620366757706, + "grad_norm": 0.5752248764038086, + "learning_rate": 7.999319813983492e-05, + "loss": 1.7183, + "step": 5509 + }, + { + "epoch": 0.3071177749289337, + "grad_norm": 0.5927345156669617, + "learning_rate": 7.99860733669032e-05, + "loss": 1.8415, + "step": 5510 + }, + { + "epoch": 0.30717351318209685, + "grad_norm": 0.597845196723938, + "learning_rate": 7.997894764298806e-05, + "loss": 1.8575, + "step": 5511 + }, + { + "epoch": 0.30722925143526003, + "grad_norm": 0.5484491586685181, + "learning_rate": 7.997182096831548e-05, + "loss": 1.6398, + "step": 5512 + }, + { + "epoch": 0.30728498968842316, + "grad_norm": 0.5977261662483215, + "learning_rate": 7.99646933431115e-05, + "loss": 2.0446, + "step": 5513 + }, + { + "epoch": 0.3073407279415863, + "grad_norm": 0.5897913575172424, + "learning_rate": 7.995756476760214e-05, + "loss": 1.7335, + "step": 5514 + }, + { + "epoch": 0.3073964661947495, + "grad_norm": 0.5303786396980286, + "learning_rate": 7.995043524201351e-05, + "loss": 1.6374, + "step": 5515 + }, + { + "epoch": 0.3074522044479126, + "grad_norm": 0.6054732799530029, + "learning_rate": 7.994330476657168e-05, + "loss": 1.8542, + "step": 5516 + }, + { + "epoch": 0.30750794270107573, + "grad_norm": 0.5825492739677429, + "learning_rate": 7.993617334150282e-05, + "loss": 1.74, + "step": 5517 + }, + { + "epoch": 0.3075636809542389, + "grad_norm": 0.5496809482574463, + "learning_rate": 7.992904096703307e-05, + "loss": 1.6844, + "step": 5518 + }, + { + "epoch": 0.30761941920740205, + "grad_norm": 0.5574871301651001, + "learning_rate": 7.992190764338864e-05, + "loss": 1.7397, + "step": 5519 + }, + { + "epoch": 0.3076751574605652, + "grad_norm": 0.5654902458190918, + "learning_rate": 7.991477337079576e-05, + "loss": 1.7361, + "step": 5520 + }, + { + "epoch": 0.30773089571372836, + "grad_norm": 0.5748382806777954, + "learning_rate": 7.990763814948068e-05, + "loss": 1.8819, + "step": 5521 + }, + { + "epoch": 0.3077866339668915, + "grad_norm": 0.5120726823806763, + "learning_rate": 7.99005019796697e-05, + "loss": 1.5405, + "step": 5522 + }, + { + "epoch": 0.3078423722200546, + "grad_norm": 0.5529910326004028, + "learning_rate": 7.989336486158912e-05, + "loss": 1.6712, + "step": 5523 + }, + { + "epoch": 0.30789811047321775, + "grad_norm": 0.5775067210197449, + "learning_rate": 7.988622679546529e-05, + "loss": 2.0319, + "step": 5524 + }, + { + "epoch": 0.30795384872638093, + "grad_norm": 0.5432143211364746, + "learning_rate": 7.987908778152462e-05, + "loss": 1.5891, + "step": 5525 + }, + { + "epoch": 0.30800958697954406, + "grad_norm": 0.5764423608779907, + "learning_rate": 7.987194781999345e-05, + "loss": 1.865, + "step": 5526 + }, + { + "epoch": 0.3080653252327072, + "grad_norm": 0.5256220698356628, + "learning_rate": 7.98648069110983e-05, + "loss": 1.5777, + "step": 5527 + }, + { + "epoch": 0.3081210634858704, + "grad_norm": 0.5597642064094543, + "learning_rate": 7.985766505506559e-05, + "loss": 1.8957, + "step": 5528 + }, + { + "epoch": 0.3081768017390335, + "grad_norm": 0.5411173701286316, + "learning_rate": 7.985052225212181e-05, + "loss": 1.7575, + "step": 5529 + }, + { + "epoch": 0.30823253999219663, + "grad_norm": 0.5252230763435364, + "learning_rate": 7.984337850249352e-05, + "loss": 1.7377, + "step": 5530 + }, + { + "epoch": 0.3082882782453598, + "grad_norm": 0.5985997915267944, + "learning_rate": 7.983623380640729e-05, + "loss": 1.7941, + "step": 5531 + }, + { + "epoch": 0.30834401649852294, + "grad_norm": 0.5696808099746704, + "learning_rate": 7.982908816408963e-05, + "loss": 1.8425, + "step": 5532 + }, + { + "epoch": 0.3083997547516861, + "grad_norm": 0.5184767246246338, + "learning_rate": 7.982194157576723e-05, + "loss": 1.6765, + "step": 5533 + }, + { + "epoch": 0.3084554930048492, + "grad_norm": 0.5509563088417053, + "learning_rate": 7.981479404166672e-05, + "loss": 1.8554, + "step": 5534 + }, + { + "epoch": 0.3085112312580124, + "grad_norm": 0.5477381944656372, + "learning_rate": 7.980764556201478e-05, + "loss": 1.6513, + "step": 5535 + }, + { + "epoch": 0.3085669695111755, + "grad_norm": 0.5575202107429504, + "learning_rate": 7.980049613703811e-05, + "loss": 1.7565, + "step": 5536 + }, + { + "epoch": 0.30862270776433864, + "grad_norm": 0.578071117401123, + "learning_rate": 7.979334576696344e-05, + "loss": 1.6711, + "step": 5537 + }, + { + "epoch": 0.30867844601750183, + "grad_norm": 0.5293973684310913, + "learning_rate": 7.978619445201756e-05, + "loss": 1.8865, + "step": 5538 + }, + { + "epoch": 0.30873418427066496, + "grad_norm": 0.5793629288673401, + "learning_rate": 7.977904219242724e-05, + "loss": 1.9338, + "step": 5539 + }, + { + "epoch": 0.3087899225238281, + "grad_norm": 0.5701123476028442, + "learning_rate": 7.977188898841936e-05, + "loss": 1.778, + "step": 5540 + }, + { + "epoch": 0.30884566077699127, + "grad_norm": 0.5166484117507935, + "learning_rate": 7.976473484022071e-05, + "loss": 1.6528, + "step": 5541 + }, + { + "epoch": 0.3089013990301544, + "grad_norm": 0.5501734018325806, + "learning_rate": 7.975757974805824e-05, + "loss": 1.6939, + "step": 5542 + }, + { + "epoch": 0.30895713728331753, + "grad_norm": 0.5325387716293335, + "learning_rate": 7.975042371215881e-05, + "loss": 1.5085, + "step": 5543 + }, + { + "epoch": 0.3090128755364807, + "grad_norm": 0.5717397928237915, + "learning_rate": 7.974326673274943e-05, + "loss": 1.7745, + "step": 5544 + }, + { + "epoch": 0.30906861378964384, + "grad_norm": 0.5344177484512329, + "learning_rate": 7.973610881005702e-05, + "loss": 1.6344, + "step": 5545 + }, + { + "epoch": 0.30912435204280697, + "grad_norm": 0.5647115707397461, + "learning_rate": 7.972894994430862e-05, + "loss": 1.8173, + "step": 5546 + }, + { + "epoch": 0.3091800902959701, + "grad_norm": 0.5356699824333191, + "learning_rate": 7.972179013573125e-05, + "loss": 1.6173, + "step": 5547 + }, + { + "epoch": 0.3092358285491333, + "grad_norm": 0.5651494860649109, + "learning_rate": 7.971462938455199e-05, + "loss": 1.5781, + "step": 5548 + }, + { + "epoch": 0.3092915668022964, + "grad_norm": 0.5726121664047241, + "learning_rate": 7.970746769099795e-05, + "loss": 1.5528, + "step": 5549 + }, + { + "epoch": 0.30934730505545954, + "grad_norm": 0.6116449236869812, + "learning_rate": 7.970030505529624e-05, + "loss": 1.9145, + "step": 5550 + }, + { + "epoch": 0.3094030433086227, + "grad_norm": 0.5738492012023926, + "learning_rate": 7.969314147767399e-05, + "loss": 1.7875, + "step": 5551 + }, + { + "epoch": 0.30945878156178586, + "grad_norm": 0.5894981026649475, + "learning_rate": 7.968597695835844e-05, + "loss": 1.5879, + "step": 5552 + }, + { + "epoch": 0.309514519814949, + "grad_norm": 0.5126131772994995, + "learning_rate": 7.967881149757678e-05, + "loss": 1.6178, + "step": 5553 + }, + { + "epoch": 0.30957025806811217, + "grad_norm": 0.5616469979286194, + "learning_rate": 7.967164509555624e-05, + "loss": 1.7701, + "step": 5554 + }, + { + "epoch": 0.3096259963212753, + "grad_norm": 0.5041468739509583, + "learning_rate": 7.966447775252415e-05, + "loss": 1.5632, + "step": 5555 + }, + { + "epoch": 0.3096817345744384, + "grad_norm": 0.5093483328819275, + "learning_rate": 7.965730946870775e-05, + "loss": 1.7161, + "step": 5556 + }, + { + "epoch": 0.30973747282760156, + "grad_norm": 0.6104699373245239, + "learning_rate": 7.965014024433443e-05, + "loss": 1.7959, + "step": 5557 + }, + { + "epoch": 0.30979321108076474, + "grad_norm": 0.5576456189155579, + "learning_rate": 7.964297007963151e-05, + "loss": 1.8631, + "step": 5558 + }, + { + "epoch": 0.30984894933392787, + "grad_norm": 0.5558076500892639, + "learning_rate": 7.963579897482642e-05, + "loss": 1.7503, + "step": 5559 + }, + { + "epoch": 0.309904687587091, + "grad_norm": 0.5433835983276367, + "learning_rate": 7.96286269301466e-05, + "loss": 1.6935, + "step": 5560 + }, + { + "epoch": 0.3099604258402542, + "grad_norm": 0.5542037487030029, + "learning_rate": 7.962145394581944e-05, + "loss": 1.7342, + "step": 5561 + }, + { + "epoch": 0.3100161640934173, + "grad_norm": 0.5680848360061646, + "learning_rate": 7.961428002207249e-05, + "loss": 1.6875, + "step": 5562 + }, + { + "epoch": 0.31007190234658044, + "grad_norm": 0.5349116921424866, + "learning_rate": 7.960710515913323e-05, + "loss": 1.6991, + "step": 5563 + }, + { + "epoch": 0.3101276405997436, + "grad_norm": 0.5729091167449951, + "learning_rate": 7.959992935722924e-05, + "loss": 1.8622, + "step": 5564 + }, + { + "epoch": 0.31018337885290675, + "grad_norm": 0.558594286441803, + "learning_rate": 7.959275261658804e-05, + "loss": 1.8244, + "step": 5565 + }, + { + "epoch": 0.3102391171060699, + "grad_norm": 0.5720626711845398, + "learning_rate": 7.958557493743728e-05, + "loss": 1.796, + "step": 5566 + }, + { + "epoch": 0.31029485535923307, + "grad_norm": 0.7089996933937073, + "learning_rate": 7.957839632000457e-05, + "loss": 2.2928, + "step": 5567 + }, + { + "epoch": 0.3103505936123962, + "grad_norm": 0.51308274269104, + "learning_rate": 7.957121676451759e-05, + "loss": 1.5466, + "step": 5568 + }, + { + "epoch": 0.3104063318655593, + "grad_norm": 0.5389419794082642, + "learning_rate": 7.956403627120403e-05, + "loss": 1.7847, + "step": 5569 + }, + { + "epoch": 0.31046207011872246, + "grad_norm": 0.5362538695335388, + "learning_rate": 7.95568548402916e-05, + "loss": 1.752, + "step": 5570 + }, + { + "epoch": 0.31051780837188564, + "grad_norm": 0.5565882921218872, + "learning_rate": 7.954967247200806e-05, + "loss": 1.7436, + "step": 5571 + }, + { + "epoch": 0.31057354662504877, + "grad_norm": 0.5700491070747375, + "learning_rate": 7.95424891665812e-05, + "loss": 1.3893, + "step": 5572 + }, + { + "epoch": 0.3106292848782119, + "grad_norm": 0.5634492635726929, + "learning_rate": 7.953530492423884e-05, + "loss": 1.5228, + "step": 5573 + }, + { + "epoch": 0.3106850231313751, + "grad_norm": 0.5454849004745483, + "learning_rate": 7.95281197452088e-05, + "loss": 1.7454, + "step": 5574 + }, + { + "epoch": 0.3107407613845382, + "grad_norm": 0.5382822751998901, + "learning_rate": 7.952093362971897e-05, + "loss": 1.6264, + "step": 5575 + }, + { + "epoch": 0.31079649963770134, + "grad_norm": 0.5650563836097717, + "learning_rate": 7.951374657799724e-05, + "loss": 1.4175, + "step": 5576 + }, + { + "epoch": 0.3108522378908645, + "grad_norm": 0.570775032043457, + "learning_rate": 7.950655859027154e-05, + "loss": 1.6686, + "step": 5577 + }, + { + "epoch": 0.31090797614402765, + "grad_norm": 0.5498449206352234, + "learning_rate": 7.949936966676984e-05, + "loss": 1.7351, + "step": 5578 + }, + { + "epoch": 0.3109637143971908, + "grad_norm": 0.6256487369537354, + "learning_rate": 7.949217980772012e-05, + "loss": 1.9914, + "step": 5579 + }, + { + "epoch": 0.3110194526503539, + "grad_norm": 0.6062150001525879, + "learning_rate": 7.948498901335042e-05, + "loss": 1.9362, + "step": 5580 + }, + { + "epoch": 0.3110751909035171, + "grad_norm": 0.5351932048797607, + "learning_rate": 7.947779728388878e-05, + "loss": 1.6922, + "step": 5581 + }, + { + "epoch": 0.3111309291566802, + "grad_norm": 0.6049745678901672, + "learning_rate": 7.947060461956329e-05, + "loss": 2.146, + "step": 5582 + }, + { + "epoch": 0.31118666740984335, + "grad_norm": 0.5465789437294006, + "learning_rate": 7.946341102060202e-05, + "loss": 1.7858, + "step": 5583 + }, + { + "epoch": 0.31124240566300654, + "grad_norm": 0.5127213597297668, + "learning_rate": 7.945621648723313e-05, + "loss": 1.6921, + "step": 5584 + }, + { + "epoch": 0.31129814391616967, + "grad_norm": 0.5576222538948059, + "learning_rate": 7.944902101968482e-05, + "loss": 1.7601, + "step": 5585 + }, + { + "epoch": 0.3113538821693328, + "grad_norm": 0.5145538449287415, + "learning_rate": 7.944182461818525e-05, + "loss": 1.6861, + "step": 5586 + }, + { + "epoch": 0.311409620422496, + "grad_norm": 0.5060127973556519, + "learning_rate": 7.943462728296266e-05, + "loss": 1.4954, + "step": 5587 + }, + { + "epoch": 0.3114653586756591, + "grad_norm": 0.5226243138313293, + "learning_rate": 7.942742901424531e-05, + "loss": 1.7086, + "step": 5588 + }, + { + "epoch": 0.31152109692882224, + "grad_norm": 0.5711196064949036, + "learning_rate": 7.942022981226149e-05, + "loss": 1.7788, + "step": 5589 + }, + { + "epoch": 0.3115768351819854, + "grad_norm": 0.511813759803772, + "learning_rate": 7.941302967723951e-05, + "loss": 1.3316, + "step": 5590 + }, + { + "epoch": 0.31163257343514855, + "grad_norm": 0.5399052500724792, + "learning_rate": 7.940582860940771e-05, + "loss": 1.6683, + "step": 5591 + }, + { + "epoch": 0.3116883116883117, + "grad_norm": 0.5305676460266113, + "learning_rate": 7.939862660899448e-05, + "loss": 1.7344, + "step": 5592 + }, + { + "epoch": 0.3117440499414748, + "grad_norm": 0.5254833698272705, + "learning_rate": 7.939142367622823e-05, + "loss": 1.5524, + "step": 5593 + }, + { + "epoch": 0.311799788194638, + "grad_norm": 0.5858429074287415, + "learning_rate": 7.938421981133738e-05, + "loss": 1.7415, + "step": 5594 + }, + { + "epoch": 0.3118555264478011, + "grad_norm": 0.6082313656806946, + "learning_rate": 7.937701501455039e-05, + "loss": 1.5333, + "step": 5595 + }, + { + "epoch": 0.31191126470096425, + "grad_norm": 0.5757048726081848, + "learning_rate": 7.936980928609577e-05, + "loss": 1.8723, + "step": 5596 + }, + { + "epoch": 0.31196700295412744, + "grad_norm": 0.6089504957199097, + "learning_rate": 7.936260262620205e-05, + "loss": 1.8915, + "step": 5597 + }, + { + "epoch": 0.31202274120729057, + "grad_norm": 0.588326096534729, + "learning_rate": 7.935539503509775e-05, + "loss": 1.8353, + "step": 5598 + }, + { + "epoch": 0.3120784794604537, + "grad_norm": 0.5930234789848328, + "learning_rate": 7.934818651301148e-05, + "loss": 1.832, + "step": 5599 + }, + { + "epoch": 0.3121342177136169, + "grad_norm": 0.5394973158836365, + "learning_rate": 7.934097706017185e-05, + "loss": 1.7301, + "step": 5600 + }, + { + "epoch": 0.31218995596678, + "grad_norm": 0.5147609114646912, + "learning_rate": 7.93337666768075e-05, + "loss": 1.7095, + "step": 5601 + }, + { + "epoch": 0.31224569421994314, + "grad_norm": 0.5531661510467529, + "learning_rate": 7.932655536314708e-05, + "loss": 1.6071, + "step": 5602 + }, + { + "epoch": 0.31230143247310627, + "grad_norm": 0.5388891696929932, + "learning_rate": 7.931934311941933e-05, + "loss": 1.5759, + "step": 5603 + }, + { + "epoch": 0.31235717072626945, + "grad_norm": 0.5236558318138123, + "learning_rate": 7.931212994585294e-05, + "loss": 1.5492, + "step": 5604 + }, + { + "epoch": 0.3124129089794326, + "grad_norm": 0.6088682413101196, + "learning_rate": 7.93049158426767e-05, + "loss": 1.7768, + "step": 5605 + }, + { + "epoch": 0.3124686472325957, + "grad_norm": 0.5254512429237366, + "learning_rate": 7.92977008101194e-05, + "loss": 1.6003, + "step": 5606 + }, + { + "epoch": 0.3125243854857589, + "grad_norm": 0.5747987031936646, + "learning_rate": 7.929048484840984e-05, + "loss": 1.7666, + "step": 5607 + }, + { + "epoch": 0.312580123738922, + "grad_norm": 0.5682463645935059, + "learning_rate": 7.928326795777688e-05, + "loss": 1.7861, + "step": 5608 + }, + { + "epoch": 0.31263586199208515, + "grad_norm": 0.5339683890342712, + "learning_rate": 7.927605013844939e-05, + "loss": 1.614, + "step": 5609 + }, + { + "epoch": 0.31269160024524834, + "grad_norm": 0.5913909673690796, + "learning_rate": 7.926883139065627e-05, + "loss": 1.7949, + "step": 5610 + }, + { + "epoch": 0.31274733849841146, + "grad_norm": 0.5656397342681885, + "learning_rate": 7.926161171462648e-05, + "loss": 1.8147, + "step": 5611 + }, + { + "epoch": 0.3128030767515746, + "grad_norm": 0.5707045197486877, + "learning_rate": 7.925439111058897e-05, + "loss": 1.7117, + "step": 5612 + }, + { + "epoch": 0.3128588150047378, + "grad_norm": 0.5682026743888855, + "learning_rate": 7.924716957877275e-05, + "loss": 1.6873, + "step": 5613 + }, + { + "epoch": 0.3129145532579009, + "grad_norm": 0.6239393353462219, + "learning_rate": 7.92399471194068e-05, + "loss": 2.136, + "step": 5614 + }, + { + "epoch": 0.31297029151106404, + "grad_norm": 0.5405849814414978, + "learning_rate": 7.923272373272024e-05, + "loss": 1.7105, + "step": 5615 + }, + { + "epoch": 0.31302602976422716, + "grad_norm": 0.5093609094619751, + "learning_rate": 7.922549941894212e-05, + "loss": 1.7117, + "step": 5616 + }, + { + "epoch": 0.31308176801739035, + "grad_norm": 0.5615028738975525, + "learning_rate": 7.921827417830155e-05, + "loss": 1.7621, + "step": 5617 + }, + { + "epoch": 0.3131375062705535, + "grad_norm": 0.5841954946517944, + "learning_rate": 7.921104801102766e-05, + "loss": 1.7155, + "step": 5618 + }, + { + "epoch": 0.3131932445237166, + "grad_norm": 0.5684096217155457, + "learning_rate": 7.920382091734966e-05, + "loss": 1.5615, + "step": 5619 + }, + { + "epoch": 0.3132489827768798, + "grad_norm": 0.5647116303443909, + "learning_rate": 7.919659289749673e-05, + "loss": 1.6964, + "step": 5620 + }, + { + "epoch": 0.3133047210300429, + "grad_norm": 0.5479496121406555, + "learning_rate": 7.918936395169809e-05, + "loss": 1.6701, + "step": 5621 + }, + { + "epoch": 0.31336045928320605, + "grad_norm": 0.5465035438537598, + "learning_rate": 7.918213408018302e-05, + "loss": 1.8372, + "step": 5622 + }, + { + "epoch": 0.31341619753636923, + "grad_norm": 0.5440232157707214, + "learning_rate": 7.91749032831808e-05, + "loss": 1.6181, + "step": 5623 + }, + { + "epoch": 0.31347193578953236, + "grad_norm": 0.5956066846847534, + "learning_rate": 7.916767156092073e-05, + "loss": 1.8816, + "step": 5624 + }, + { + "epoch": 0.3135276740426955, + "grad_norm": 0.4970141053199768, + "learning_rate": 7.916043891363221e-05, + "loss": 1.331, + "step": 5625 + }, + { + "epoch": 0.3135834122958586, + "grad_norm": 0.5314142107963562, + "learning_rate": 7.915320534154457e-05, + "loss": 1.7526, + "step": 5626 + }, + { + "epoch": 0.3136391505490218, + "grad_norm": 0.5765748620033264, + "learning_rate": 7.914597084488723e-05, + "loss": 1.7204, + "step": 5627 + }, + { + "epoch": 0.31369488880218493, + "grad_norm": 0.5975958704948425, + "learning_rate": 7.913873542388963e-05, + "loss": 1.8833, + "step": 5628 + }, + { + "epoch": 0.31375062705534806, + "grad_norm": 0.5788082480430603, + "learning_rate": 7.913149907878123e-05, + "loss": 1.9049, + "step": 5629 + }, + { + "epoch": 0.31380636530851125, + "grad_norm": 0.6019555330276489, + "learning_rate": 7.912426180979152e-05, + "loss": 2.005, + "step": 5630 + }, + { + "epoch": 0.3138621035616744, + "grad_norm": 0.5763736963272095, + "learning_rate": 7.911702361715006e-05, + "loss": 1.7476, + "step": 5631 + }, + { + "epoch": 0.3139178418148375, + "grad_norm": 0.5758547782897949, + "learning_rate": 7.910978450108634e-05, + "loss": 1.69, + "step": 5632 + }, + { + "epoch": 0.3139735800680007, + "grad_norm": 0.5762767791748047, + "learning_rate": 7.910254446183e-05, + "loss": 1.7354, + "step": 5633 + }, + { + "epoch": 0.3140293183211638, + "grad_norm": 0.5475091338157654, + "learning_rate": 7.909530349961062e-05, + "loss": 1.803, + "step": 5634 + }, + { + "epoch": 0.31408505657432695, + "grad_norm": 0.5797522664070129, + "learning_rate": 7.908806161465785e-05, + "loss": 1.8425, + "step": 5635 + }, + { + "epoch": 0.31414079482749013, + "grad_norm": 0.5494913458824158, + "learning_rate": 7.908081880720137e-05, + "loss": 1.7041, + "step": 5636 + }, + { + "epoch": 0.31419653308065326, + "grad_norm": 0.5253703594207764, + "learning_rate": 7.907357507747087e-05, + "loss": 1.5982, + "step": 5637 + }, + { + "epoch": 0.3142522713338164, + "grad_norm": 0.5663535594940186, + "learning_rate": 7.906633042569607e-05, + "loss": 1.6506, + "step": 5638 + }, + { + "epoch": 0.3143080095869795, + "grad_norm": 0.5768305659294128, + "learning_rate": 7.905908485210674e-05, + "loss": 1.675, + "step": 5639 + }, + { + "epoch": 0.3143637478401427, + "grad_norm": 0.5730108022689819, + "learning_rate": 7.905183835693266e-05, + "loss": 1.6702, + "step": 5640 + }, + { + "epoch": 0.31441948609330583, + "grad_norm": 0.5377948880195618, + "learning_rate": 7.904459094040366e-05, + "loss": 1.8156, + "step": 5641 + }, + { + "epoch": 0.31447522434646896, + "grad_norm": 0.5925690531730652, + "learning_rate": 7.903734260274958e-05, + "loss": 1.8198, + "step": 5642 + }, + { + "epoch": 0.31453096259963215, + "grad_norm": 0.5221425294876099, + "learning_rate": 7.903009334420027e-05, + "loss": 1.6291, + "step": 5643 + }, + { + "epoch": 0.3145867008527953, + "grad_norm": 0.5379535555839539, + "learning_rate": 7.902284316498567e-05, + "loss": 1.6026, + "step": 5644 + }, + { + "epoch": 0.3146424391059584, + "grad_norm": 0.5477253198623657, + "learning_rate": 7.901559206533571e-05, + "loss": 1.9096, + "step": 5645 + }, + { + "epoch": 0.3146981773591216, + "grad_norm": 0.6306549310684204, + "learning_rate": 7.900834004548034e-05, + "loss": 1.9637, + "step": 5646 + }, + { + "epoch": 0.3147539156122847, + "grad_norm": 0.5738115906715393, + "learning_rate": 7.900108710564954e-05, + "loss": 1.8217, + "step": 5647 + }, + { + "epoch": 0.31480965386544785, + "grad_norm": 0.5737825036048889, + "learning_rate": 7.899383324607336e-05, + "loss": 1.7018, + "step": 5648 + }, + { + "epoch": 0.314865392118611, + "grad_norm": 0.5575332641601562, + "learning_rate": 7.898657846698183e-05, + "loss": 1.823, + "step": 5649 + }, + { + "epoch": 0.31492113037177416, + "grad_norm": 0.5665508508682251, + "learning_rate": 7.897932276860502e-05, + "loss": 1.8531, + "step": 5650 + }, + { + "epoch": 0.3149768686249373, + "grad_norm": 0.6147223711013794, + "learning_rate": 7.897206615117307e-05, + "loss": 1.8, + "step": 5651 + }, + { + "epoch": 0.3150326068781004, + "grad_norm": 0.5605811476707458, + "learning_rate": 7.89648086149161e-05, + "loss": 1.8554, + "step": 5652 + }, + { + "epoch": 0.3150883451312636, + "grad_norm": 0.5749962329864502, + "learning_rate": 7.895755016006427e-05, + "loss": 1.9814, + "step": 5653 + }, + { + "epoch": 0.31514408338442673, + "grad_norm": 0.6655054688453674, + "learning_rate": 7.895029078684779e-05, + "loss": 1.6895, + "step": 5654 + }, + { + "epoch": 0.31519982163758986, + "grad_norm": 0.5131604671478271, + "learning_rate": 7.894303049549687e-05, + "loss": 1.4731, + "step": 5655 + }, + { + "epoch": 0.31525555989075305, + "grad_norm": 0.5364745855331421, + "learning_rate": 7.893576928624178e-05, + "loss": 1.819, + "step": 5656 + }, + { + "epoch": 0.3153112981439162, + "grad_norm": 0.563586413860321, + "learning_rate": 7.89285071593128e-05, + "loss": 1.6023, + "step": 5657 + }, + { + "epoch": 0.3153670363970793, + "grad_norm": 0.5618447065353394, + "learning_rate": 7.892124411494022e-05, + "loss": 1.5903, + "step": 5658 + }, + { + "epoch": 0.3154227746502425, + "grad_norm": 0.5073031783103943, + "learning_rate": 7.891398015335442e-05, + "loss": 1.646, + "step": 5659 + }, + { + "epoch": 0.3154785129034056, + "grad_norm": 0.5081502795219421, + "learning_rate": 7.890671527478574e-05, + "loss": 1.3751, + "step": 5660 + }, + { + "epoch": 0.31553425115656875, + "grad_norm": 0.524069607257843, + "learning_rate": 7.88994494794646e-05, + "loss": 1.6491, + "step": 5661 + }, + { + "epoch": 0.3155899894097319, + "grad_norm": 0.5874504446983337, + "learning_rate": 7.88921827676214e-05, + "loss": 1.5753, + "step": 5662 + }, + { + "epoch": 0.31564572766289506, + "grad_norm": 0.5709517002105713, + "learning_rate": 7.888491513948661e-05, + "loss": 1.8023, + "step": 5663 + }, + { + "epoch": 0.3157014659160582, + "grad_norm": 0.5294995903968811, + "learning_rate": 7.887764659529073e-05, + "loss": 1.6754, + "step": 5664 + }, + { + "epoch": 0.3157572041692213, + "grad_norm": 0.5117160677909851, + "learning_rate": 7.887037713526428e-05, + "loss": 1.6262, + "step": 5665 + }, + { + "epoch": 0.3158129424223845, + "grad_norm": 0.49994394183158875, + "learning_rate": 7.88631067596378e-05, + "loss": 1.5649, + "step": 5666 + }, + { + "epoch": 0.31586868067554763, + "grad_norm": 0.486306756734848, + "learning_rate": 7.885583546864184e-05, + "loss": 1.4968, + "step": 5667 + }, + { + "epoch": 0.31592441892871076, + "grad_norm": 0.5242376327514648, + "learning_rate": 7.884856326250703e-05, + "loss": 1.5559, + "step": 5668 + }, + { + "epoch": 0.31598015718187394, + "grad_norm": 0.5692494511604309, + "learning_rate": 7.884129014146397e-05, + "loss": 1.8384, + "step": 5669 + }, + { + "epoch": 0.3160358954350371, + "grad_norm": 0.5784143209457397, + "learning_rate": 7.883401610574336e-05, + "loss": 1.9506, + "step": 5670 + }, + { + "epoch": 0.3160916336882002, + "grad_norm": 0.5659399032592773, + "learning_rate": 7.882674115557587e-05, + "loss": 1.6864, + "step": 5671 + }, + { + "epoch": 0.31614737194136333, + "grad_norm": 0.6336827278137207, + "learning_rate": 7.881946529119223e-05, + "loss": 1.9635, + "step": 5672 + }, + { + "epoch": 0.3162031101945265, + "grad_norm": 0.5327314734458923, + "learning_rate": 7.881218851282317e-05, + "loss": 1.5806, + "step": 5673 + }, + { + "epoch": 0.31625884844768964, + "grad_norm": 0.5700320601463318, + "learning_rate": 7.880491082069949e-05, + "loss": 1.7419, + "step": 5674 + }, + { + "epoch": 0.3163145867008528, + "grad_norm": 0.569348156452179, + "learning_rate": 7.879763221505197e-05, + "loss": 1.7392, + "step": 5675 + }, + { + "epoch": 0.31637032495401596, + "grad_norm": 0.5255264639854431, + "learning_rate": 7.879035269611146e-05, + "loss": 1.6862, + "step": 5676 + }, + { + "epoch": 0.3164260632071791, + "grad_norm": 0.5734140872955322, + "learning_rate": 7.878307226410882e-05, + "loss": 1.8253, + "step": 5677 + }, + { + "epoch": 0.3164818014603422, + "grad_norm": 0.5915566086769104, + "learning_rate": 7.877579091927496e-05, + "loss": 1.7754, + "step": 5678 + }, + { + "epoch": 0.3165375397135054, + "grad_norm": 0.5272923707962036, + "learning_rate": 7.876850866184077e-05, + "loss": 1.7315, + "step": 5679 + }, + { + "epoch": 0.31659327796666853, + "grad_norm": 0.5072640180587769, + "learning_rate": 7.876122549203723e-05, + "loss": 1.5367, + "step": 5680 + }, + { + "epoch": 0.31664901621983166, + "grad_norm": 0.5453153848648071, + "learning_rate": 7.87539414100953e-05, + "loss": 1.7551, + "step": 5681 + }, + { + "epoch": 0.31670475447299484, + "grad_norm": 0.5492895245552063, + "learning_rate": 7.874665641624599e-05, + "loss": 1.7739, + "step": 5682 + }, + { + "epoch": 0.31676049272615797, + "grad_norm": 0.5405164957046509, + "learning_rate": 7.873937051072035e-05, + "loss": 1.747, + "step": 5683 + }, + { + "epoch": 0.3168162309793211, + "grad_norm": 0.5549308061599731, + "learning_rate": 7.873208369374943e-05, + "loss": 1.8224, + "step": 5684 + }, + { + "epoch": 0.31687196923248423, + "grad_norm": 0.5366522669792175, + "learning_rate": 7.872479596556435e-05, + "loss": 1.6589, + "step": 5685 + }, + { + "epoch": 0.3169277074856474, + "grad_norm": 0.527472734451294, + "learning_rate": 7.871750732639621e-05, + "loss": 1.6122, + "step": 5686 + }, + { + "epoch": 0.31698344573881054, + "grad_norm": 0.5421255826950073, + "learning_rate": 7.871021777647618e-05, + "loss": 1.766, + "step": 5687 + }, + { + "epoch": 0.31703918399197367, + "grad_norm": 0.5596272945404053, + "learning_rate": 7.870292731603544e-05, + "loss": 1.765, + "step": 5688 + }, + { + "epoch": 0.31709492224513686, + "grad_norm": 0.5629613995552063, + "learning_rate": 7.869563594530517e-05, + "loss": 1.6374, + "step": 5689 + }, + { + "epoch": 0.3171506604983, + "grad_norm": 0.5471567511558533, + "learning_rate": 7.868834366451665e-05, + "loss": 1.8048, + "step": 5690 + }, + { + "epoch": 0.3172063987514631, + "grad_norm": 0.6505834460258484, + "learning_rate": 7.868105047390113e-05, + "loss": 2.1298, + "step": 5691 + }, + { + "epoch": 0.3172621370046263, + "grad_norm": 0.5665611624717712, + "learning_rate": 7.867375637368993e-05, + "loss": 1.6, + "step": 5692 + }, + { + "epoch": 0.31731787525778943, + "grad_norm": 0.5327755212783813, + "learning_rate": 7.866646136411433e-05, + "loss": 1.7876, + "step": 5693 + }, + { + "epoch": 0.31737361351095256, + "grad_norm": 0.5993742942810059, + "learning_rate": 7.865916544540573e-05, + "loss": 1.7237, + "step": 5694 + }, + { + "epoch": 0.3174293517641157, + "grad_norm": 0.5317041873931885, + "learning_rate": 7.865186861779548e-05, + "loss": 1.5221, + "step": 5695 + }, + { + "epoch": 0.31748509001727887, + "grad_norm": 0.5825653076171875, + "learning_rate": 7.864457088151502e-05, + "loss": 1.7575, + "step": 5696 + }, + { + "epoch": 0.317540828270442, + "grad_norm": 0.5435444116592407, + "learning_rate": 7.863727223679578e-05, + "loss": 1.789, + "step": 5697 + }, + { + "epoch": 0.31759656652360513, + "grad_norm": 0.5559577941894531, + "learning_rate": 7.862997268386924e-05, + "loss": 1.802, + "step": 5698 + }, + { + "epoch": 0.3176523047767683, + "grad_norm": 0.6636247634887695, + "learning_rate": 7.862267222296687e-05, + "loss": 2.0765, + "step": 5699 + }, + { + "epoch": 0.31770804302993144, + "grad_norm": 0.49671420454978943, + "learning_rate": 7.861537085432025e-05, + "loss": 1.5644, + "step": 5700 + }, + { + "epoch": 0.31776378128309457, + "grad_norm": 0.5270445942878723, + "learning_rate": 7.860806857816088e-05, + "loss": 1.7291, + "step": 5701 + }, + { + "epoch": 0.31781951953625776, + "grad_norm": 0.6097070574760437, + "learning_rate": 7.860076539472037e-05, + "loss": 1.9244, + "step": 5702 + }, + { + "epoch": 0.3178752577894209, + "grad_norm": 0.537875235080719, + "learning_rate": 7.859346130423035e-05, + "loss": 1.7579, + "step": 5703 + }, + { + "epoch": 0.317930996042584, + "grad_norm": 0.5384728908538818, + "learning_rate": 7.858615630692244e-05, + "loss": 1.5755, + "step": 5704 + }, + { + "epoch": 0.3179867342957472, + "grad_norm": 0.5751199722290039, + "learning_rate": 7.857885040302833e-05, + "loss": 1.6979, + "step": 5705 + }, + { + "epoch": 0.3180424725489103, + "grad_norm": 0.5749076008796692, + "learning_rate": 7.857154359277972e-05, + "loss": 1.6744, + "step": 5706 + }, + { + "epoch": 0.31809821080207346, + "grad_norm": 0.5693714022636414, + "learning_rate": 7.85642358764083e-05, + "loss": 1.8986, + "step": 5707 + }, + { + "epoch": 0.3181539490552366, + "grad_norm": 0.504147469997406, + "learning_rate": 7.855692725414587e-05, + "loss": 1.5641, + "step": 5708 + }, + { + "epoch": 0.31820968730839977, + "grad_norm": 0.5494616031646729, + "learning_rate": 7.854961772622423e-05, + "loss": 1.6743, + "step": 5709 + }, + { + "epoch": 0.3182654255615629, + "grad_norm": 0.49635690450668335, + "learning_rate": 7.854230729287515e-05, + "loss": 1.5466, + "step": 5710 + }, + { + "epoch": 0.318321163814726, + "grad_norm": 0.569781482219696, + "learning_rate": 7.853499595433049e-05, + "loss": 1.7647, + "step": 5711 + }, + { + "epoch": 0.3183769020678892, + "grad_norm": 0.540679931640625, + "learning_rate": 7.852768371082215e-05, + "loss": 1.6237, + "step": 5712 + }, + { + "epoch": 0.31843264032105234, + "grad_norm": 0.5818458795547485, + "learning_rate": 7.852037056258199e-05, + "loss": 1.9955, + "step": 5713 + }, + { + "epoch": 0.31848837857421547, + "grad_norm": 0.5366159081459045, + "learning_rate": 7.851305650984197e-05, + "loss": 1.5985, + "step": 5714 + }, + { + "epoch": 0.31854411682737865, + "grad_norm": 0.7078673839569092, + "learning_rate": 7.850574155283404e-05, + "loss": 1.6371, + "step": 5715 + }, + { + "epoch": 0.3185998550805418, + "grad_norm": 0.6395692825317383, + "learning_rate": 7.849842569179017e-05, + "loss": 2.0647, + "step": 5716 + }, + { + "epoch": 0.3186555933337049, + "grad_norm": 0.5583460927009583, + "learning_rate": 7.849110892694242e-05, + "loss": 1.8005, + "step": 5717 + }, + { + "epoch": 0.31871133158686804, + "grad_norm": 0.6016951203346252, + "learning_rate": 7.848379125852282e-05, + "loss": 1.9861, + "step": 5718 + }, + { + "epoch": 0.3187670698400312, + "grad_norm": 0.5291598439216614, + "learning_rate": 7.847647268676341e-05, + "loss": 1.6806, + "step": 5719 + }, + { + "epoch": 0.31882280809319435, + "grad_norm": 0.5864149332046509, + "learning_rate": 7.846915321189632e-05, + "loss": 1.7323, + "step": 5720 + }, + { + "epoch": 0.3188785463463575, + "grad_norm": 0.5477664470672607, + "learning_rate": 7.846183283415367e-05, + "loss": 1.7307, + "step": 5721 + }, + { + "epoch": 0.31893428459952067, + "grad_norm": 0.5449158549308777, + "learning_rate": 7.845451155376764e-05, + "loss": 1.679, + "step": 5722 + }, + { + "epoch": 0.3189900228526838, + "grad_norm": 0.5383809804916382, + "learning_rate": 7.844718937097039e-05, + "loss": 1.6991, + "step": 5723 + }, + { + "epoch": 0.3190457611058469, + "grad_norm": 0.4735757112503052, + "learning_rate": 7.843986628599416e-05, + "loss": 1.4701, + "step": 5724 + }, + { + "epoch": 0.3191014993590101, + "grad_norm": 0.5248317122459412, + "learning_rate": 7.843254229907119e-05, + "loss": 1.7293, + "step": 5725 + }, + { + "epoch": 0.31915723761217324, + "grad_norm": 0.5262721180915833, + "learning_rate": 7.842521741043375e-05, + "loss": 1.6067, + "step": 5726 + }, + { + "epoch": 0.31921297586533637, + "grad_norm": 0.5584807991981506, + "learning_rate": 7.841789162031415e-05, + "loss": 1.8573, + "step": 5727 + }, + { + "epoch": 0.31926871411849955, + "grad_norm": 0.5617311596870422, + "learning_rate": 7.84105649289447e-05, + "loss": 1.7482, + "step": 5728 + }, + { + "epoch": 0.3193244523716627, + "grad_norm": 0.5431827902793884, + "learning_rate": 7.840323733655778e-05, + "loss": 1.8564, + "step": 5729 + }, + { + "epoch": 0.3193801906248258, + "grad_norm": 0.5269571542739868, + "learning_rate": 7.839590884338579e-05, + "loss": 1.4677, + "step": 5730 + }, + { + "epoch": 0.31943592887798894, + "grad_norm": 0.5726506114006042, + "learning_rate": 7.838857944966113e-05, + "loss": 1.7656, + "step": 5731 + }, + { + "epoch": 0.3194916671311521, + "grad_norm": 0.5350455641746521, + "learning_rate": 7.838124915561623e-05, + "loss": 1.525, + "step": 5732 + }, + { + "epoch": 0.31954740538431525, + "grad_norm": 0.6093659996986389, + "learning_rate": 7.837391796148359e-05, + "loss": 1.9737, + "step": 5733 + }, + { + "epoch": 0.3196031436374784, + "grad_norm": 0.5513406991958618, + "learning_rate": 7.83665858674957e-05, + "loss": 1.6783, + "step": 5734 + }, + { + "epoch": 0.31965888189064157, + "grad_norm": 0.5465078949928284, + "learning_rate": 7.835925287388511e-05, + "loss": 1.5786, + "step": 5735 + }, + { + "epoch": 0.3197146201438047, + "grad_norm": 0.5756266713142395, + "learning_rate": 7.835191898088435e-05, + "loss": 1.7969, + "step": 5736 + }, + { + "epoch": 0.3197703583969678, + "grad_norm": 0.5218703150749207, + "learning_rate": 7.8344584188726e-05, + "loss": 1.619, + "step": 5737 + }, + { + "epoch": 0.319826096650131, + "grad_norm": 0.5465853810310364, + "learning_rate": 7.833724849764273e-05, + "loss": 1.6193, + "step": 5738 + }, + { + "epoch": 0.31988183490329414, + "grad_norm": 0.596364438533783, + "learning_rate": 7.832991190786716e-05, + "loss": 1.7853, + "step": 5739 + }, + { + "epoch": 0.31993757315645727, + "grad_norm": 0.544185221195221, + "learning_rate": 7.832257441963195e-05, + "loss": 1.8835, + "step": 5740 + }, + { + "epoch": 0.3199933114096204, + "grad_norm": 0.6070075631141663, + "learning_rate": 7.83152360331698e-05, + "loss": 2.1082, + "step": 5741 + }, + { + "epoch": 0.3200490496627836, + "grad_norm": 0.5382431745529175, + "learning_rate": 7.830789674871346e-05, + "loss": 1.7184, + "step": 5742 + }, + { + "epoch": 0.3201047879159467, + "grad_norm": 0.5074361562728882, + "learning_rate": 7.830055656649568e-05, + "loss": 1.5133, + "step": 5743 + }, + { + "epoch": 0.32016052616910984, + "grad_norm": 0.5396546125411987, + "learning_rate": 7.829321548674926e-05, + "loss": 1.6203, + "step": 5744 + }, + { + "epoch": 0.320216264422273, + "grad_norm": 0.5758295059204102, + "learning_rate": 7.8285873509707e-05, + "loss": 1.8658, + "step": 5745 + }, + { + "epoch": 0.32027200267543615, + "grad_norm": 0.506420910358429, + "learning_rate": 7.827853063560175e-05, + "loss": 1.509, + "step": 5746 + }, + { + "epoch": 0.3203277409285993, + "grad_norm": 0.5390977263450623, + "learning_rate": 7.82711868646664e-05, + "loss": 1.8333, + "step": 5747 + }, + { + "epoch": 0.32038347918176246, + "grad_norm": 0.5680609345436096, + "learning_rate": 7.82638421971338e-05, + "loss": 1.6984, + "step": 5748 + }, + { + "epoch": 0.3204392174349256, + "grad_norm": 0.5344312191009521, + "learning_rate": 7.825649663323693e-05, + "loss": 1.6667, + "step": 5749 + }, + { + "epoch": 0.3204949556880887, + "grad_norm": 0.610658586025238, + "learning_rate": 7.824915017320874e-05, + "loss": 1.7763, + "step": 5750 + }, + { + "epoch": 0.3205506939412519, + "grad_norm": 0.5463300943374634, + "learning_rate": 7.824180281728222e-05, + "loss": 1.5632, + "step": 5751 + }, + { + "epoch": 0.32060643219441504, + "grad_norm": 0.5856190919876099, + "learning_rate": 7.823445456569036e-05, + "loss": 1.8129, + "step": 5752 + }, + { + "epoch": 0.32066217044757817, + "grad_norm": 0.7068459987640381, + "learning_rate": 7.822710541866622e-05, + "loss": 1.8126, + "step": 5753 + }, + { + "epoch": 0.3207179087007413, + "grad_norm": 0.6159639954566956, + "learning_rate": 7.821975537644286e-05, + "loss": 1.7802, + "step": 5754 + }, + { + "epoch": 0.3207736469539045, + "grad_norm": 0.583821177482605, + "learning_rate": 7.821240443925341e-05, + "loss": 1.9406, + "step": 5755 + }, + { + "epoch": 0.3208293852070676, + "grad_norm": 0.49633607268333435, + "learning_rate": 7.820505260733098e-05, + "loss": 1.4748, + "step": 5756 + }, + { + "epoch": 0.32088512346023074, + "grad_norm": 0.5159478187561035, + "learning_rate": 7.819769988090873e-05, + "loss": 1.716, + "step": 5757 + }, + { + "epoch": 0.3209408617133939, + "grad_norm": 0.5665544867515564, + "learning_rate": 7.819034626021983e-05, + "loss": 1.8005, + "step": 5758 + }, + { + "epoch": 0.32099659996655705, + "grad_norm": 0.567043125629425, + "learning_rate": 7.818299174549752e-05, + "loss": 1.675, + "step": 5759 + }, + { + "epoch": 0.3210523382197202, + "grad_norm": 0.5980729460716248, + "learning_rate": 7.817563633697503e-05, + "loss": 1.9635, + "step": 5760 + }, + { + "epoch": 0.32110807647288336, + "grad_norm": 0.5714271068572998, + "learning_rate": 7.816828003488563e-05, + "loss": 1.7265, + "step": 5761 + }, + { + "epoch": 0.3211638147260465, + "grad_norm": 0.5386238694190979, + "learning_rate": 7.816092283946261e-05, + "loss": 1.6653, + "step": 5762 + }, + { + "epoch": 0.3212195529792096, + "grad_norm": 0.5798346400260925, + "learning_rate": 7.815356475093931e-05, + "loss": 1.6578, + "step": 5763 + }, + { + "epoch": 0.32127529123237275, + "grad_norm": 0.5155278444290161, + "learning_rate": 7.81462057695491e-05, + "loss": 1.787, + "step": 5764 + }, + { + "epoch": 0.32133102948553594, + "grad_norm": 0.49146315455436707, + "learning_rate": 7.813884589552534e-05, + "loss": 1.5927, + "step": 5765 + }, + { + "epoch": 0.32138676773869906, + "grad_norm": 0.553433895111084, + "learning_rate": 7.813148512910144e-05, + "loss": 1.7973, + "step": 5766 + }, + { + "epoch": 0.3214425059918622, + "grad_norm": 0.5665645003318787, + "learning_rate": 7.812412347051083e-05, + "loss": 1.7949, + "step": 5767 + }, + { + "epoch": 0.3214982442450254, + "grad_norm": 0.5180385708808899, + "learning_rate": 7.811676091998704e-05, + "loss": 1.7011, + "step": 5768 + }, + { + "epoch": 0.3215539824981885, + "grad_norm": 0.581295371055603, + "learning_rate": 7.81093974777635e-05, + "loss": 1.7513, + "step": 5769 + }, + { + "epoch": 0.32160972075135164, + "grad_norm": 0.5677274465560913, + "learning_rate": 7.810203314407377e-05, + "loss": 1.9528, + "step": 5770 + }, + { + "epoch": 0.3216654590045148, + "grad_norm": 0.5377728939056396, + "learning_rate": 7.80946679191514e-05, + "loss": 1.6544, + "step": 5771 + }, + { + "epoch": 0.32172119725767795, + "grad_norm": 0.533319354057312, + "learning_rate": 7.808730180322996e-05, + "loss": 1.6561, + "step": 5772 + }, + { + "epoch": 0.3217769355108411, + "grad_norm": 0.5324406623840332, + "learning_rate": 7.807993479654307e-05, + "loss": 1.6776, + "step": 5773 + }, + { + "epoch": 0.32183267376400426, + "grad_norm": 0.5995755195617676, + "learning_rate": 7.807256689932435e-05, + "loss": 1.6976, + "step": 5774 + }, + { + "epoch": 0.3218884120171674, + "grad_norm": 0.5474086999893188, + "learning_rate": 7.806519811180751e-05, + "loss": 1.4983, + "step": 5775 + }, + { + "epoch": 0.3219441502703305, + "grad_norm": 0.5364895462989807, + "learning_rate": 7.805782843422618e-05, + "loss": 1.7632, + "step": 5776 + }, + { + "epoch": 0.32199988852349365, + "grad_norm": 0.5104418396949768, + "learning_rate": 7.805045786681415e-05, + "loss": 1.6873, + "step": 5777 + }, + { + "epoch": 0.32205562677665683, + "grad_norm": 0.5162766575813293, + "learning_rate": 7.804308640980513e-05, + "loss": 1.6692, + "step": 5778 + }, + { + "epoch": 0.32211136502981996, + "grad_norm": 0.5526577234268188, + "learning_rate": 7.803571406343293e-05, + "loss": 1.631, + "step": 5779 + }, + { + "epoch": 0.3221671032829831, + "grad_norm": 0.4954930245876312, + "learning_rate": 7.802834082793131e-05, + "loss": 1.4774, + "step": 5780 + }, + { + "epoch": 0.3222228415361463, + "grad_norm": 0.5704354643821716, + "learning_rate": 7.802096670353416e-05, + "loss": 1.9247, + "step": 5781 + }, + { + "epoch": 0.3222785797893094, + "grad_norm": 0.5746217966079712, + "learning_rate": 7.80135916904753e-05, + "loss": 1.9075, + "step": 5782 + }, + { + "epoch": 0.32233431804247253, + "grad_norm": 0.5538354516029358, + "learning_rate": 7.800621578898867e-05, + "loss": 1.6338, + "step": 5783 + }, + { + "epoch": 0.3223900562956357, + "grad_norm": 0.5441854596138, + "learning_rate": 7.799883899930815e-05, + "loss": 1.6214, + "step": 5784 + }, + { + "epoch": 0.32244579454879885, + "grad_norm": 0.5677271485328674, + "learning_rate": 7.79914613216677e-05, + "loss": 1.7258, + "step": 5785 + }, + { + "epoch": 0.322501532801962, + "grad_norm": 0.5610553026199341, + "learning_rate": 7.798408275630129e-05, + "loss": 1.6471, + "step": 5786 + }, + { + "epoch": 0.3225572710551251, + "grad_norm": 0.5126567482948303, + "learning_rate": 7.797670330344294e-05, + "loss": 1.7154, + "step": 5787 + }, + { + "epoch": 0.3226130093082883, + "grad_norm": 0.565370500087738, + "learning_rate": 7.796932296332667e-05, + "loss": 1.7534, + "step": 5788 + }, + { + "epoch": 0.3226687475614514, + "grad_norm": 0.5113086104393005, + "learning_rate": 7.796194173618654e-05, + "loss": 1.5581, + "step": 5789 + }, + { + "epoch": 0.32272448581461455, + "grad_norm": 0.543984591960907, + "learning_rate": 7.795455962225669e-05, + "loss": 1.7255, + "step": 5790 + }, + { + "epoch": 0.32278022406777773, + "grad_norm": 0.5158193707466125, + "learning_rate": 7.794717662177115e-05, + "loss": 1.6029, + "step": 5791 + }, + { + "epoch": 0.32283596232094086, + "grad_norm": 0.5405291318893433, + "learning_rate": 7.793979273496414e-05, + "loss": 1.6035, + "step": 5792 + }, + { + "epoch": 0.322891700574104, + "grad_norm": 0.617701530456543, + "learning_rate": 7.793240796206979e-05, + "loss": 1.8577, + "step": 5793 + }, + { + "epoch": 0.3229474388272672, + "grad_norm": 0.4910410940647125, + "learning_rate": 7.79250223033223e-05, + "loss": 1.4227, + "step": 5794 + }, + { + "epoch": 0.3230031770804303, + "grad_norm": 0.5436237454414368, + "learning_rate": 7.791763575895594e-05, + "loss": 1.5865, + "step": 5795 + }, + { + "epoch": 0.32305891533359343, + "grad_norm": 0.5777418613433838, + "learning_rate": 7.791024832920496e-05, + "loss": 1.8056, + "step": 5796 + }, + { + "epoch": 0.3231146535867566, + "grad_norm": 0.5960043668746948, + "learning_rate": 7.79028600143036e-05, + "loss": 1.8124, + "step": 5797 + }, + { + "epoch": 0.32317039183991975, + "grad_norm": 0.5568564534187317, + "learning_rate": 7.789547081448622e-05, + "loss": 1.614, + "step": 5798 + }, + { + "epoch": 0.3232261300930829, + "grad_norm": 0.5896525979042053, + "learning_rate": 7.788808072998715e-05, + "loss": 1.784, + "step": 5799 + }, + { + "epoch": 0.323281868346246, + "grad_norm": 0.5450705885887146, + "learning_rate": 7.788068976104074e-05, + "loss": 1.462, + "step": 5800 + }, + { + "epoch": 0.3233376065994092, + "grad_norm": 0.4870886206626892, + "learning_rate": 7.787329790788142e-05, + "loss": 1.5523, + "step": 5801 + }, + { + "epoch": 0.3233933448525723, + "grad_norm": 0.5481093525886536, + "learning_rate": 7.78659051707436e-05, + "loss": 1.6292, + "step": 5802 + }, + { + "epoch": 0.32344908310573545, + "grad_norm": 0.5144929885864258, + "learning_rate": 7.785851154986174e-05, + "loss": 1.4811, + "step": 5803 + }, + { + "epoch": 0.32350482135889863, + "grad_norm": 0.5884720683097839, + "learning_rate": 7.785111704547032e-05, + "loss": 1.8426, + "step": 5804 + }, + { + "epoch": 0.32356055961206176, + "grad_norm": 0.5478202104568481, + "learning_rate": 7.784372165780386e-05, + "loss": 1.4918, + "step": 5805 + }, + { + "epoch": 0.3236162978652249, + "grad_norm": 0.5706868767738342, + "learning_rate": 7.783632538709688e-05, + "loss": 1.6687, + "step": 5806 + }, + { + "epoch": 0.3236720361183881, + "grad_norm": 0.569288432598114, + "learning_rate": 7.782892823358394e-05, + "loss": 1.7208, + "step": 5807 + }, + { + "epoch": 0.3237277743715512, + "grad_norm": 0.6056145429611206, + "learning_rate": 7.782153019749967e-05, + "loss": 1.9566, + "step": 5808 + }, + { + "epoch": 0.32378351262471433, + "grad_norm": 0.5828245878219604, + "learning_rate": 7.781413127907868e-05, + "loss": 1.7169, + "step": 5809 + }, + { + "epoch": 0.32383925087787746, + "grad_norm": 0.5503557920455933, + "learning_rate": 7.780673147855559e-05, + "loss": 1.7084, + "step": 5810 + }, + { + "epoch": 0.32389498913104064, + "grad_norm": 0.5861828327178955, + "learning_rate": 7.779933079616512e-05, + "loss": 1.6815, + "step": 5811 + }, + { + "epoch": 0.3239507273842038, + "grad_norm": 0.5410308837890625, + "learning_rate": 7.779192923214196e-05, + "loss": 1.6899, + "step": 5812 + }, + { + "epoch": 0.3240064656373669, + "grad_norm": 0.6349414587020874, + "learning_rate": 7.778452678672084e-05, + "loss": 2.0061, + "step": 5813 + }, + { + "epoch": 0.3240622038905301, + "grad_norm": 0.6143296360969543, + "learning_rate": 7.777712346013651e-05, + "loss": 1.6939, + "step": 5814 + }, + { + "epoch": 0.3241179421436932, + "grad_norm": 0.5646039247512817, + "learning_rate": 7.776971925262379e-05, + "loss": 1.4296, + "step": 5815 + }, + { + "epoch": 0.32417368039685635, + "grad_norm": 0.570025622844696, + "learning_rate": 7.776231416441748e-05, + "loss": 1.8693, + "step": 5816 + }, + { + "epoch": 0.32422941865001953, + "grad_norm": 0.4873752295970917, + "learning_rate": 7.775490819575242e-05, + "loss": 1.5215, + "step": 5817 + }, + { + "epoch": 0.32428515690318266, + "grad_norm": 0.5546776652336121, + "learning_rate": 7.774750134686352e-05, + "loss": 1.6002, + "step": 5818 + }, + { + "epoch": 0.3243408951563458, + "grad_norm": 0.5605872273445129, + "learning_rate": 7.774009361798565e-05, + "loss": 1.42, + "step": 5819 + }, + { + "epoch": 0.32439663340950897, + "grad_norm": 0.5118110179901123, + "learning_rate": 7.773268500935372e-05, + "loss": 1.6076, + "step": 5820 + }, + { + "epoch": 0.3244523716626721, + "grad_norm": 0.5516108274459839, + "learning_rate": 7.772527552120273e-05, + "loss": 1.6444, + "step": 5821 + }, + { + "epoch": 0.32450810991583523, + "grad_norm": 0.5176465511322021, + "learning_rate": 7.771786515376765e-05, + "loss": 1.3809, + "step": 5822 + }, + { + "epoch": 0.32456384816899836, + "grad_norm": 0.5901971459388733, + "learning_rate": 7.77104539072835e-05, + "loss": 1.8976, + "step": 5823 + }, + { + "epoch": 0.32461958642216154, + "grad_norm": 0.5981687903404236, + "learning_rate": 7.770304178198531e-05, + "loss": 1.7352, + "step": 5824 + }, + { + "epoch": 0.3246753246753247, + "grad_norm": 0.48600277304649353, + "learning_rate": 7.769562877810816e-05, + "loss": 1.5827, + "step": 5825 + }, + { + "epoch": 0.3247310629284878, + "grad_norm": 0.47773730754852295, + "learning_rate": 7.768821489588713e-05, + "loss": 1.44, + "step": 5826 + }, + { + "epoch": 0.324786801181651, + "grad_norm": 0.5615780353546143, + "learning_rate": 7.768080013555737e-05, + "loss": 1.6719, + "step": 5827 + }, + { + "epoch": 0.3248425394348141, + "grad_norm": 0.5451145172119141, + "learning_rate": 7.767338449735401e-05, + "loss": 1.355, + "step": 5828 + }, + { + "epoch": 0.32489827768797724, + "grad_norm": 0.5609704852104187, + "learning_rate": 7.766596798151224e-05, + "loss": 1.6764, + "step": 5829 + }, + { + "epoch": 0.32495401594114043, + "grad_norm": 0.5926015973091125, + "learning_rate": 7.765855058826727e-05, + "loss": 1.8243, + "step": 5830 + }, + { + "epoch": 0.32500975419430356, + "grad_norm": 0.5234283804893494, + "learning_rate": 7.765113231785435e-05, + "loss": 1.7313, + "step": 5831 + }, + { + "epoch": 0.3250654924474667, + "grad_norm": 0.5433173179626465, + "learning_rate": 7.764371317050873e-05, + "loss": 1.7546, + "step": 5832 + }, + { + "epoch": 0.3251212307006298, + "grad_norm": 0.6074669361114502, + "learning_rate": 7.763629314646568e-05, + "loss": 1.7879, + "step": 5833 + }, + { + "epoch": 0.325176968953793, + "grad_norm": 0.6136168241500854, + "learning_rate": 7.762887224596055e-05, + "loss": 1.8066, + "step": 5834 + }, + { + "epoch": 0.32523270720695613, + "grad_norm": 0.5498754978179932, + "learning_rate": 7.76214504692287e-05, + "loss": 1.6913, + "step": 5835 + }, + { + "epoch": 0.32528844546011926, + "grad_norm": 0.5876418352127075, + "learning_rate": 7.761402781650547e-05, + "loss": 1.7581, + "step": 5836 + }, + { + "epoch": 0.32534418371328244, + "grad_norm": 0.5235028862953186, + "learning_rate": 7.760660428802628e-05, + "loss": 1.5955, + "step": 5837 + }, + { + "epoch": 0.32539992196644557, + "grad_norm": 0.54973304271698, + "learning_rate": 7.759917988402657e-05, + "loss": 1.6833, + "step": 5838 + }, + { + "epoch": 0.3254556602196087, + "grad_norm": 0.6082160472869873, + "learning_rate": 7.759175460474177e-05, + "loss": 1.8303, + "step": 5839 + }, + { + "epoch": 0.3255113984727719, + "grad_norm": 0.5204039812088013, + "learning_rate": 7.758432845040737e-05, + "loss": 1.7216, + "step": 5840 + }, + { + "epoch": 0.325567136725935, + "grad_norm": 0.5268458724021912, + "learning_rate": 7.757690142125893e-05, + "loss": 1.6099, + "step": 5841 + }, + { + "epoch": 0.32562287497909814, + "grad_norm": 0.5118129253387451, + "learning_rate": 7.756947351753196e-05, + "loss": 1.5388, + "step": 5842 + }, + { + "epoch": 0.3256786132322613, + "grad_norm": 0.5349292159080505, + "learning_rate": 7.756204473946203e-05, + "loss": 1.6813, + "step": 5843 + }, + { + "epoch": 0.32573435148542446, + "grad_norm": 0.5555446743965149, + "learning_rate": 7.755461508728472e-05, + "loss": 1.5549, + "step": 5844 + }, + { + "epoch": 0.3257900897385876, + "grad_norm": 0.5379804372787476, + "learning_rate": 7.75471845612357e-05, + "loss": 1.5658, + "step": 5845 + }, + { + "epoch": 0.3258458279917507, + "grad_norm": 0.618511974811554, + "learning_rate": 7.753975316155057e-05, + "loss": 1.8505, + "step": 5846 + }, + { + "epoch": 0.3259015662449139, + "grad_norm": 0.6143367290496826, + "learning_rate": 7.753232088846505e-05, + "loss": 1.953, + "step": 5847 + }, + { + "epoch": 0.325957304498077, + "grad_norm": 0.543201208114624, + "learning_rate": 7.752488774221485e-05, + "loss": 1.9068, + "step": 5848 + }, + { + "epoch": 0.32601304275124016, + "grad_norm": 0.5580254197120667, + "learning_rate": 7.751745372303567e-05, + "loss": 1.6766, + "step": 5849 + }, + { + "epoch": 0.32606878100440334, + "grad_norm": 0.5846728086471558, + "learning_rate": 7.751001883116331e-05, + "loss": 1.874, + "step": 5850 + }, + { + "epoch": 0.32612451925756647, + "grad_norm": 0.5597751140594482, + "learning_rate": 7.750258306683353e-05, + "loss": 1.7491, + "step": 5851 + }, + { + "epoch": 0.3261802575107296, + "grad_norm": 0.49921393394470215, + "learning_rate": 7.749514643028218e-05, + "loss": 1.3701, + "step": 5852 + }, + { + "epoch": 0.3262359957638928, + "grad_norm": 0.5255808234214783, + "learning_rate": 7.748770892174509e-05, + "loss": 1.4772, + "step": 5853 + }, + { + "epoch": 0.3262917340170559, + "grad_norm": 0.5470353960990906, + "learning_rate": 7.748027054145814e-05, + "loss": 1.7885, + "step": 5854 + }, + { + "epoch": 0.32634747227021904, + "grad_norm": 0.575181782245636, + "learning_rate": 7.747283128965723e-05, + "loss": 1.8875, + "step": 5855 + }, + { + "epoch": 0.32640321052338217, + "grad_norm": 0.6346047520637512, + "learning_rate": 7.74653911665783e-05, + "loss": 2.0948, + "step": 5856 + }, + { + "epoch": 0.32645894877654535, + "grad_norm": 0.5814865231513977, + "learning_rate": 7.745795017245729e-05, + "loss": 1.572, + "step": 5857 + }, + { + "epoch": 0.3265146870297085, + "grad_norm": 0.5990648865699768, + "learning_rate": 7.745050830753018e-05, + "loss": 1.7464, + "step": 5858 + }, + { + "epoch": 0.3265704252828716, + "grad_norm": 0.5689359903335571, + "learning_rate": 7.744306557203299e-05, + "loss": 1.9168, + "step": 5859 + }, + { + "epoch": 0.3266261635360348, + "grad_norm": 0.5398204326629639, + "learning_rate": 7.743562196620177e-05, + "loss": 1.6884, + "step": 5860 + }, + { + "epoch": 0.3266819017891979, + "grad_norm": 0.5738016366958618, + "learning_rate": 7.74281774902726e-05, + "loss": 1.815, + "step": 5861 + }, + { + "epoch": 0.32673764004236105, + "grad_norm": 0.5424049496650696, + "learning_rate": 7.742073214448153e-05, + "loss": 1.832, + "step": 5862 + }, + { + "epoch": 0.32679337829552424, + "grad_norm": 0.5409512519836426, + "learning_rate": 7.741328592906474e-05, + "loss": 1.7179, + "step": 5863 + }, + { + "epoch": 0.32684911654868737, + "grad_norm": 0.5621674656867981, + "learning_rate": 7.740583884425833e-05, + "loss": 1.8319, + "step": 5864 + }, + { + "epoch": 0.3269048548018505, + "grad_norm": 0.5400972962379456, + "learning_rate": 7.73983908902985e-05, + "loss": 1.6868, + "step": 5865 + }, + { + "epoch": 0.3269605930550137, + "grad_norm": 0.5927982926368713, + "learning_rate": 7.739094206742146e-05, + "loss": 1.6426, + "step": 5866 + }, + { + "epoch": 0.3270163313081768, + "grad_norm": 0.510775089263916, + "learning_rate": 7.738349237586343e-05, + "loss": 1.6661, + "step": 5867 + }, + { + "epoch": 0.32707206956133994, + "grad_norm": 0.5710152387619019, + "learning_rate": 7.737604181586068e-05, + "loss": 1.7263, + "step": 5868 + }, + { + "epoch": 0.32712780781450307, + "grad_norm": 0.5645250082015991, + "learning_rate": 7.736859038764952e-05, + "loss": 1.7197, + "step": 5869 + }, + { + "epoch": 0.32718354606766625, + "grad_norm": 0.5439823865890503, + "learning_rate": 7.73611380914662e-05, + "loss": 1.7229, + "step": 5870 + }, + { + "epoch": 0.3272392843208294, + "grad_norm": 0.5163010358810425, + "learning_rate": 7.735368492754715e-05, + "loss": 1.5273, + "step": 5871 + }, + { + "epoch": 0.3272950225739925, + "grad_norm": 0.5735363960266113, + "learning_rate": 7.734623089612867e-05, + "loss": 1.7926, + "step": 5872 + }, + { + "epoch": 0.3273507608271557, + "grad_norm": 0.5508522391319275, + "learning_rate": 7.73387759974472e-05, + "loss": 1.492, + "step": 5873 + }, + { + "epoch": 0.3274064990803188, + "grad_norm": 0.6105926632881165, + "learning_rate": 7.733132023173915e-05, + "loss": 1.6155, + "step": 5874 + }, + { + "epoch": 0.32746223733348195, + "grad_norm": 0.5956704020500183, + "learning_rate": 7.732386359924097e-05, + "loss": 1.7757, + "step": 5875 + }, + { + "epoch": 0.32751797558664514, + "grad_norm": 0.6001446843147278, + "learning_rate": 7.731640610018914e-05, + "loss": 1.6669, + "step": 5876 + }, + { + "epoch": 0.32757371383980827, + "grad_norm": 0.6132667660713196, + "learning_rate": 7.730894773482019e-05, + "loss": 1.944, + "step": 5877 + }, + { + "epoch": 0.3276294520929714, + "grad_norm": 0.5684986710548401, + "learning_rate": 7.730148850337062e-05, + "loss": 1.7491, + "step": 5878 + }, + { + "epoch": 0.3276851903461345, + "grad_norm": 0.537605881690979, + "learning_rate": 7.729402840607702e-05, + "loss": 1.7473, + "step": 5879 + }, + { + "epoch": 0.3277409285992977, + "grad_norm": 0.5186078548431396, + "learning_rate": 7.728656744317598e-05, + "loss": 1.7703, + "step": 5880 + }, + { + "epoch": 0.32779666685246084, + "grad_norm": 0.5188151001930237, + "learning_rate": 7.727910561490411e-05, + "loss": 1.6632, + "step": 5881 + }, + { + "epoch": 0.32785240510562397, + "grad_norm": 0.5799871683120728, + "learning_rate": 7.727164292149806e-05, + "loss": 1.7289, + "step": 5882 + }, + { + "epoch": 0.32790814335878715, + "grad_norm": 0.5974400639533997, + "learning_rate": 7.72641793631945e-05, + "loss": 1.9396, + "step": 5883 + }, + { + "epoch": 0.3279638816119503, + "grad_norm": 0.5383574366569519, + "learning_rate": 7.725671494023014e-05, + "loss": 1.6176, + "step": 5884 + }, + { + "epoch": 0.3280196198651134, + "grad_norm": 0.5623538494110107, + "learning_rate": 7.724924965284169e-05, + "loss": 1.7997, + "step": 5885 + }, + { + "epoch": 0.3280753581182766, + "grad_norm": 0.5270793437957764, + "learning_rate": 7.72417835012659e-05, + "loss": 1.762, + "step": 5886 + }, + { + "epoch": 0.3281310963714397, + "grad_norm": 0.4922736585140228, + "learning_rate": 7.72343164857396e-05, + "loss": 1.29, + "step": 5887 + }, + { + "epoch": 0.32818683462460285, + "grad_norm": 0.5568634867668152, + "learning_rate": 7.722684860649953e-05, + "loss": 1.8285, + "step": 5888 + }, + { + "epoch": 0.32824257287776604, + "grad_norm": 0.5732812285423279, + "learning_rate": 7.721937986378261e-05, + "loss": 1.6134, + "step": 5889 + }, + { + "epoch": 0.32829831113092917, + "grad_norm": 0.5091588497161865, + "learning_rate": 7.721191025782563e-05, + "loss": 1.5536, + "step": 5890 + }, + { + "epoch": 0.3283540493840923, + "grad_norm": 0.5646446347236633, + "learning_rate": 7.720443978886551e-05, + "loss": 1.6102, + "step": 5891 + }, + { + "epoch": 0.3284097876372554, + "grad_norm": 0.5230876207351685, + "learning_rate": 7.71969684571392e-05, + "loss": 1.7258, + "step": 5892 + }, + { + "epoch": 0.3284655258904186, + "grad_norm": 0.5695227980613708, + "learning_rate": 7.718949626288359e-05, + "loss": 1.7538, + "step": 5893 + }, + { + "epoch": 0.32852126414358174, + "grad_norm": 0.5724740028381348, + "learning_rate": 7.718202320633572e-05, + "loss": 1.5929, + "step": 5894 + }, + { + "epoch": 0.32857700239674487, + "grad_norm": 0.5088779926300049, + "learning_rate": 7.717454928773253e-05, + "loss": 1.5781, + "step": 5895 + }, + { + "epoch": 0.32863274064990805, + "grad_norm": 0.6324506402015686, + "learning_rate": 7.716707450731109e-05, + "loss": 1.97, + "step": 5896 + }, + { + "epoch": 0.3286884789030712, + "grad_norm": 0.5300724506378174, + "learning_rate": 7.715959886530843e-05, + "loss": 1.6759, + "step": 5897 + }, + { + "epoch": 0.3287442171562343, + "grad_norm": 0.5645179152488708, + "learning_rate": 7.715212236196164e-05, + "loss": 1.6515, + "step": 5898 + }, + { + "epoch": 0.3287999554093975, + "grad_norm": 0.575449526309967, + "learning_rate": 7.714464499750784e-05, + "loss": 1.7267, + "step": 5899 + }, + { + "epoch": 0.3288556936625606, + "grad_norm": 0.5279715657234192, + "learning_rate": 7.713716677218416e-05, + "loss": 1.6431, + "step": 5900 + }, + { + "epoch": 0.32891143191572375, + "grad_norm": 0.5209466814994812, + "learning_rate": 7.712968768622779e-05, + "loss": 1.5909, + "step": 5901 + }, + { + "epoch": 0.3289671701688869, + "grad_norm": 0.5469819903373718, + "learning_rate": 7.712220773987589e-05, + "loss": 1.6273, + "step": 5902 + }, + { + "epoch": 0.32902290842205006, + "grad_norm": 0.5781688690185547, + "learning_rate": 7.71147269333657e-05, + "loss": 1.8497, + "step": 5903 + }, + { + "epoch": 0.3290786466752132, + "grad_norm": 0.5549498200416565, + "learning_rate": 7.710724526693445e-05, + "loss": 1.6606, + "step": 5904 + }, + { + "epoch": 0.3291343849283763, + "grad_norm": 0.5616956949234009, + "learning_rate": 7.709976274081944e-05, + "loss": 1.8094, + "step": 5905 + }, + { + "epoch": 0.3291901231815395, + "grad_norm": 0.5189547538757324, + "learning_rate": 7.709227935525796e-05, + "loss": 1.7477, + "step": 5906 + }, + { + "epoch": 0.32924586143470264, + "grad_norm": 0.5060945749282837, + "learning_rate": 7.708479511048732e-05, + "loss": 1.4591, + "step": 5907 + }, + { + "epoch": 0.32930159968786576, + "grad_norm": 0.5463743209838867, + "learning_rate": 7.707731000674492e-05, + "loss": 1.6762, + "step": 5908 + }, + { + "epoch": 0.32935733794102895, + "grad_norm": 0.5190552473068237, + "learning_rate": 7.70698240442681e-05, + "loss": 1.529, + "step": 5909 + }, + { + "epoch": 0.3294130761941921, + "grad_norm": 0.5391181111335754, + "learning_rate": 7.70623372232943e-05, + "loss": 1.6953, + "step": 5910 + }, + { + "epoch": 0.3294688144473552, + "grad_norm": 0.5780003070831299, + "learning_rate": 7.705484954406092e-05, + "loss": 1.6728, + "step": 5911 + }, + { + "epoch": 0.3295245527005184, + "grad_norm": 0.554817795753479, + "learning_rate": 7.704736100680547e-05, + "loss": 1.6731, + "step": 5912 + }, + { + "epoch": 0.3295802909536815, + "grad_norm": 0.590787410736084, + "learning_rate": 7.703987161176545e-05, + "loss": 1.9063, + "step": 5913 + }, + { + "epoch": 0.32963602920684465, + "grad_norm": 0.5418079495429993, + "learning_rate": 7.703238135917832e-05, + "loss": 1.6984, + "step": 5914 + }, + { + "epoch": 0.3296917674600078, + "grad_norm": 0.5568365454673767, + "learning_rate": 7.702489024928168e-05, + "loss": 1.7057, + "step": 5915 + }, + { + "epoch": 0.32974750571317096, + "grad_norm": 0.5823662281036377, + "learning_rate": 7.701739828231309e-05, + "loss": 1.8851, + "step": 5916 + }, + { + "epoch": 0.3298032439663341, + "grad_norm": 0.588046133518219, + "learning_rate": 7.700990545851014e-05, + "loss": 1.6514, + "step": 5917 + }, + { + "epoch": 0.3298589822194972, + "grad_norm": 0.5833228826522827, + "learning_rate": 7.700241177811048e-05, + "loss": 1.7474, + "step": 5918 + }, + { + "epoch": 0.3299147204726604, + "grad_norm": 0.5376124978065491, + "learning_rate": 7.699491724135175e-05, + "loss": 1.65, + "step": 5919 + }, + { + "epoch": 0.32997045872582353, + "grad_norm": 0.579406201839447, + "learning_rate": 7.698742184847163e-05, + "loss": 1.7039, + "step": 5920 + }, + { + "epoch": 0.33002619697898666, + "grad_norm": 0.5547471046447754, + "learning_rate": 7.697992559970784e-05, + "loss": 1.7428, + "step": 5921 + }, + { + "epoch": 0.33008193523214985, + "grad_norm": 0.5924109816551208, + "learning_rate": 7.697242849529812e-05, + "loss": 1.7935, + "step": 5922 + }, + { + "epoch": 0.330137673485313, + "grad_norm": 0.5609079003334045, + "learning_rate": 7.69649305354802e-05, + "loss": 1.7302, + "step": 5923 + }, + { + "epoch": 0.3301934117384761, + "grad_norm": 0.5709410309791565, + "learning_rate": 7.695743172049192e-05, + "loss": 1.6529, + "step": 5924 + }, + { + "epoch": 0.33024914999163923, + "grad_norm": 0.5341020822525024, + "learning_rate": 7.694993205057108e-05, + "loss": 1.696, + "step": 5925 + }, + { + "epoch": 0.3303048882448024, + "grad_norm": 0.5852230787277222, + "learning_rate": 7.694243152595552e-05, + "loss": 1.6173, + "step": 5926 + }, + { + "epoch": 0.33036062649796555, + "grad_norm": 0.5338337421417236, + "learning_rate": 7.693493014688313e-05, + "loss": 1.4818, + "step": 5927 + }, + { + "epoch": 0.3304163647511287, + "grad_norm": 0.5398749113082886, + "learning_rate": 7.69274279135918e-05, + "loss": 1.631, + "step": 5928 + }, + { + "epoch": 0.33047210300429186, + "grad_norm": 0.5520002245903015, + "learning_rate": 7.691992482631944e-05, + "loss": 1.8426, + "step": 5929 + }, + { + "epoch": 0.330527841257455, + "grad_norm": 0.5498268008232117, + "learning_rate": 7.691242088530401e-05, + "loss": 1.8106, + "step": 5930 + }, + { + "epoch": 0.3305835795106181, + "grad_norm": 0.5437809824943542, + "learning_rate": 7.690491609078351e-05, + "loss": 1.7523, + "step": 5931 + }, + { + "epoch": 0.3306393177637813, + "grad_norm": 0.6089059114456177, + "learning_rate": 7.689741044299595e-05, + "loss": 1.7299, + "step": 5932 + }, + { + "epoch": 0.33069505601694443, + "grad_norm": 0.5289489030838013, + "learning_rate": 7.688990394217933e-05, + "loss": 1.691, + "step": 5933 + }, + { + "epoch": 0.33075079427010756, + "grad_norm": 0.555590033531189, + "learning_rate": 7.688239658857174e-05, + "loss": 1.45, + "step": 5934 + }, + { + "epoch": 0.33080653252327075, + "grad_norm": 0.6252313256263733, + "learning_rate": 7.687488838241128e-05, + "loss": 1.8009, + "step": 5935 + }, + { + "epoch": 0.3308622707764339, + "grad_norm": 0.5846867561340332, + "learning_rate": 7.686737932393605e-05, + "loss": 1.7873, + "step": 5936 + }, + { + "epoch": 0.330918009029597, + "grad_norm": 0.5312223434448242, + "learning_rate": 7.685986941338419e-05, + "loss": 1.6196, + "step": 5937 + }, + { + "epoch": 0.33097374728276013, + "grad_norm": 0.5511593222618103, + "learning_rate": 7.685235865099387e-05, + "loss": 1.7915, + "step": 5938 + }, + { + "epoch": 0.3310294855359233, + "grad_norm": 0.5287107825279236, + "learning_rate": 7.684484703700332e-05, + "loss": 1.6648, + "step": 5939 + }, + { + "epoch": 0.33108522378908645, + "grad_norm": 0.5697956681251526, + "learning_rate": 7.683733457165071e-05, + "loss": 2.0054, + "step": 5940 + }, + { + "epoch": 0.3311409620422496, + "grad_norm": 0.5331019759178162, + "learning_rate": 7.682982125517433e-05, + "loss": 1.7598, + "step": 5941 + }, + { + "epoch": 0.33119670029541276, + "grad_norm": 0.5488009452819824, + "learning_rate": 7.682230708781244e-05, + "loss": 1.4258, + "step": 5942 + }, + { + "epoch": 0.3312524385485759, + "grad_norm": 0.5415595173835754, + "learning_rate": 7.681479206980338e-05, + "loss": 1.766, + "step": 5943 + }, + { + "epoch": 0.331308176801739, + "grad_norm": 0.6208872199058533, + "learning_rate": 7.680727620138542e-05, + "loss": 1.879, + "step": 5944 + }, + { + "epoch": 0.3313639150549022, + "grad_norm": 0.5650165677070618, + "learning_rate": 7.679975948279699e-05, + "loss": 1.4933, + "step": 5945 + }, + { + "epoch": 0.33141965330806533, + "grad_norm": 0.5754852890968323, + "learning_rate": 7.679224191427642e-05, + "loss": 1.6821, + "step": 5946 + }, + { + "epoch": 0.33147539156122846, + "grad_norm": 0.5749027132987976, + "learning_rate": 7.678472349606215e-05, + "loss": 1.8599, + "step": 5947 + }, + { + "epoch": 0.3315311298143916, + "grad_norm": 0.5200157761573792, + "learning_rate": 7.677720422839263e-05, + "loss": 1.6659, + "step": 5948 + }, + { + "epoch": 0.3315868680675548, + "grad_norm": 0.6056989431381226, + "learning_rate": 7.676968411150629e-05, + "loss": 1.9657, + "step": 5949 + }, + { + "epoch": 0.3316426063207179, + "grad_norm": 0.5650584697723389, + "learning_rate": 7.676216314564166e-05, + "loss": 1.9396, + "step": 5950 + }, + { + "epoch": 0.33169834457388103, + "grad_norm": 0.5425543785095215, + "learning_rate": 7.675464133103726e-05, + "loss": 1.6447, + "step": 5951 + }, + { + "epoch": 0.3317540828270442, + "grad_norm": 0.5751011967658997, + "learning_rate": 7.674711866793163e-05, + "loss": 1.7975, + "step": 5952 + }, + { + "epoch": 0.33180982108020735, + "grad_norm": 0.521195113658905, + "learning_rate": 7.673959515656333e-05, + "loss": 1.6343, + "step": 5953 + }, + { + "epoch": 0.3318655593333705, + "grad_norm": 0.5193372964859009, + "learning_rate": 7.673207079717098e-05, + "loss": 1.7215, + "step": 5954 + }, + { + "epoch": 0.33192129758653366, + "grad_norm": 0.4974719285964966, + "learning_rate": 7.672454558999318e-05, + "loss": 1.5058, + "step": 5955 + }, + { + "epoch": 0.3319770358396968, + "grad_norm": 0.610576868057251, + "learning_rate": 7.671701953526863e-05, + "loss": 1.8826, + "step": 5956 + }, + { + "epoch": 0.3320327740928599, + "grad_norm": 0.5185069441795349, + "learning_rate": 7.670949263323599e-05, + "loss": 1.3823, + "step": 5957 + }, + { + "epoch": 0.3320885123460231, + "grad_norm": 0.5048871636390686, + "learning_rate": 7.670196488413397e-05, + "loss": 1.3208, + "step": 5958 + }, + { + "epoch": 0.33214425059918623, + "grad_norm": 0.512177586555481, + "learning_rate": 7.66944362882013e-05, + "loss": 1.4293, + "step": 5959 + }, + { + "epoch": 0.33219998885234936, + "grad_norm": 0.5636778473854065, + "learning_rate": 7.668690684567676e-05, + "loss": 1.5585, + "step": 5960 + }, + { + "epoch": 0.3322557271055125, + "grad_norm": 0.5499832630157471, + "learning_rate": 7.667937655679913e-05, + "loss": 1.5834, + "step": 5961 + }, + { + "epoch": 0.3323114653586757, + "grad_norm": 0.6139015555381775, + "learning_rate": 7.667184542180723e-05, + "loss": 2.0935, + "step": 5962 + }, + { + "epoch": 0.3323672036118388, + "grad_norm": 0.5284989476203918, + "learning_rate": 7.666431344093988e-05, + "loss": 1.6838, + "step": 5963 + }, + { + "epoch": 0.33242294186500193, + "grad_norm": 0.5448603630065918, + "learning_rate": 7.665678061443599e-05, + "loss": 1.6688, + "step": 5964 + }, + { + "epoch": 0.3324786801181651, + "grad_norm": 0.5356377959251404, + "learning_rate": 7.664924694253443e-05, + "loss": 1.6131, + "step": 5965 + }, + { + "epoch": 0.33253441837132824, + "grad_norm": 0.5786362886428833, + "learning_rate": 7.664171242547414e-05, + "loss": 1.859, + "step": 5966 + }, + { + "epoch": 0.3325901566244914, + "grad_norm": 0.5811523199081421, + "learning_rate": 7.663417706349407e-05, + "loss": 1.6848, + "step": 5967 + }, + { + "epoch": 0.33264589487765456, + "grad_norm": 0.5504920482635498, + "learning_rate": 7.662664085683317e-05, + "loss": 1.7, + "step": 5968 + }, + { + "epoch": 0.3327016331308177, + "grad_norm": 0.6110926866531372, + "learning_rate": 7.66191038057305e-05, + "loss": 1.87, + "step": 5969 + }, + { + "epoch": 0.3327573713839808, + "grad_norm": 0.5238990187644958, + "learning_rate": 7.661156591042502e-05, + "loss": 1.6083, + "step": 5970 + }, + { + "epoch": 0.33281310963714394, + "grad_norm": 0.5919533371925354, + "learning_rate": 7.660402717115584e-05, + "loss": 1.6786, + "step": 5971 + }, + { + "epoch": 0.33286884789030713, + "grad_norm": 0.565631091594696, + "learning_rate": 7.659648758816205e-05, + "loss": 1.595, + "step": 5972 + }, + { + "epoch": 0.33292458614347026, + "grad_norm": 0.6189529299736023, + "learning_rate": 7.658894716168271e-05, + "loss": 2.0188, + "step": 5973 + }, + { + "epoch": 0.3329803243966334, + "grad_norm": 0.5532551407814026, + "learning_rate": 7.658140589195701e-05, + "loss": 1.6095, + "step": 5974 + }, + { + "epoch": 0.33303606264979657, + "grad_norm": 0.4914916157722473, + "learning_rate": 7.657386377922409e-05, + "loss": 1.6199, + "step": 5975 + }, + { + "epoch": 0.3330918009029597, + "grad_norm": 0.5677047371864319, + "learning_rate": 7.656632082372315e-05, + "loss": 1.5635, + "step": 5976 + }, + { + "epoch": 0.33314753915612283, + "grad_norm": 0.5638590455055237, + "learning_rate": 7.65587770256934e-05, + "loss": 1.7578, + "step": 5977 + }, + { + "epoch": 0.333203277409286, + "grad_norm": 0.5115950107574463, + "learning_rate": 7.655123238537409e-05, + "loss": 1.4157, + "step": 5978 + }, + { + "epoch": 0.33325901566244914, + "grad_norm": 0.6125264763832092, + "learning_rate": 7.65436869030045e-05, + "loss": 1.8876, + "step": 5979 + }, + { + "epoch": 0.33331475391561227, + "grad_norm": 0.5354574918746948, + "learning_rate": 7.653614057882393e-05, + "loss": 1.7052, + "step": 5980 + }, + { + "epoch": 0.33337049216877546, + "grad_norm": 0.5426600575447083, + "learning_rate": 7.652859341307168e-05, + "loss": 1.7011, + "step": 5981 + }, + { + "epoch": 0.3334262304219386, + "grad_norm": 0.7442419528961182, + "learning_rate": 7.652104540598712e-05, + "loss": 1.7664, + "step": 5982 + }, + { + "epoch": 0.3334819686751017, + "grad_norm": 0.5431948900222778, + "learning_rate": 7.651349655780965e-05, + "loss": 1.5627, + "step": 5983 + }, + { + "epoch": 0.33353770692826484, + "grad_norm": 0.5939268469810486, + "learning_rate": 7.650594686877863e-05, + "loss": 1.8128, + "step": 5984 + }, + { + "epoch": 0.333593445181428, + "grad_norm": 0.540123462677002, + "learning_rate": 7.649839633913352e-05, + "loss": 1.6395, + "step": 5985 + }, + { + "epoch": 0.33364918343459116, + "grad_norm": 0.5777207016944885, + "learning_rate": 7.649084496911378e-05, + "loss": 1.7467, + "step": 5986 + }, + { + "epoch": 0.3337049216877543, + "grad_norm": 0.5720601081848145, + "learning_rate": 7.648329275895889e-05, + "loss": 1.8314, + "step": 5987 + }, + { + "epoch": 0.33376065994091747, + "grad_norm": 0.5010839104652405, + "learning_rate": 7.647573970890837e-05, + "loss": 1.5876, + "step": 5988 + }, + { + "epoch": 0.3338163981940806, + "grad_norm": 0.5364264249801636, + "learning_rate": 7.646818581920173e-05, + "loss": 1.6042, + "step": 5989 + }, + { + "epoch": 0.33387213644724373, + "grad_norm": 0.5355646014213562, + "learning_rate": 7.646063109007858e-05, + "loss": 1.5054, + "step": 5990 + }, + { + "epoch": 0.3339278747004069, + "grad_norm": 0.5173195600509644, + "learning_rate": 7.645307552177847e-05, + "loss": 1.7355, + "step": 5991 + }, + { + "epoch": 0.33398361295357004, + "grad_norm": 0.5141093134880066, + "learning_rate": 7.644551911454103e-05, + "loss": 1.5428, + "step": 5992 + }, + { + "epoch": 0.33403935120673317, + "grad_norm": 0.5739405751228333, + "learning_rate": 7.643796186860595e-05, + "loss": 1.8064, + "step": 5993 + }, + { + "epoch": 0.3340950894598963, + "grad_norm": 0.6502695083618164, + "learning_rate": 7.643040378421282e-05, + "loss": 1.9495, + "step": 5994 + }, + { + "epoch": 0.3341508277130595, + "grad_norm": 0.5652748942375183, + "learning_rate": 7.64228448616014e-05, + "loss": 1.6926, + "step": 5995 + }, + { + "epoch": 0.3342065659662226, + "grad_norm": 0.5500004291534424, + "learning_rate": 7.64152851010114e-05, + "loss": 1.6566, + "step": 5996 + }, + { + "epoch": 0.33426230421938574, + "grad_norm": 0.6248365044593811, + "learning_rate": 7.640772450268255e-05, + "loss": 1.6196, + "step": 5997 + }, + { + "epoch": 0.3343180424725489, + "grad_norm": 0.5509215593338013, + "learning_rate": 7.640016306685467e-05, + "loss": 1.6845, + "step": 5998 + }, + { + "epoch": 0.33437378072571206, + "grad_norm": 0.6251245141029358, + "learning_rate": 7.639260079376753e-05, + "loss": 1.9948, + "step": 5999 + }, + { + "epoch": 0.3344295189788752, + "grad_norm": 0.536384642124176, + "learning_rate": 7.638503768366098e-05, + "loss": 1.6778, + "step": 6000 + }, + { + "epoch": 0.33448525723203837, + "grad_norm": 0.5998651385307312, + "learning_rate": 7.637747373677486e-05, + "loss": 1.6279, + "step": 6001 + }, + { + "epoch": 0.3345409954852015, + "grad_norm": 0.5673259496688843, + "learning_rate": 7.636990895334907e-05, + "loss": 1.7001, + "step": 6002 + }, + { + "epoch": 0.3345967337383646, + "grad_norm": 0.5465088486671448, + "learning_rate": 7.63623433336235e-05, + "loss": 1.7576, + "step": 6003 + }, + { + "epoch": 0.3346524719915278, + "grad_norm": 0.5544756054878235, + "learning_rate": 7.635477687783814e-05, + "loss": 1.844, + "step": 6004 + }, + { + "epoch": 0.33470821024469094, + "grad_norm": 0.5186877846717834, + "learning_rate": 7.634720958623287e-05, + "loss": 1.6125, + "step": 6005 + }, + { + "epoch": 0.33476394849785407, + "grad_norm": 0.5501444935798645, + "learning_rate": 7.633964145904777e-05, + "loss": 1.7169, + "step": 6006 + }, + { + "epoch": 0.3348196867510172, + "grad_norm": 0.5606530904769897, + "learning_rate": 7.633207249652278e-05, + "loss": 1.6944, + "step": 6007 + }, + { + "epoch": 0.3348754250041804, + "grad_norm": 0.49215444922447205, + "learning_rate": 7.6324502698898e-05, + "loss": 1.4025, + "step": 6008 + }, + { + "epoch": 0.3349311632573435, + "grad_norm": 0.555610716342926, + "learning_rate": 7.631693206641346e-05, + "loss": 1.7292, + "step": 6009 + }, + { + "epoch": 0.33498690151050664, + "grad_norm": 0.5174264907836914, + "learning_rate": 7.630936059930927e-05, + "loss": 1.5525, + "step": 6010 + }, + { + "epoch": 0.3350426397636698, + "grad_norm": 0.5901679992675781, + "learning_rate": 7.630178829782558e-05, + "loss": 1.7284, + "step": 6011 + }, + { + "epoch": 0.33509837801683295, + "grad_norm": 0.5459769368171692, + "learning_rate": 7.629421516220249e-05, + "loss": 1.6727, + "step": 6012 + }, + { + "epoch": 0.3351541162699961, + "grad_norm": 0.5339307188987732, + "learning_rate": 7.628664119268023e-05, + "loss": 1.7325, + "step": 6013 + }, + { + "epoch": 0.33520985452315927, + "grad_norm": 0.533289909362793, + "learning_rate": 7.627906638949895e-05, + "loss": 1.5102, + "step": 6014 + }, + { + "epoch": 0.3352655927763224, + "grad_norm": 0.5171735286712646, + "learning_rate": 7.62714907528989e-05, + "loss": 1.5725, + "step": 6015 + }, + { + "epoch": 0.3353213310294855, + "grad_norm": 0.585667610168457, + "learning_rate": 7.626391428312035e-05, + "loss": 1.8119, + "step": 6016 + }, + { + "epoch": 0.33537706928264865, + "grad_norm": 0.504396378993988, + "learning_rate": 7.625633698040357e-05, + "loss": 1.4209, + "step": 6017 + }, + { + "epoch": 0.33543280753581184, + "grad_norm": 0.5608323216438293, + "learning_rate": 7.624875884498886e-05, + "loss": 1.8436, + "step": 6018 + }, + { + "epoch": 0.33548854578897497, + "grad_norm": 0.5625400543212891, + "learning_rate": 7.624117987711656e-05, + "loss": 1.836, + "step": 6019 + }, + { + "epoch": 0.3355442840421381, + "grad_norm": 0.6377468109130859, + "learning_rate": 7.623360007702702e-05, + "loss": 1.7539, + "step": 6020 + }, + { + "epoch": 0.3356000222953013, + "grad_norm": 0.556115984916687, + "learning_rate": 7.622601944496064e-05, + "loss": 1.6686, + "step": 6021 + }, + { + "epoch": 0.3356557605484644, + "grad_norm": 0.49739575386047363, + "learning_rate": 7.621843798115785e-05, + "loss": 1.5361, + "step": 6022 + }, + { + "epoch": 0.33571149880162754, + "grad_norm": 0.5968783497810364, + "learning_rate": 7.621085568585905e-05, + "loss": 1.8225, + "step": 6023 + }, + { + "epoch": 0.3357672370547907, + "grad_norm": 0.575768232345581, + "learning_rate": 7.620327255930474e-05, + "loss": 1.908, + "step": 6024 + }, + { + "epoch": 0.33582297530795385, + "grad_norm": 0.5628235340118408, + "learning_rate": 7.61956886017354e-05, + "loss": 1.6388, + "step": 6025 + }, + { + "epoch": 0.335878713561117, + "grad_norm": 0.5842387676239014, + "learning_rate": 7.618810381339155e-05, + "loss": 1.8774, + "step": 6026 + }, + { + "epoch": 0.33593445181428017, + "grad_norm": 0.5307137370109558, + "learning_rate": 7.618051819451373e-05, + "loss": 1.6372, + "step": 6027 + }, + { + "epoch": 0.3359901900674433, + "grad_norm": 0.5524066090583801, + "learning_rate": 7.617293174534253e-05, + "loss": 1.7415, + "step": 6028 + }, + { + "epoch": 0.3360459283206064, + "grad_norm": 0.5315592885017395, + "learning_rate": 7.616534446611851e-05, + "loss": 1.6005, + "step": 6029 + }, + { + "epoch": 0.33610166657376955, + "grad_norm": 0.5379803776741028, + "learning_rate": 7.615775635708234e-05, + "loss": 1.6998, + "step": 6030 + }, + { + "epoch": 0.33615740482693274, + "grad_norm": 0.593471884727478, + "learning_rate": 7.615016741847463e-05, + "loss": 1.6948, + "step": 6031 + }, + { + "epoch": 0.33621314308009587, + "grad_norm": 0.5759322643280029, + "learning_rate": 7.614257765053609e-05, + "loss": 1.5575, + "step": 6032 + }, + { + "epoch": 0.336268881333259, + "grad_norm": 0.5627144575119019, + "learning_rate": 7.61349870535074e-05, + "loss": 1.7633, + "step": 6033 + }, + { + "epoch": 0.3363246195864222, + "grad_norm": 0.5872805714607239, + "learning_rate": 7.612739562762929e-05, + "loss": 1.8196, + "step": 6034 + }, + { + "epoch": 0.3363803578395853, + "grad_norm": 0.5651592016220093, + "learning_rate": 7.611980337314254e-05, + "loss": 1.7916, + "step": 6035 + }, + { + "epoch": 0.33643609609274844, + "grad_norm": 0.5263227820396423, + "learning_rate": 7.61122102902879e-05, + "loss": 1.6909, + "step": 6036 + }, + { + "epoch": 0.3364918343459116, + "grad_norm": 0.5474349856376648, + "learning_rate": 7.610461637930621e-05, + "loss": 1.7166, + "step": 6037 + }, + { + "epoch": 0.33654757259907475, + "grad_norm": 0.5443328022956848, + "learning_rate": 7.609702164043829e-05, + "loss": 1.6479, + "step": 6038 + }, + { + "epoch": 0.3366033108522379, + "grad_norm": 0.5788392424583435, + "learning_rate": 7.6089426073925e-05, + "loss": 1.7645, + "step": 6039 + }, + { + "epoch": 0.336659049105401, + "grad_norm": 0.5407717823982239, + "learning_rate": 7.608182968000721e-05, + "loss": 1.7543, + "step": 6040 + }, + { + "epoch": 0.3367147873585642, + "grad_norm": 0.5548073649406433, + "learning_rate": 7.607423245892586e-05, + "loss": 1.6023, + "step": 6041 + }, + { + "epoch": 0.3367705256117273, + "grad_norm": 0.5452112555503845, + "learning_rate": 7.606663441092188e-05, + "loss": 1.7298, + "step": 6042 + }, + { + "epoch": 0.33682626386489045, + "grad_norm": 0.5845810770988464, + "learning_rate": 7.605903553623625e-05, + "loss": 1.9093, + "step": 6043 + }, + { + "epoch": 0.33688200211805364, + "grad_norm": 0.5392171740531921, + "learning_rate": 7.605143583510991e-05, + "loss": 1.7111, + "step": 6044 + }, + { + "epoch": 0.33693774037121677, + "grad_norm": 0.51267009973526, + "learning_rate": 7.604383530778396e-05, + "loss": 1.5154, + "step": 6045 + }, + { + "epoch": 0.3369934786243799, + "grad_norm": 0.5741301774978638, + "learning_rate": 7.603623395449937e-05, + "loss": 1.7287, + "step": 6046 + }, + { + "epoch": 0.3370492168775431, + "grad_norm": 0.5356318354606628, + "learning_rate": 7.602863177549724e-05, + "loss": 1.7299, + "step": 6047 + }, + { + "epoch": 0.3371049551307062, + "grad_norm": 0.5820077061653137, + "learning_rate": 7.602102877101869e-05, + "loss": 1.8304, + "step": 6048 + }, + { + "epoch": 0.33716069338386934, + "grad_norm": 0.5404535531997681, + "learning_rate": 7.60134249413048e-05, + "loss": 1.5754, + "step": 6049 + }, + { + "epoch": 0.3372164316370325, + "grad_norm": 0.5398672819137573, + "learning_rate": 7.600582028659675e-05, + "loss": 1.7943, + "step": 6050 + }, + { + "epoch": 0.33727216989019565, + "grad_norm": 0.5376107692718506, + "learning_rate": 7.59982148071357e-05, + "loss": 1.4528, + "step": 6051 + }, + { + "epoch": 0.3373279081433588, + "grad_norm": 0.5899469256401062, + "learning_rate": 7.599060850316287e-05, + "loss": 1.7503, + "step": 6052 + }, + { + "epoch": 0.3373836463965219, + "grad_norm": 0.5668314695358276, + "learning_rate": 7.598300137491946e-05, + "loss": 1.7732, + "step": 6053 + }, + { + "epoch": 0.3374393846496851, + "grad_norm": 0.6154149174690247, + "learning_rate": 7.597539342264675e-05, + "loss": 1.6534, + "step": 6054 + }, + { + "epoch": 0.3374951229028482, + "grad_norm": 0.5487502813339233, + "learning_rate": 7.596778464658599e-05, + "loss": 1.6286, + "step": 6055 + }, + { + "epoch": 0.33755086115601135, + "grad_norm": 0.5876896977424622, + "learning_rate": 7.596017504697851e-05, + "loss": 1.7787, + "step": 6056 + }, + { + "epoch": 0.33760659940917453, + "grad_norm": 0.5587677359580994, + "learning_rate": 7.595256462406564e-05, + "loss": 1.7862, + "step": 6057 + }, + { + "epoch": 0.33766233766233766, + "grad_norm": 0.5694131255149841, + "learning_rate": 7.594495337808873e-05, + "loss": 1.6926, + "step": 6058 + }, + { + "epoch": 0.3377180759155008, + "grad_norm": 0.5591508150100708, + "learning_rate": 7.593734130928918e-05, + "loss": 1.6135, + "step": 6059 + }, + { + "epoch": 0.337773814168664, + "grad_norm": 0.5355261564254761, + "learning_rate": 7.592972841790837e-05, + "loss": 1.5746, + "step": 6060 + }, + { + "epoch": 0.3378295524218271, + "grad_norm": 0.5518434047698975, + "learning_rate": 7.592211470418777e-05, + "loss": 1.6457, + "step": 6061 + }, + { + "epoch": 0.33788529067499024, + "grad_norm": 0.5891780257225037, + "learning_rate": 7.59145001683688e-05, + "loss": 1.7026, + "step": 6062 + }, + { + "epoch": 0.33794102892815336, + "grad_norm": 0.5723276734352112, + "learning_rate": 7.590688481069302e-05, + "loss": 1.8168, + "step": 6063 + }, + { + "epoch": 0.33799676718131655, + "grad_norm": 0.5468711853027344, + "learning_rate": 7.589926863140187e-05, + "loss": 1.607, + "step": 6064 + }, + { + "epoch": 0.3380525054344797, + "grad_norm": 0.6062466502189636, + "learning_rate": 7.589165163073695e-05, + "loss": 1.9372, + "step": 6065 + }, + { + "epoch": 0.3381082436876428, + "grad_norm": 0.5140287280082703, + "learning_rate": 7.588403380893979e-05, + "loss": 1.6545, + "step": 6066 + }, + { + "epoch": 0.338163981940806, + "grad_norm": 0.5543786287307739, + "learning_rate": 7.587641516625197e-05, + "loss": 1.8205, + "step": 6067 + }, + { + "epoch": 0.3382197201939691, + "grad_norm": 0.5844648480415344, + "learning_rate": 7.586879570291514e-05, + "loss": 1.8597, + "step": 6068 + }, + { + "epoch": 0.33827545844713225, + "grad_norm": 0.5109902024269104, + "learning_rate": 7.586117541917095e-05, + "loss": 1.5266, + "step": 6069 + }, + { + "epoch": 0.33833119670029543, + "grad_norm": 0.5208814740180969, + "learning_rate": 7.585355431526104e-05, + "loss": 1.721, + "step": 6070 + }, + { + "epoch": 0.33838693495345856, + "grad_norm": 0.5144614577293396, + "learning_rate": 7.584593239142712e-05, + "loss": 1.624, + "step": 6071 + }, + { + "epoch": 0.3384426732066217, + "grad_norm": 0.5855271220207214, + "learning_rate": 7.583830964791094e-05, + "loss": 1.8765, + "step": 6072 + }, + { + "epoch": 0.3384984114597849, + "grad_norm": 0.5410987138748169, + "learning_rate": 7.58306860849542e-05, + "loss": 1.6027, + "step": 6073 + }, + { + "epoch": 0.338554149712948, + "grad_norm": 0.6230753064155579, + "learning_rate": 7.582306170279872e-05, + "loss": 1.8485, + "step": 6074 + }, + { + "epoch": 0.33860988796611113, + "grad_norm": 0.5517315864562988, + "learning_rate": 7.581543650168628e-05, + "loss": 1.7822, + "step": 6075 + }, + { + "epoch": 0.33866562621927426, + "grad_norm": 0.5739060044288635, + "learning_rate": 7.580781048185871e-05, + "loss": 1.6443, + "step": 6076 + }, + { + "epoch": 0.33872136447243745, + "grad_norm": 0.5618791580200195, + "learning_rate": 7.580018364355785e-05, + "loss": 1.5943, + "step": 6077 + }, + { + "epoch": 0.3387771027256006, + "grad_norm": 0.5723870396614075, + "learning_rate": 7.579255598702562e-05, + "loss": 1.4501, + "step": 6078 + }, + { + "epoch": 0.3388328409787637, + "grad_norm": 0.5427421927452087, + "learning_rate": 7.578492751250386e-05, + "loss": 1.7001, + "step": 6079 + }, + { + "epoch": 0.3388885792319269, + "grad_norm": 0.5765356421470642, + "learning_rate": 7.577729822023455e-05, + "loss": 1.6652, + "step": 6080 + }, + { + "epoch": 0.33894431748509, + "grad_norm": 0.5492302179336548, + "learning_rate": 7.576966811045963e-05, + "loss": 1.6988, + "step": 6081 + }, + { + "epoch": 0.33900005573825315, + "grad_norm": 0.5814895033836365, + "learning_rate": 7.576203718342108e-05, + "loss": 1.9584, + "step": 6082 + }, + { + "epoch": 0.33905579399141633, + "grad_norm": 0.6068232655525208, + "learning_rate": 7.575440543936092e-05, + "loss": 2.0357, + "step": 6083 + }, + { + "epoch": 0.33911153224457946, + "grad_norm": 0.5426899790763855, + "learning_rate": 7.574677287852117e-05, + "loss": 1.6323, + "step": 6084 + }, + { + "epoch": 0.3391672704977426, + "grad_norm": 0.5811708569526672, + "learning_rate": 7.573913950114391e-05, + "loss": 1.538, + "step": 6085 + }, + { + "epoch": 0.3392230087509057, + "grad_norm": 0.5753393769264221, + "learning_rate": 7.573150530747122e-05, + "loss": 1.6013, + "step": 6086 + }, + { + "epoch": 0.3392787470040689, + "grad_norm": 0.5427485108375549, + "learning_rate": 7.572387029774519e-05, + "loss": 1.6444, + "step": 6087 + }, + { + "epoch": 0.33933448525723203, + "grad_norm": 0.5431930422782898, + "learning_rate": 7.571623447220797e-05, + "loss": 1.6733, + "step": 6088 + }, + { + "epoch": 0.33939022351039516, + "grad_norm": 0.555357813835144, + "learning_rate": 7.570859783110176e-05, + "loss": 1.7219, + "step": 6089 + }, + { + "epoch": 0.33944596176355835, + "grad_norm": 0.5578222274780273, + "learning_rate": 7.570096037466869e-05, + "loss": 1.407, + "step": 6090 + }, + { + "epoch": 0.3395017000167215, + "grad_norm": 0.5213090777397156, + "learning_rate": 7.5693322103151e-05, + "loss": 1.4608, + "step": 6091 + }, + { + "epoch": 0.3395574382698846, + "grad_norm": 0.5651876330375671, + "learning_rate": 7.568568301679096e-05, + "loss": 1.6756, + "step": 6092 + }, + { + "epoch": 0.3396131765230478, + "grad_norm": 0.5914562940597534, + "learning_rate": 7.56780431158308e-05, + "loss": 1.7648, + "step": 6093 + }, + { + "epoch": 0.3396689147762109, + "grad_norm": 0.5577222108840942, + "learning_rate": 7.567040240051281e-05, + "loss": 1.6954, + "step": 6094 + }, + { + "epoch": 0.33972465302937405, + "grad_norm": 0.5938786268234253, + "learning_rate": 7.566276087107935e-05, + "loss": 1.8131, + "step": 6095 + }, + { + "epoch": 0.33978039128253723, + "grad_norm": 0.5387003421783447, + "learning_rate": 7.565511852777274e-05, + "loss": 1.6522, + "step": 6096 + }, + { + "epoch": 0.33983612953570036, + "grad_norm": 0.5465493202209473, + "learning_rate": 7.564747537083534e-05, + "loss": 1.6971, + "step": 6097 + }, + { + "epoch": 0.3398918677888635, + "grad_norm": 0.5273247361183167, + "learning_rate": 7.563983140050955e-05, + "loss": 1.6759, + "step": 6098 + }, + { + "epoch": 0.3399476060420266, + "grad_norm": 0.5733767151832581, + "learning_rate": 7.563218661703782e-05, + "loss": 1.7203, + "step": 6099 + }, + { + "epoch": 0.3400033442951898, + "grad_norm": 0.6077031493186951, + "learning_rate": 7.562454102066255e-05, + "loss": 1.9364, + "step": 6100 + }, + { + "epoch": 0.34005908254835293, + "grad_norm": 0.5688176155090332, + "learning_rate": 7.561689461162625e-05, + "loss": 1.6623, + "step": 6101 + }, + { + "epoch": 0.34011482080151606, + "grad_norm": 0.5663187503814697, + "learning_rate": 7.56092473901714e-05, + "loss": 1.567, + "step": 6102 + }, + { + "epoch": 0.34017055905467924, + "grad_norm": 0.6150177121162415, + "learning_rate": 7.560159935654056e-05, + "loss": 1.8714, + "step": 6103 + }, + { + "epoch": 0.3402262973078424, + "grad_norm": 0.5515531301498413, + "learning_rate": 7.559395051097624e-05, + "loss": 1.6713, + "step": 6104 + }, + { + "epoch": 0.3402820355610055, + "grad_norm": 0.687240481376648, + "learning_rate": 7.558630085372105e-05, + "loss": 1.6552, + "step": 6105 + }, + { + "epoch": 0.3403377738141687, + "grad_norm": 0.5493181943893433, + "learning_rate": 7.557865038501756e-05, + "loss": 1.65, + "step": 6106 + }, + { + "epoch": 0.3403935120673318, + "grad_norm": 0.5683436989784241, + "learning_rate": 7.55709991051084e-05, + "loss": 1.8507, + "step": 6107 + }, + { + "epoch": 0.34044925032049494, + "grad_norm": 0.5895001292228699, + "learning_rate": 7.556334701423627e-05, + "loss": 2.0143, + "step": 6108 + }, + { + "epoch": 0.3405049885736581, + "grad_norm": 0.5967059135437012, + "learning_rate": 7.555569411264378e-05, + "loss": 1.9006, + "step": 6109 + }, + { + "epoch": 0.34056072682682126, + "grad_norm": 0.5140407085418701, + "learning_rate": 7.554804040057369e-05, + "loss": 1.4028, + "step": 6110 + }, + { + "epoch": 0.3406164650799844, + "grad_norm": 0.5586955547332764, + "learning_rate": 7.554038587826872e-05, + "loss": 1.6835, + "step": 6111 + }, + { + "epoch": 0.3406722033331475, + "grad_norm": 0.4853399395942688, + "learning_rate": 7.553273054597163e-05, + "loss": 1.5901, + "step": 6112 + }, + { + "epoch": 0.3407279415863107, + "grad_norm": 0.5674946308135986, + "learning_rate": 7.552507440392518e-05, + "loss": 1.8776, + "step": 6113 + }, + { + "epoch": 0.34078367983947383, + "grad_norm": 0.5115534663200378, + "learning_rate": 7.551741745237218e-05, + "loss": 1.4647, + "step": 6114 + }, + { + "epoch": 0.34083941809263696, + "grad_norm": 0.6239203214645386, + "learning_rate": 7.55097596915555e-05, + "loss": 1.8638, + "step": 6115 + }, + { + "epoch": 0.34089515634580014, + "grad_norm": 0.5367839336395264, + "learning_rate": 7.550210112171796e-05, + "loss": 1.7598, + "step": 6116 + }, + { + "epoch": 0.34095089459896327, + "grad_norm": 0.5434908270835876, + "learning_rate": 7.549444174310246e-05, + "loss": 1.8239, + "step": 6117 + }, + { + "epoch": 0.3410066328521264, + "grad_norm": 0.5503940582275391, + "learning_rate": 7.548678155595192e-05, + "loss": 1.7103, + "step": 6118 + }, + { + "epoch": 0.3410623711052896, + "grad_norm": 0.5601882338523865, + "learning_rate": 7.547912056050925e-05, + "loss": 1.8269, + "step": 6119 + }, + { + "epoch": 0.3411181093584527, + "grad_norm": 0.5472147464752197, + "learning_rate": 7.547145875701744e-05, + "loss": 1.7221, + "step": 6120 + }, + { + "epoch": 0.34117384761161584, + "grad_norm": 0.5327697396278381, + "learning_rate": 7.546379614571947e-05, + "loss": 1.6879, + "step": 6121 + }, + { + "epoch": 0.341229585864779, + "grad_norm": 0.5991697311401367, + "learning_rate": 7.545613272685834e-05, + "loss": 1.9402, + "step": 6122 + }, + { + "epoch": 0.34128532411794216, + "grad_norm": 0.5222532749176025, + "learning_rate": 7.544846850067711e-05, + "loss": 1.6331, + "step": 6123 + }, + { + "epoch": 0.3413410623711053, + "grad_norm": 0.5213292837142944, + "learning_rate": 7.544080346741884e-05, + "loss": 1.6547, + "step": 6124 + }, + { + "epoch": 0.3413968006242684, + "grad_norm": 0.516547441482544, + "learning_rate": 7.54331376273266e-05, + "loss": 1.5988, + "step": 6125 + }, + { + "epoch": 0.3414525388774316, + "grad_norm": 0.5505926609039307, + "learning_rate": 7.542547098064351e-05, + "loss": 1.8314, + "step": 6126 + }, + { + "epoch": 0.34150827713059473, + "grad_norm": 0.5631290078163147, + "learning_rate": 7.541780352761275e-05, + "loss": 1.7797, + "step": 6127 + }, + { + "epoch": 0.34156401538375786, + "grad_norm": 0.5578431487083435, + "learning_rate": 7.541013526847745e-05, + "loss": 1.7118, + "step": 6128 + }, + { + "epoch": 0.34161975363692104, + "grad_norm": 0.6077129244804382, + "learning_rate": 7.540246620348079e-05, + "loss": 1.8582, + "step": 6129 + }, + { + "epoch": 0.34167549189008417, + "grad_norm": 0.5378260612487793, + "learning_rate": 7.539479633286604e-05, + "loss": 1.5773, + "step": 6130 + }, + { + "epoch": 0.3417312301432473, + "grad_norm": 0.5147218108177185, + "learning_rate": 7.538712565687637e-05, + "loss": 1.6079, + "step": 6131 + }, + { + "epoch": 0.34178696839641043, + "grad_norm": 0.5637179017066956, + "learning_rate": 7.537945417575513e-05, + "loss": 1.7772, + "step": 6132 + }, + { + "epoch": 0.3418427066495736, + "grad_norm": 0.5718836188316345, + "learning_rate": 7.537178188974556e-05, + "loss": 1.8646, + "step": 6133 + }, + { + "epoch": 0.34189844490273674, + "grad_norm": 0.5593611001968384, + "learning_rate": 7.5364108799091e-05, + "loss": 1.7059, + "step": 6134 + }, + { + "epoch": 0.34195418315589987, + "grad_norm": 0.5491702556610107, + "learning_rate": 7.535643490403478e-05, + "loss": 1.5904, + "step": 6135 + }, + { + "epoch": 0.34200992140906306, + "grad_norm": 0.5673286318778992, + "learning_rate": 7.534876020482032e-05, + "loss": 1.6569, + "step": 6136 + }, + { + "epoch": 0.3420656596622262, + "grad_norm": 0.555279552936554, + "learning_rate": 7.534108470169094e-05, + "loss": 1.947, + "step": 6137 + }, + { + "epoch": 0.3421213979153893, + "grad_norm": 0.5502607226371765, + "learning_rate": 7.533340839489011e-05, + "loss": 1.6199, + "step": 6138 + }, + { + "epoch": 0.3421771361685525, + "grad_norm": 0.5711556673049927, + "learning_rate": 7.532573128466129e-05, + "loss": 1.901, + "step": 6139 + }, + { + "epoch": 0.3422328744217156, + "grad_norm": 0.5685670375823975, + "learning_rate": 7.53180533712479e-05, + "loss": 1.7284, + "step": 6140 + }, + { + "epoch": 0.34228861267487876, + "grad_norm": 0.555075466632843, + "learning_rate": 7.53103746548935e-05, + "loss": 1.8184, + "step": 6141 + }, + { + "epoch": 0.34234435092804194, + "grad_norm": 0.5404545664787292, + "learning_rate": 7.530269513584158e-05, + "loss": 1.6444, + "step": 6142 + }, + { + "epoch": 0.34240008918120507, + "grad_norm": 0.5739527344703674, + "learning_rate": 7.52950148143357e-05, + "loss": 1.5748, + "step": 6143 + }, + { + "epoch": 0.3424558274343682, + "grad_norm": 0.5569913983345032, + "learning_rate": 7.528733369061942e-05, + "loss": 1.8188, + "step": 6144 + }, + { + "epoch": 0.3425115656875313, + "grad_norm": 0.5430577397346497, + "learning_rate": 7.527965176493636e-05, + "loss": 1.5839, + "step": 6145 + }, + { + "epoch": 0.3425673039406945, + "grad_norm": 0.5321673154830933, + "learning_rate": 7.527196903753011e-05, + "loss": 1.3862, + "step": 6146 + }, + { + "epoch": 0.34262304219385764, + "grad_norm": 0.5757884979248047, + "learning_rate": 7.526428550864437e-05, + "loss": 1.5308, + "step": 6147 + }, + { + "epoch": 0.34267878044702077, + "grad_norm": 0.556651771068573, + "learning_rate": 7.525660117852279e-05, + "loss": 1.7377, + "step": 6148 + }, + { + "epoch": 0.34273451870018395, + "grad_norm": 0.5236818790435791, + "learning_rate": 7.524891604740908e-05, + "loss": 1.7305, + "step": 6149 + }, + { + "epoch": 0.3427902569533471, + "grad_norm": 0.5686874985694885, + "learning_rate": 7.524123011554697e-05, + "loss": 1.5379, + "step": 6150 + }, + { + "epoch": 0.3428459952065102, + "grad_norm": 0.5817770957946777, + "learning_rate": 7.52335433831802e-05, + "loss": 1.7069, + "step": 6151 + }, + { + "epoch": 0.3429017334596734, + "grad_norm": 0.5717275738716125, + "learning_rate": 7.522585585055255e-05, + "loss": 1.8944, + "step": 6152 + }, + { + "epoch": 0.3429574717128365, + "grad_norm": 0.5469644665718079, + "learning_rate": 7.521816751790783e-05, + "loss": 1.622, + "step": 6153 + }, + { + "epoch": 0.34301320996599965, + "grad_norm": 0.5735164880752563, + "learning_rate": 7.521047838548988e-05, + "loss": 1.8005, + "step": 6154 + }, + { + "epoch": 0.3430689482191628, + "grad_norm": 0.5070759057998657, + "learning_rate": 7.520278845354254e-05, + "loss": 1.4795, + "step": 6155 + }, + { + "epoch": 0.34312468647232597, + "grad_norm": 0.5179046392440796, + "learning_rate": 7.519509772230968e-05, + "loss": 1.5029, + "step": 6156 + }, + { + "epoch": 0.3431804247254891, + "grad_norm": 0.5747403502464294, + "learning_rate": 7.518740619203523e-05, + "loss": 1.7075, + "step": 6157 + }, + { + "epoch": 0.3432361629786522, + "grad_norm": 0.6233847141265869, + "learning_rate": 7.517971386296309e-05, + "loss": 1.9524, + "step": 6158 + }, + { + "epoch": 0.3432919012318154, + "grad_norm": 0.5195590853691101, + "learning_rate": 7.517202073533727e-05, + "loss": 1.533, + "step": 6159 + }, + { + "epoch": 0.34334763948497854, + "grad_norm": 0.6035041213035583, + "learning_rate": 7.516432680940168e-05, + "loss": 1.7298, + "step": 6160 + }, + { + "epoch": 0.34340337773814167, + "grad_norm": 0.59979248046875, + "learning_rate": 7.515663208540037e-05, + "loss": 1.7295, + "step": 6161 + }, + { + "epoch": 0.34345911599130485, + "grad_norm": 0.5844981074333191, + "learning_rate": 7.514893656357738e-05, + "loss": 1.756, + "step": 6162 + }, + { + "epoch": 0.343514854244468, + "grad_norm": 0.5281308889389038, + "learning_rate": 7.514124024417674e-05, + "loss": 1.7149, + "step": 6163 + }, + { + "epoch": 0.3435705924976311, + "grad_norm": 0.5352674126625061, + "learning_rate": 7.513354312744256e-05, + "loss": 1.7262, + "step": 6164 + }, + { + "epoch": 0.3436263307507943, + "grad_norm": 0.562127411365509, + "learning_rate": 7.512584521361891e-05, + "loss": 1.6434, + "step": 6165 + }, + { + "epoch": 0.3436820690039574, + "grad_norm": 0.5535931587219238, + "learning_rate": 7.511814650294994e-05, + "loss": 1.5353, + "step": 6166 + }, + { + "epoch": 0.34373780725712055, + "grad_norm": 0.543641209602356, + "learning_rate": 7.511044699567981e-05, + "loss": 1.8312, + "step": 6167 + }, + { + "epoch": 0.3437935455102837, + "grad_norm": 0.559559166431427, + "learning_rate": 7.510274669205273e-05, + "loss": 1.6326, + "step": 6168 + }, + { + "epoch": 0.34384928376344687, + "grad_norm": 0.5449449419975281, + "learning_rate": 7.509504559231287e-05, + "loss": 1.7319, + "step": 6169 + }, + { + "epoch": 0.34390502201661, + "grad_norm": 0.5315961837768555, + "learning_rate": 7.508734369670447e-05, + "loss": 1.69, + "step": 6170 + }, + { + "epoch": 0.3439607602697731, + "grad_norm": 0.5506524443626404, + "learning_rate": 7.507964100547181e-05, + "loss": 1.6961, + "step": 6171 + }, + { + "epoch": 0.3440164985229363, + "grad_norm": 0.5587935447692871, + "learning_rate": 7.507193751885915e-05, + "loss": 1.794, + "step": 6172 + }, + { + "epoch": 0.34407223677609944, + "grad_norm": 0.5281456112861633, + "learning_rate": 7.506423323711083e-05, + "loss": 1.637, + "step": 6173 + }, + { + "epoch": 0.34412797502926257, + "grad_norm": 0.5220721960067749, + "learning_rate": 7.505652816047115e-05, + "loss": 1.4696, + "step": 6174 + }, + { + "epoch": 0.34418371328242575, + "grad_norm": 0.565938413143158, + "learning_rate": 7.504882228918449e-05, + "loss": 1.6329, + "step": 6175 + }, + { + "epoch": 0.3442394515355889, + "grad_norm": 0.532490074634552, + "learning_rate": 7.504111562349524e-05, + "loss": 1.5929, + "step": 6176 + }, + { + "epoch": 0.344295189788752, + "grad_norm": 0.5559155941009521, + "learning_rate": 7.503340816364779e-05, + "loss": 1.6935, + "step": 6177 + }, + { + "epoch": 0.3443509280419152, + "grad_norm": 0.5494531989097595, + "learning_rate": 7.502569990988659e-05, + "loss": 1.5508, + "step": 6178 + }, + { + "epoch": 0.3444066662950783, + "grad_norm": 0.48615095019340515, + "learning_rate": 7.50179908624561e-05, + "loss": 1.3464, + "step": 6179 + }, + { + "epoch": 0.34446240454824145, + "grad_norm": 0.543402373790741, + "learning_rate": 7.501028102160082e-05, + "loss": 1.6306, + "step": 6180 + }, + { + "epoch": 0.3445181428014046, + "grad_norm": 0.5688214898109436, + "learning_rate": 7.500257038756522e-05, + "loss": 1.9743, + "step": 6181 + }, + { + "epoch": 0.34457388105456777, + "grad_norm": 0.5336653590202332, + "learning_rate": 7.499485896059389e-05, + "loss": 1.7876, + "step": 6182 + }, + { + "epoch": 0.3446296193077309, + "grad_norm": 0.6009781360626221, + "learning_rate": 7.498714674093134e-05, + "loss": 1.599, + "step": 6183 + }, + { + "epoch": 0.344685357560894, + "grad_norm": 0.5108974575996399, + "learning_rate": 7.497943372882219e-05, + "loss": 1.3671, + "step": 6184 + }, + { + "epoch": 0.3447410958140572, + "grad_norm": 0.5875006914138794, + "learning_rate": 7.497171992451104e-05, + "loss": 1.8846, + "step": 6185 + }, + { + "epoch": 0.34479683406722034, + "grad_norm": 0.5741475820541382, + "learning_rate": 7.496400532824252e-05, + "loss": 1.8147, + "step": 6186 + }, + { + "epoch": 0.34485257232038347, + "grad_norm": 0.5426183938980103, + "learning_rate": 7.495628994026131e-05, + "loss": 1.8584, + "step": 6187 + }, + { + "epoch": 0.34490831057354665, + "grad_norm": 0.5665351152420044, + "learning_rate": 7.49485737608121e-05, + "loss": 1.6254, + "step": 6188 + }, + { + "epoch": 0.3449640488267098, + "grad_norm": 0.6417822241783142, + "learning_rate": 7.494085679013959e-05, + "loss": 1.5997, + "step": 6189 + }, + { + "epoch": 0.3450197870798729, + "grad_norm": 0.580936849117279, + "learning_rate": 7.49331390284885e-05, + "loss": 1.7723, + "step": 6190 + }, + { + "epoch": 0.34507552533303604, + "grad_norm": 0.5405949354171753, + "learning_rate": 7.492542047610362e-05, + "loss": 1.7536, + "step": 6191 + }, + { + "epoch": 0.3451312635861992, + "grad_norm": 0.567459225654602, + "learning_rate": 7.491770113322972e-05, + "loss": 1.5518, + "step": 6192 + }, + { + "epoch": 0.34518700183936235, + "grad_norm": 0.5930157899856567, + "learning_rate": 7.490998100011164e-05, + "loss": 1.8805, + "step": 6193 + }, + { + "epoch": 0.3452427400925255, + "grad_norm": 0.5590851902961731, + "learning_rate": 7.490226007699418e-05, + "loss": 1.7369, + "step": 6194 + }, + { + "epoch": 0.34529847834568866, + "grad_norm": 0.5540249943733215, + "learning_rate": 7.489453836412224e-05, + "loss": 1.7199, + "step": 6195 + }, + { + "epoch": 0.3453542165988518, + "grad_norm": 0.6100202798843384, + "learning_rate": 7.488681586174066e-05, + "loss": 1.8962, + "step": 6196 + }, + { + "epoch": 0.3454099548520149, + "grad_norm": 0.5453261137008667, + "learning_rate": 7.48790925700944e-05, + "loss": 1.6779, + "step": 6197 + }, + { + "epoch": 0.3454656931051781, + "grad_norm": 0.6191526651382446, + "learning_rate": 7.487136848942838e-05, + "loss": 1.837, + "step": 6198 + }, + { + "epoch": 0.34552143135834124, + "grad_norm": 0.5043689608573914, + "learning_rate": 7.486364361998754e-05, + "loss": 1.5438, + "step": 6199 + }, + { + "epoch": 0.34557716961150436, + "grad_norm": 0.5927308797836304, + "learning_rate": 7.485591796201692e-05, + "loss": 1.8893, + "step": 6200 + }, + { + "epoch": 0.34563290786466755, + "grad_norm": 0.5387723445892334, + "learning_rate": 7.484819151576147e-05, + "loss": 1.7063, + "step": 6201 + }, + { + "epoch": 0.3456886461178307, + "grad_norm": 0.5273063778877258, + "learning_rate": 7.48404642814663e-05, + "loss": 1.6052, + "step": 6202 + }, + { + "epoch": 0.3457443843709938, + "grad_norm": 0.5235535502433777, + "learning_rate": 7.48327362593764e-05, + "loss": 1.5859, + "step": 6203 + }, + { + "epoch": 0.34580012262415694, + "grad_norm": 0.5952630043029785, + "learning_rate": 7.48250074497369e-05, + "loss": 1.9669, + "step": 6204 + }, + { + "epoch": 0.3458558608773201, + "grad_norm": 0.5512803196907043, + "learning_rate": 7.48172778527929e-05, + "loss": 1.6103, + "step": 6205 + }, + { + "epoch": 0.34591159913048325, + "grad_norm": 0.5485497117042542, + "learning_rate": 7.480954746878955e-05, + "loss": 1.4648, + "step": 6206 + }, + { + "epoch": 0.3459673373836464, + "grad_norm": 0.5755242109298706, + "learning_rate": 7.480181629797201e-05, + "loss": 1.7882, + "step": 6207 + }, + { + "epoch": 0.34602307563680956, + "grad_norm": 0.586279034614563, + "learning_rate": 7.479408434058545e-05, + "loss": 1.757, + "step": 6208 + }, + { + "epoch": 0.3460788138899727, + "grad_norm": 0.6023716926574707, + "learning_rate": 7.47863515968751e-05, + "loss": 1.6573, + "step": 6209 + }, + { + "epoch": 0.3461345521431358, + "grad_norm": 0.5629722476005554, + "learning_rate": 7.477861806708618e-05, + "loss": 1.8348, + "step": 6210 + }, + { + "epoch": 0.346190290396299, + "grad_norm": 0.64363032579422, + "learning_rate": 7.477088375146397e-05, + "loss": 2.1581, + "step": 6211 + }, + { + "epoch": 0.34624602864946213, + "grad_norm": 0.5952073335647583, + "learning_rate": 7.476314865025376e-05, + "loss": 1.7823, + "step": 6212 + }, + { + "epoch": 0.34630176690262526, + "grad_norm": 0.5444992780685425, + "learning_rate": 7.475541276370083e-05, + "loss": 1.5717, + "step": 6213 + }, + { + "epoch": 0.3463575051557884, + "grad_norm": 0.5698938965797424, + "learning_rate": 7.474767609205057e-05, + "loss": 1.8471, + "step": 6214 + }, + { + "epoch": 0.3464132434089516, + "grad_norm": 0.521270751953125, + "learning_rate": 7.473993863554832e-05, + "loss": 1.5991, + "step": 6215 + }, + { + "epoch": 0.3464689816621147, + "grad_norm": 0.5909140110015869, + "learning_rate": 7.473220039443942e-05, + "loss": 1.8795, + "step": 6216 + }, + { + "epoch": 0.34652471991527783, + "grad_norm": 0.5595431923866272, + "learning_rate": 7.472446136896935e-05, + "loss": 1.5189, + "step": 6217 + }, + { + "epoch": 0.346580458168441, + "grad_norm": 0.5549118518829346, + "learning_rate": 7.471672155938351e-05, + "loss": 1.5113, + "step": 6218 + }, + { + "epoch": 0.34663619642160415, + "grad_norm": 0.5784697532653809, + "learning_rate": 7.470898096592738e-05, + "loss": 1.62, + "step": 6219 + }, + { + "epoch": 0.3466919346747673, + "grad_norm": 0.582065224647522, + "learning_rate": 7.470123958884643e-05, + "loss": 1.7652, + "step": 6220 + }, + { + "epoch": 0.34674767292793046, + "grad_norm": 0.5781643986701965, + "learning_rate": 7.469349742838619e-05, + "loss": 1.816, + "step": 6221 + }, + { + "epoch": 0.3468034111810936, + "grad_norm": 0.5270411968231201, + "learning_rate": 7.468575448479217e-05, + "loss": 1.4521, + "step": 6222 + }, + { + "epoch": 0.3468591494342567, + "grad_norm": 0.5568832159042358, + "learning_rate": 7.467801075830995e-05, + "loss": 1.6393, + "step": 6223 + }, + { + "epoch": 0.3469148876874199, + "grad_norm": 0.6102818846702576, + "learning_rate": 7.467026624918511e-05, + "loss": 1.8486, + "step": 6224 + }, + { + "epoch": 0.34697062594058303, + "grad_norm": 0.6040059328079224, + "learning_rate": 7.466252095766326e-05, + "loss": 1.9639, + "step": 6225 + }, + { + "epoch": 0.34702636419374616, + "grad_norm": 0.5577713847160339, + "learning_rate": 7.465477488399004e-05, + "loss": 1.7672, + "step": 6226 + }, + { + "epoch": 0.3470821024469093, + "grad_norm": 0.6022251844406128, + "learning_rate": 7.464702802841111e-05, + "loss": 1.8587, + "step": 6227 + }, + { + "epoch": 0.3471378407000725, + "grad_norm": 0.6043629050254822, + "learning_rate": 7.463928039117216e-05, + "loss": 1.6798, + "step": 6228 + }, + { + "epoch": 0.3471935789532356, + "grad_norm": 0.5550456643104553, + "learning_rate": 7.463153197251889e-05, + "loss": 1.6258, + "step": 6229 + }, + { + "epoch": 0.34724931720639873, + "grad_norm": 0.5740575790405273, + "learning_rate": 7.462378277269704e-05, + "loss": 1.6253, + "step": 6230 + }, + { + "epoch": 0.3473050554595619, + "grad_norm": 0.5348698496818542, + "learning_rate": 7.461603279195235e-05, + "loss": 1.7417, + "step": 6231 + }, + { + "epoch": 0.34736079371272505, + "grad_norm": 0.5703982710838318, + "learning_rate": 7.460828203053063e-05, + "loss": 1.8448, + "step": 6232 + }, + { + "epoch": 0.3474165319658882, + "grad_norm": 0.5818899869918823, + "learning_rate": 7.460053048867768e-05, + "loss": 1.783, + "step": 6233 + }, + { + "epoch": 0.34747227021905136, + "grad_norm": 0.5640279054641724, + "learning_rate": 7.459277816663934e-05, + "loss": 1.8757, + "step": 6234 + }, + { + "epoch": 0.3475280084722145, + "grad_norm": 0.519883394241333, + "learning_rate": 7.458502506466147e-05, + "loss": 1.622, + "step": 6235 + }, + { + "epoch": 0.3475837467253776, + "grad_norm": 0.5207779407501221, + "learning_rate": 7.457727118298991e-05, + "loss": 1.4801, + "step": 6236 + }, + { + "epoch": 0.34763948497854075, + "grad_norm": 0.5227778553962708, + "learning_rate": 7.456951652187063e-05, + "loss": 1.6797, + "step": 6237 + }, + { + "epoch": 0.34769522323170393, + "grad_norm": 0.6305186748504639, + "learning_rate": 7.456176108154956e-05, + "loss": 2.0804, + "step": 6238 + }, + { + "epoch": 0.34775096148486706, + "grad_norm": 0.6344568133354187, + "learning_rate": 7.45540048622726e-05, + "loss": 1.881, + "step": 6239 + }, + { + "epoch": 0.3478066997380302, + "grad_norm": 0.5849176645278931, + "learning_rate": 7.454624786428576e-05, + "loss": 1.7058, + "step": 6240 + }, + { + "epoch": 0.3478624379911934, + "grad_norm": 0.5511870980262756, + "learning_rate": 7.453849008783507e-05, + "loss": 1.7262, + "step": 6241 + }, + { + "epoch": 0.3479181762443565, + "grad_norm": 0.590895414352417, + "learning_rate": 7.453073153316654e-05, + "loss": 1.7584, + "step": 6242 + }, + { + "epoch": 0.34797391449751963, + "grad_norm": 0.5347367525100708, + "learning_rate": 7.452297220052624e-05, + "loss": 1.7057, + "step": 6243 + }, + { + "epoch": 0.3480296527506828, + "grad_norm": 0.5574136972427368, + "learning_rate": 7.451521209016021e-05, + "loss": 1.8928, + "step": 6244 + }, + { + "epoch": 0.34808539100384595, + "grad_norm": 0.5794700384140015, + "learning_rate": 7.450745120231462e-05, + "loss": 1.9479, + "step": 6245 + }, + { + "epoch": 0.3481411292570091, + "grad_norm": 0.5384243726730347, + "learning_rate": 7.449968953723554e-05, + "loss": 1.678, + "step": 6246 + }, + { + "epoch": 0.34819686751017226, + "grad_norm": 0.560627281665802, + "learning_rate": 7.449192709516916e-05, + "loss": 1.7936, + "step": 6247 + }, + { + "epoch": 0.3482526057633354, + "grad_norm": 0.6408939957618713, + "learning_rate": 7.448416387636166e-05, + "loss": 1.8022, + "step": 6248 + }, + { + "epoch": 0.3483083440164985, + "grad_norm": 0.5532012581825256, + "learning_rate": 7.447639988105922e-05, + "loss": 1.6318, + "step": 6249 + }, + { + "epoch": 0.34836408226966165, + "grad_norm": 0.6528187990188599, + "learning_rate": 7.44686351095081e-05, + "loss": 2.0857, + "step": 6250 + }, + { + "epoch": 0.34841982052282483, + "grad_norm": 0.5271794199943542, + "learning_rate": 7.446086956195452e-05, + "loss": 1.6236, + "step": 6251 + }, + { + "epoch": 0.34847555877598796, + "grad_norm": 0.6053271293640137, + "learning_rate": 7.445310323864478e-05, + "loss": 1.895, + "step": 6252 + }, + { + "epoch": 0.3485312970291511, + "grad_norm": 0.5544027090072632, + "learning_rate": 7.444533613982519e-05, + "loss": 1.6158, + "step": 6253 + }, + { + "epoch": 0.3485870352823143, + "grad_norm": 0.5839915871620178, + "learning_rate": 7.443756826574204e-05, + "loss": 1.7887, + "step": 6254 + }, + { + "epoch": 0.3486427735354774, + "grad_norm": 0.5946133732795715, + "learning_rate": 7.442979961664171e-05, + "loss": 1.7628, + "step": 6255 + }, + { + "epoch": 0.34869851178864053, + "grad_norm": 0.5356269478797913, + "learning_rate": 7.442203019277059e-05, + "loss": 1.6563, + "step": 6256 + }, + { + "epoch": 0.3487542500418037, + "grad_norm": 0.5791853666305542, + "learning_rate": 7.441425999437505e-05, + "loss": 1.7944, + "step": 6257 + }, + { + "epoch": 0.34880998829496684, + "grad_norm": 0.514127254486084, + "learning_rate": 7.440648902170153e-05, + "loss": 1.6007, + "step": 6258 + }, + { + "epoch": 0.34886572654813, + "grad_norm": 0.5857915878295898, + "learning_rate": 7.439871727499648e-05, + "loss": 1.6401, + "step": 6259 + }, + { + "epoch": 0.3489214648012931, + "grad_norm": 0.5310158729553223, + "learning_rate": 7.439094475450638e-05, + "loss": 1.6605, + "step": 6260 + }, + { + "epoch": 0.3489772030544563, + "grad_norm": 0.5631361603736877, + "learning_rate": 7.43831714604777e-05, + "loss": 1.7541, + "step": 6261 + }, + { + "epoch": 0.3490329413076194, + "grad_norm": 0.5697758197784424, + "learning_rate": 7.4375397393157e-05, + "loss": 1.5488, + "step": 6262 + }, + { + "epoch": 0.34908867956078254, + "grad_norm": 0.5197820663452148, + "learning_rate": 7.43676225527908e-05, + "loss": 1.7463, + "step": 6263 + }, + { + "epoch": 0.34914441781394573, + "grad_norm": 0.6369295120239258, + "learning_rate": 7.43598469396257e-05, + "loss": 2.106, + "step": 6264 + }, + { + "epoch": 0.34920015606710886, + "grad_norm": 0.5751513242721558, + "learning_rate": 7.435207055390828e-05, + "loss": 1.8146, + "step": 6265 + }, + { + "epoch": 0.349255894320272, + "grad_norm": 0.5785645246505737, + "learning_rate": 7.434429339588516e-05, + "loss": 1.8598, + "step": 6266 + }, + { + "epoch": 0.34931163257343517, + "grad_norm": 0.5536054968833923, + "learning_rate": 7.4336515465803e-05, + "loss": 1.7508, + "step": 6267 + }, + { + "epoch": 0.3493673708265983, + "grad_norm": 0.5529542565345764, + "learning_rate": 7.432873676390845e-05, + "loss": 1.7749, + "step": 6268 + }, + { + "epoch": 0.34942310907976143, + "grad_norm": 0.5571187734603882, + "learning_rate": 7.432095729044823e-05, + "loss": 1.6954, + "step": 6269 + }, + { + "epoch": 0.3494788473329246, + "grad_norm": 0.5445393323898315, + "learning_rate": 7.431317704566902e-05, + "loss": 1.5363, + "step": 6270 + }, + { + "epoch": 0.34953458558608774, + "grad_norm": 0.5723183155059814, + "learning_rate": 7.430539602981761e-05, + "loss": 1.7007, + "step": 6271 + }, + { + "epoch": 0.34959032383925087, + "grad_norm": 0.5553802847862244, + "learning_rate": 7.429761424314075e-05, + "loss": 1.9324, + "step": 6272 + }, + { + "epoch": 0.349646062092414, + "grad_norm": 0.5308825969696045, + "learning_rate": 7.428983168588522e-05, + "loss": 1.6236, + "step": 6273 + }, + { + "epoch": 0.3497018003455772, + "grad_norm": 0.5892744064331055, + "learning_rate": 7.428204835829787e-05, + "loss": 1.8567, + "step": 6274 + }, + { + "epoch": 0.3497575385987403, + "grad_norm": 0.5890315175056458, + "learning_rate": 7.42742642606255e-05, + "loss": 1.7612, + "step": 6275 + }, + { + "epoch": 0.34981327685190344, + "grad_norm": 0.5714004635810852, + "learning_rate": 7.426647939311499e-05, + "loss": 1.8783, + "step": 6276 + }, + { + "epoch": 0.3498690151050666, + "grad_norm": 0.5221744775772095, + "learning_rate": 7.425869375601324e-05, + "loss": 1.533, + "step": 6277 + }, + { + "epoch": 0.34992475335822976, + "grad_norm": 0.5754460692405701, + "learning_rate": 7.425090734956717e-05, + "loss": 1.7922, + "step": 6278 + }, + { + "epoch": 0.3499804916113929, + "grad_norm": 0.5325612425804138, + "learning_rate": 7.424312017402371e-05, + "loss": 1.5523, + "step": 6279 + }, + { + "epoch": 0.35003622986455607, + "grad_norm": 0.5452947020530701, + "learning_rate": 7.423533222962984e-05, + "loss": 1.7528, + "step": 6280 + }, + { + "epoch": 0.3500919681177192, + "grad_norm": 0.5132524371147156, + "learning_rate": 7.422754351663252e-05, + "loss": 1.6118, + "step": 6281 + }, + { + "epoch": 0.35014770637088233, + "grad_norm": 0.5661509037017822, + "learning_rate": 7.421975403527877e-05, + "loss": 1.7999, + "step": 6282 + }, + { + "epoch": 0.35020344462404546, + "grad_norm": 0.5532317161560059, + "learning_rate": 7.421196378581563e-05, + "loss": 1.8317, + "step": 6283 + }, + { + "epoch": 0.35025918287720864, + "grad_norm": 0.5239238142967224, + "learning_rate": 7.420417276849018e-05, + "loss": 1.6949, + "step": 6284 + }, + { + "epoch": 0.35031492113037177, + "grad_norm": 0.5444215536117554, + "learning_rate": 7.419638098354948e-05, + "loss": 1.666, + "step": 6285 + }, + { + "epoch": 0.3503706593835349, + "grad_norm": 0.5257874131202698, + "learning_rate": 7.418858843124065e-05, + "loss": 1.7663, + "step": 6286 + }, + { + "epoch": 0.3504263976366981, + "grad_norm": 0.5424786806106567, + "learning_rate": 7.418079511181084e-05, + "loss": 1.6048, + "step": 6287 + }, + { + "epoch": 0.3504821358898612, + "grad_norm": 0.5822529196739197, + "learning_rate": 7.417300102550718e-05, + "loss": 1.7153, + "step": 6288 + }, + { + "epoch": 0.35053787414302434, + "grad_norm": 0.6322096586227417, + "learning_rate": 7.416520617257686e-05, + "loss": 2.0466, + "step": 6289 + }, + { + "epoch": 0.3505936123961875, + "grad_norm": 0.6034446358680725, + "learning_rate": 7.41574105532671e-05, + "loss": 1.7793, + "step": 6290 + }, + { + "epoch": 0.35064935064935066, + "grad_norm": 0.5261698365211487, + "learning_rate": 7.414961416782512e-05, + "loss": 1.6958, + "step": 6291 + }, + { + "epoch": 0.3507050889025138, + "grad_norm": 0.5508055090904236, + "learning_rate": 7.414181701649818e-05, + "loss": 1.7336, + "step": 6292 + }, + { + "epoch": 0.35076082715567697, + "grad_norm": 0.5106075406074524, + "learning_rate": 7.413401909953356e-05, + "loss": 1.5585, + "step": 6293 + }, + { + "epoch": 0.3508165654088401, + "grad_norm": 0.5312706232070923, + "learning_rate": 7.412622041717858e-05, + "loss": 1.5692, + "step": 6294 + }, + { + "epoch": 0.3508723036620032, + "grad_norm": 0.5598204135894775, + "learning_rate": 7.411842096968055e-05, + "loss": 1.6424, + "step": 6295 + }, + { + "epoch": 0.35092804191516636, + "grad_norm": 0.5455132126808167, + "learning_rate": 7.411062075728681e-05, + "loss": 1.7084, + "step": 6296 + }, + { + "epoch": 0.35098378016832954, + "grad_norm": 0.5335630774497986, + "learning_rate": 7.410281978024478e-05, + "loss": 1.6269, + "step": 6297 + }, + { + "epoch": 0.35103951842149267, + "grad_norm": 0.5936735272407532, + "learning_rate": 7.409501803880182e-05, + "loss": 1.6821, + "step": 6298 + }, + { + "epoch": 0.3510952566746558, + "grad_norm": 0.626340389251709, + "learning_rate": 7.408721553320536e-05, + "loss": 1.8958, + "step": 6299 + }, + { + "epoch": 0.351150994927819, + "grad_norm": 0.5382502675056458, + "learning_rate": 7.407941226370289e-05, + "loss": 1.6456, + "step": 6300 + }, + { + "epoch": 0.3512067331809821, + "grad_norm": 0.5597545504570007, + "learning_rate": 7.407160823054182e-05, + "loss": 1.7168, + "step": 6301 + }, + { + "epoch": 0.35126247143414524, + "grad_norm": 0.5945395231246948, + "learning_rate": 7.406380343396973e-05, + "loss": 2.0034, + "step": 6302 + }, + { + "epoch": 0.3513182096873084, + "grad_norm": 0.5297150611877441, + "learning_rate": 7.405599787423406e-05, + "loss": 1.5787, + "step": 6303 + }, + { + "epoch": 0.35137394794047155, + "grad_norm": 0.5702363848686218, + "learning_rate": 7.40481915515824e-05, + "loss": 1.8993, + "step": 6304 + }, + { + "epoch": 0.3514296861936347, + "grad_norm": 0.6293717622756958, + "learning_rate": 7.404038446626231e-05, + "loss": 1.9086, + "step": 6305 + }, + { + "epoch": 0.3514854244467978, + "grad_norm": 0.579983651638031, + "learning_rate": 7.403257661852142e-05, + "loss": 1.74, + "step": 6306 + }, + { + "epoch": 0.351541162699961, + "grad_norm": 0.558723509311676, + "learning_rate": 7.40247680086073e-05, + "loss": 1.7519, + "step": 6307 + }, + { + "epoch": 0.3515969009531241, + "grad_norm": 0.5575239062309265, + "learning_rate": 7.401695863676761e-05, + "loss": 1.8393, + "step": 6308 + }, + { + "epoch": 0.35165263920628725, + "grad_norm": 0.5667286515235901, + "learning_rate": 7.400914850325001e-05, + "loss": 1.7958, + "step": 6309 + }, + { + "epoch": 0.35170837745945044, + "grad_norm": 0.5829740762710571, + "learning_rate": 7.400133760830221e-05, + "loss": 1.7113, + "step": 6310 + }, + { + "epoch": 0.35176411571261357, + "grad_norm": 0.5255504846572876, + "learning_rate": 7.399352595217193e-05, + "loss": 1.6819, + "step": 6311 + }, + { + "epoch": 0.3518198539657767, + "grad_norm": 0.5315664410591125, + "learning_rate": 7.39857135351069e-05, + "loss": 1.5692, + "step": 6312 + }, + { + "epoch": 0.3518755922189399, + "grad_norm": 0.5694820880889893, + "learning_rate": 7.397790035735487e-05, + "loss": 1.813, + "step": 6313 + }, + { + "epoch": 0.351931330472103, + "grad_norm": 0.5584225058555603, + "learning_rate": 7.397008641916364e-05, + "loss": 1.6653, + "step": 6314 + }, + { + "epoch": 0.35198706872526614, + "grad_norm": 0.5575059652328491, + "learning_rate": 7.396227172078103e-05, + "loss": 1.7948, + "step": 6315 + }, + { + "epoch": 0.3520428069784293, + "grad_norm": 0.5385696887969971, + "learning_rate": 7.395445626245486e-05, + "loss": 1.6823, + "step": 6316 + }, + { + "epoch": 0.35209854523159245, + "grad_norm": 0.5181571841239929, + "learning_rate": 7.394664004443302e-05, + "loss": 1.4832, + "step": 6317 + }, + { + "epoch": 0.3521542834847556, + "grad_norm": 0.5436875224113464, + "learning_rate": 7.393882306696338e-05, + "loss": 1.5743, + "step": 6318 + }, + { + "epoch": 0.3522100217379187, + "grad_norm": 0.5831631422042847, + "learning_rate": 7.393100533029383e-05, + "loss": 1.7726, + "step": 6319 + }, + { + "epoch": 0.3522657599910819, + "grad_norm": 0.5740854144096375, + "learning_rate": 7.392318683467232e-05, + "loss": 1.5639, + "step": 6320 + }, + { + "epoch": 0.352321498244245, + "grad_norm": 0.5731649994850159, + "learning_rate": 7.391536758034682e-05, + "loss": 1.9563, + "step": 6321 + }, + { + "epoch": 0.35237723649740815, + "grad_norm": 0.6104768514633179, + "learning_rate": 7.390754756756526e-05, + "loss": 1.6392, + "step": 6322 + }, + { + "epoch": 0.35243297475057134, + "grad_norm": 0.5218120813369751, + "learning_rate": 7.389972679657571e-05, + "loss": 1.6262, + "step": 6323 + }, + { + "epoch": 0.35248871300373447, + "grad_norm": 0.5537388324737549, + "learning_rate": 7.389190526762618e-05, + "loss": 1.7317, + "step": 6324 + }, + { + "epoch": 0.3525444512568976, + "grad_norm": 0.577392578125, + "learning_rate": 7.38840829809647e-05, + "loss": 1.7069, + "step": 6325 + }, + { + "epoch": 0.3526001895100608, + "grad_norm": 0.5511906147003174, + "learning_rate": 7.387625993683937e-05, + "loss": 1.6009, + "step": 6326 + }, + { + "epoch": 0.3526559277632239, + "grad_norm": 0.5822625756263733, + "learning_rate": 7.386843613549827e-05, + "loss": 1.7174, + "step": 6327 + }, + { + "epoch": 0.35271166601638704, + "grad_norm": 0.5413920879364014, + "learning_rate": 7.386061157718955e-05, + "loss": 1.5927, + "step": 6328 + }, + { + "epoch": 0.35276740426955017, + "grad_norm": 0.5867698192596436, + "learning_rate": 7.385278626216133e-05, + "loss": 1.7494, + "step": 6329 + }, + { + "epoch": 0.35282314252271335, + "grad_norm": 0.6775004863739014, + "learning_rate": 7.384496019066182e-05, + "loss": 1.8777, + "step": 6330 + }, + { + "epoch": 0.3528788807758765, + "grad_norm": 0.6009215116500854, + "learning_rate": 7.383713336293919e-05, + "loss": 1.7538, + "step": 6331 + }, + { + "epoch": 0.3529346190290396, + "grad_norm": 0.5513560771942139, + "learning_rate": 7.382930577924168e-05, + "loss": 1.6307, + "step": 6332 + }, + { + "epoch": 0.3529903572822028, + "grad_norm": 0.5479623079299927, + "learning_rate": 7.382147743981751e-05, + "loss": 1.6945, + "step": 6333 + }, + { + "epoch": 0.3530460955353659, + "grad_norm": 0.603458046913147, + "learning_rate": 7.381364834491499e-05, + "loss": 1.7531, + "step": 6334 + }, + { + "epoch": 0.35310183378852905, + "grad_norm": 0.951324999332428, + "learning_rate": 7.380581849478236e-05, + "loss": 1.8593, + "step": 6335 + }, + { + "epoch": 0.35315757204169224, + "grad_norm": 0.5293959975242615, + "learning_rate": 7.379798788966798e-05, + "loss": 1.7638, + "step": 6336 + }, + { + "epoch": 0.35321331029485536, + "grad_norm": 0.5229690670967102, + "learning_rate": 7.379015652982016e-05, + "loss": 1.7042, + "step": 6337 + }, + { + "epoch": 0.3532690485480185, + "grad_norm": 0.5152291059494019, + "learning_rate": 7.378232441548729e-05, + "loss": 1.607, + "step": 6338 + }, + { + "epoch": 0.3533247868011817, + "grad_norm": 0.5136567950248718, + "learning_rate": 7.377449154691775e-05, + "loss": 1.7222, + "step": 6339 + }, + { + "epoch": 0.3533805250543448, + "grad_norm": 0.5531160235404968, + "learning_rate": 7.376665792435996e-05, + "loss": 1.6946, + "step": 6340 + }, + { + "epoch": 0.35343626330750794, + "grad_norm": 0.554097592830658, + "learning_rate": 7.375882354806235e-05, + "loss": 1.6551, + "step": 6341 + }, + { + "epoch": 0.35349200156067107, + "grad_norm": 0.5862346887588501, + "learning_rate": 7.375098841827337e-05, + "loss": 1.7594, + "step": 6342 + }, + { + "epoch": 0.35354773981383425, + "grad_norm": 0.5202105641365051, + "learning_rate": 7.374315253524152e-05, + "loss": 1.6205, + "step": 6343 + }, + { + "epoch": 0.3536034780669974, + "grad_norm": 0.5510536432266235, + "learning_rate": 7.373531589921531e-05, + "loss": 1.5776, + "step": 6344 + }, + { + "epoch": 0.3536592163201605, + "grad_norm": 0.5484849214553833, + "learning_rate": 7.372747851044326e-05, + "loss": 1.5603, + "step": 6345 + }, + { + "epoch": 0.3537149545733237, + "grad_norm": 0.55774986743927, + "learning_rate": 7.371964036917394e-05, + "loss": 1.7814, + "step": 6346 + }, + { + "epoch": 0.3537706928264868, + "grad_norm": 0.5338320732116699, + "learning_rate": 7.371180147565592e-05, + "loss": 1.5941, + "step": 6347 + }, + { + "epoch": 0.35382643107964995, + "grad_norm": 0.5263161659240723, + "learning_rate": 7.370396183013779e-05, + "loss": 1.2328, + "step": 6348 + }, + { + "epoch": 0.35388216933281313, + "grad_norm": 0.533647894859314, + "learning_rate": 7.369612143286822e-05, + "loss": 1.7327, + "step": 6349 + }, + { + "epoch": 0.35393790758597626, + "grad_norm": 0.5682227611541748, + "learning_rate": 7.368828028409581e-05, + "loss": 1.8406, + "step": 6350 + }, + { + "epoch": 0.3539936458391394, + "grad_norm": 0.5832127332687378, + "learning_rate": 7.368043838406927e-05, + "loss": 1.7841, + "step": 6351 + }, + { + "epoch": 0.3540493840923025, + "grad_norm": 0.5741327404975891, + "learning_rate": 7.36725957330373e-05, + "loss": 1.787, + "step": 6352 + }, + { + "epoch": 0.3541051223454657, + "grad_norm": 0.5750821828842163, + "learning_rate": 7.366475233124861e-05, + "loss": 1.7946, + "step": 6353 + }, + { + "epoch": 0.35416086059862883, + "grad_norm": 0.5595529079437256, + "learning_rate": 7.365690817895195e-05, + "loss": 1.6904, + "step": 6354 + }, + { + "epoch": 0.35421659885179196, + "grad_norm": 0.5768024921417236, + "learning_rate": 7.364906327639608e-05, + "loss": 1.7634, + "step": 6355 + }, + { + "epoch": 0.35427233710495515, + "grad_norm": 0.5867105722427368, + "learning_rate": 7.364121762382983e-05, + "loss": 1.7406, + "step": 6356 + }, + { + "epoch": 0.3543280753581183, + "grad_norm": 0.5967558026313782, + "learning_rate": 7.363337122150197e-05, + "loss": 1.5078, + "step": 6357 + }, + { + "epoch": 0.3543838136112814, + "grad_norm": 0.5712282061576843, + "learning_rate": 7.36255240696614e-05, + "loss": 1.767, + "step": 6358 + }, + { + "epoch": 0.3544395518644446, + "grad_norm": 0.5473513603210449, + "learning_rate": 7.361767616855692e-05, + "loss": 1.6409, + "step": 6359 + }, + { + "epoch": 0.3544952901176077, + "grad_norm": 0.5412675738334656, + "learning_rate": 7.360982751843747e-05, + "loss": 1.6319, + "step": 6360 + }, + { + "epoch": 0.35455102837077085, + "grad_norm": 0.5327848792076111, + "learning_rate": 7.360197811955194e-05, + "loss": 1.511, + "step": 6361 + }, + { + "epoch": 0.35460676662393403, + "grad_norm": 0.5604977607727051, + "learning_rate": 7.359412797214929e-05, + "loss": 1.7604, + "step": 6362 + }, + { + "epoch": 0.35466250487709716, + "grad_norm": 0.5807721018791199, + "learning_rate": 7.358627707647844e-05, + "loss": 1.5816, + "step": 6363 + }, + { + "epoch": 0.3547182431302603, + "grad_norm": 0.5296190977096558, + "learning_rate": 7.357842543278841e-05, + "loss": 1.2601, + "step": 6364 + }, + { + "epoch": 0.3547739813834234, + "grad_norm": 0.5498451590538025, + "learning_rate": 7.357057304132819e-05, + "loss": 1.8474, + "step": 6365 + }, + { + "epoch": 0.3548297196365866, + "grad_norm": 0.5772817134857178, + "learning_rate": 7.356271990234683e-05, + "loss": 1.7508, + "step": 6366 + }, + { + "epoch": 0.35488545788974973, + "grad_norm": 0.520463764667511, + "learning_rate": 7.355486601609339e-05, + "loss": 1.5589, + "step": 6367 + }, + { + "epoch": 0.35494119614291286, + "grad_norm": 0.5433523058891296, + "learning_rate": 7.354701138281688e-05, + "loss": 1.7982, + "step": 6368 + }, + { + "epoch": 0.35499693439607605, + "grad_norm": 0.587772011756897, + "learning_rate": 7.35391560027665e-05, + "loss": 1.7944, + "step": 6369 + }, + { + "epoch": 0.3550526726492392, + "grad_norm": 0.562419056892395, + "learning_rate": 7.353129987619133e-05, + "loss": 1.8376, + "step": 6370 + }, + { + "epoch": 0.3551084109024023, + "grad_norm": 0.524745523929596, + "learning_rate": 7.352344300334053e-05, + "loss": 1.575, + "step": 6371 + }, + { + "epoch": 0.3551641491555655, + "grad_norm": 0.5049068927764893, + "learning_rate": 7.351558538446326e-05, + "loss": 1.3716, + "step": 6372 + }, + { + "epoch": 0.3552198874087286, + "grad_norm": 0.6006641387939453, + "learning_rate": 7.350772701980872e-05, + "loss": 1.9018, + "step": 6373 + }, + { + "epoch": 0.35527562566189175, + "grad_norm": 0.5516168475151062, + "learning_rate": 7.349986790962613e-05, + "loss": 1.6401, + "step": 6374 + }, + { + "epoch": 0.3553313639150549, + "grad_norm": 0.5250164270401001, + "learning_rate": 7.349200805416478e-05, + "loss": 1.5694, + "step": 6375 + }, + { + "epoch": 0.35538710216821806, + "grad_norm": 0.5079348087310791, + "learning_rate": 7.348414745367387e-05, + "loss": 1.6291, + "step": 6376 + }, + { + "epoch": 0.3554428404213812, + "grad_norm": 0.5634783506393433, + "learning_rate": 7.347628610840274e-05, + "loss": 1.6777, + "step": 6377 + }, + { + "epoch": 0.3554985786745443, + "grad_norm": 0.5921057462692261, + "learning_rate": 7.346842401860069e-05, + "loss": 1.922, + "step": 6378 + }, + { + "epoch": 0.3555543169277075, + "grad_norm": 0.5826466679573059, + "learning_rate": 7.346056118451705e-05, + "loss": 1.7305, + "step": 6379 + }, + { + "epoch": 0.35561005518087063, + "grad_norm": 0.5478690266609192, + "learning_rate": 7.345269760640121e-05, + "loss": 1.7387, + "step": 6380 + }, + { + "epoch": 0.35566579343403376, + "grad_norm": 0.5795879364013672, + "learning_rate": 7.344483328450253e-05, + "loss": 1.6662, + "step": 6381 + }, + { + "epoch": 0.35572153168719695, + "grad_norm": 0.5886217355728149, + "learning_rate": 7.343696821907042e-05, + "loss": 1.8065, + "step": 6382 + }, + { + "epoch": 0.3557772699403601, + "grad_norm": 0.6385563611984253, + "learning_rate": 7.342910241035434e-05, + "loss": 1.7933, + "step": 6383 + }, + { + "epoch": 0.3558330081935232, + "grad_norm": 0.5828480124473572, + "learning_rate": 7.342123585860374e-05, + "loss": 1.6203, + "step": 6384 + }, + { + "epoch": 0.3558887464466864, + "grad_norm": 0.5478693842887878, + "learning_rate": 7.341336856406808e-05, + "loss": 1.6706, + "step": 6385 + }, + { + "epoch": 0.3559444846998495, + "grad_norm": 0.5751214027404785, + "learning_rate": 7.340550052699689e-05, + "loss": 1.8427, + "step": 6386 + }, + { + "epoch": 0.35600022295301265, + "grad_norm": 0.5512586236000061, + "learning_rate": 7.339763174763968e-05, + "loss": 1.7332, + "step": 6387 + }, + { + "epoch": 0.3560559612061758, + "grad_norm": 0.5546371340751648, + "learning_rate": 7.3389762226246e-05, + "loss": 1.5966, + "step": 6388 + }, + { + "epoch": 0.35611169945933896, + "grad_norm": 0.5267236232757568, + "learning_rate": 7.338189196306544e-05, + "loss": 1.8137, + "step": 6389 + }, + { + "epoch": 0.3561674377125021, + "grad_norm": 0.5219095945358276, + "learning_rate": 7.33740209583476e-05, + "loss": 1.6799, + "step": 6390 + }, + { + "epoch": 0.3562231759656652, + "grad_norm": 0.5330881476402283, + "learning_rate": 7.33661492123421e-05, + "loss": 1.6959, + "step": 6391 + }, + { + "epoch": 0.3562789142188284, + "grad_norm": 0.5660157203674316, + "learning_rate": 7.335827672529856e-05, + "loss": 1.7565, + "step": 6392 + }, + { + "epoch": 0.35633465247199153, + "grad_norm": 0.5627869963645935, + "learning_rate": 7.335040349746669e-05, + "loss": 1.7526, + "step": 6393 + }, + { + "epoch": 0.35639039072515466, + "grad_norm": 0.588152289390564, + "learning_rate": 7.334252952909615e-05, + "loss": 1.64, + "step": 6394 + }, + { + "epoch": 0.35644612897831784, + "grad_norm": 0.5885617733001709, + "learning_rate": 7.333465482043667e-05, + "loss": 1.7358, + "step": 6395 + }, + { + "epoch": 0.356501867231481, + "grad_norm": 0.6158447265625, + "learning_rate": 7.3326779371738e-05, + "loss": 1.854, + "step": 6396 + }, + { + "epoch": 0.3565576054846441, + "grad_norm": 0.5353176593780518, + "learning_rate": 7.33189031832499e-05, + "loss": 1.6502, + "step": 6397 + }, + { + "epoch": 0.35661334373780723, + "grad_norm": 0.5986976027488708, + "learning_rate": 7.331102625522212e-05, + "loss": 1.6757, + "step": 6398 + }, + { + "epoch": 0.3566690819909704, + "grad_norm": 0.5034981966018677, + "learning_rate": 7.330314858790453e-05, + "loss": 1.5362, + "step": 6399 + }, + { + "epoch": 0.35672482024413354, + "grad_norm": 0.5768936276435852, + "learning_rate": 7.32952701815469e-05, + "loss": 1.7302, + "step": 6400 + }, + { + "epoch": 0.3567805584972967, + "grad_norm": 0.5493230819702148, + "learning_rate": 7.328739103639916e-05, + "loss": 1.7755, + "step": 6401 + }, + { + "epoch": 0.35683629675045986, + "grad_norm": 0.5121830105781555, + "learning_rate": 7.327951115271113e-05, + "loss": 1.5803, + "step": 6402 + }, + { + "epoch": 0.356892035003623, + "grad_norm": 0.546416699886322, + "learning_rate": 7.327163053073273e-05, + "loss": 1.5991, + "step": 6403 + }, + { + "epoch": 0.3569477732567861, + "grad_norm": 0.5108504891395569, + "learning_rate": 7.32637491707139e-05, + "loss": 1.6789, + "step": 6404 + }, + { + "epoch": 0.3570035115099493, + "grad_norm": 0.5747851729393005, + "learning_rate": 7.32558670729046e-05, + "loss": 1.8266, + "step": 6405 + }, + { + "epoch": 0.35705924976311243, + "grad_norm": 0.587032675743103, + "learning_rate": 7.324798423755476e-05, + "loss": 1.6093, + "step": 6406 + }, + { + "epoch": 0.35711498801627556, + "grad_norm": 0.5485719442367554, + "learning_rate": 7.324010066491442e-05, + "loss": 1.6672, + "step": 6407 + }, + { + "epoch": 0.35717072626943874, + "grad_norm": 0.5325014591217041, + "learning_rate": 7.323221635523358e-05, + "loss": 1.7776, + "step": 6408 + }, + { + "epoch": 0.35722646452260187, + "grad_norm": 0.5524224638938904, + "learning_rate": 7.32243313087623e-05, + "loss": 1.9326, + "step": 6409 + }, + { + "epoch": 0.357282202775765, + "grad_norm": 0.5688652396202087, + "learning_rate": 7.321644552575062e-05, + "loss": 1.8942, + "step": 6410 + }, + { + "epoch": 0.35733794102892813, + "grad_norm": 0.5133098363876343, + "learning_rate": 7.320855900644867e-05, + "loss": 1.6339, + "step": 6411 + }, + { + "epoch": 0.3573936792820913, + "grad_norm": 0.5422292947769165, + "learning_rate": 7.320067175110653e-05, + "loss": 1.681, + "step": 6412 + }, + { + "epoch": 0.35744941753525444, + "grad_norm": 0.5691182613372803, + "learning_rate": 7.319278375997436e-05, + "loss": 1.847, + "step": 6413 + }, + { + "epoch": 0.3575051557884176, + "grad_norm": 0.5584883689880371, + "learning_rate": 7.31848950333023e-05, + "loss": 1.7616, + "step": 6414 + }, + { + "epoch": 0.35756089404158076, + "grad_norm": 0.5878840088844299, + "learning_rate": 7.317700557134056e-05, + "loss": 1.7561, + "step": 6415 + }, + { + "epoch": 0.3576166322947439, + "grad_norm": 0.5363910794258118, + "learning_rate": 7.316911537433933e-05, + "loss": 1.6086, + "step": 6416 + }, + { + "epoch": 0.357672370547907, + "grad_norm": 0.5783511996269226, + "learning_rate": 7.316122444254884e-05, + "loss": 1.7853, + "step": 6417 + }, + { + "epoch": 0.3577281088010702, + "grad_norm": 0.5695887804031372, + "learning_rate": 7.315333277621935e-05, + "loss": 1.5816, + "step": 6418 + }, + { + "epoch": 0.35778384705423333, + "grad_norm": 0.5631670355796814, + "learning_rate": 7.314544037560114e-05, + "loss": 1.5703, + "step": 6419 + }, + { + "epoch": 0.35783958530739646, + "grad_norm": 0.5459564328193665, + "learning_rate": 7.313754724094451e-05, + "loss": 1.6222, + "step": 6420 + }, + { + "epoch": 0.3578953235605596, + "grad_norm": 0.5215150117874146, + "learning_rate": 7.312965337249979e-05, + "loss": 1.7888, + "step": 6421 + }, + { + "epoch": 0.35795106181372277, + "grad_norm": 0.5654617547988892, + "learning_rate": 7.312175877051732e-05, + "loss": 1.7508, + "step": 6422 + }, + { + "epoch": 0.3580068000668859, + "grad_norm": 0.5510186553001404, + "learning_rate": 7.311386343524747e-05, + "loss": 1.8401, + "step": 6423 + }, + { + "epoch": 0.35806253832004903, + "grad_norm": 0.521782398223877, + "learning_rate": 7.310596736694062e-05, + "loss": 1.5428, + "step": 6424 + }, + { + "epoch": 0.3581182765732122, + "grad_norm": 0.5308924317359924, + "learning_rate": 7.309807056584722e-05, + "loss": 1.464, + "step": 6425 + }, + { + "epoch": 0.35817401482637534, + "grad_norm": 0.5567795634269714, + "learning_rate": 7.309017303221768e-05, + "loss": 1.7063, + "step": 6426 + }, + { + "epoch": 0.35822975307953847, + "grad_norm": 0.5558245778083801, + "learning_rate": 7.308227476630249e-05, + "loss": 1.6636, + "step": 6427 + }, + { + "epoch": 0.35828549133270166, + "grad_norm": 0.5258497595787048, + "learning_rate": 7.30743757683521e-05, + "loss": 1.5777, + "step": 6428 + }, + { + "epoch": 0.3583412295858648, + "grad_norm": 0.5101563930511475, + "learning_rate": 7.306647603861706e-05, + "loss": 1.5602, + "step": 6429 + }, + { + "epoch": 0.3583969678390279, + "grad_norm": 0.5508061647415161, + "learning_rate": 7.305857557734789e-05, + "loss": 1.659, + "step": 6430 + }, + { + "epoch": 0.3584527060921911, + "grad_norm": 0.6159545183181763, + "learning_rate": 7.305067438479513e-05, + "loss": 1.9413, + "step": 6431 + }, + { + "epoch": 0.3585084443453542, + "grad_norm": 0.5804408192634583, + "learning_rate": 7.30427724612094e-05, + "loss": 1.7138, + "step": 6432 + }, + { + "epoch": 0.35856418259851736, + "grad_norm": 0.5316668748855591, + "learning_rate": 7.303486980684125e-05, + "loss": 1.7588, + "step": 6433 + }, + { + "epoch": 0.3586199208516805, + "grad_norm": 0.6093178391456604, + "learning_rate": 7.302696642194134e-05, + "loss": 1.8426, + "step": 6434 + }, + { + "epoch": 0.35867565910484367, + "grad_norm": 0.5371636152267456, + "learning_rate": 7.30190623067603e-05, + "loss": 1.5852, + "step": 6435 + }, + { + "epoch": 0.3587313973580068, + "grad_norm": 0.5050824284553528, + "learning_rate": 7.301115746154884e-05, + "loss": 1.5495, + "step": 6436 + }, + { + "epoch": 0.3587871356111699, + "grad_norm": 0.5830590724945068, + "learning_rate": 7.300325188655761e-05, + "loss": 1.8611, + "step": 6437 + }, + { + "epoch": 0.3588428738643331, + "grad_norm": 0.5415953397750854, + "learning_rate": 7.299534558203735e-05, + "loss": 1.6437, + "step": 6438 + }, + { + "epoch": 0.35889861211749624, + "grad_norm": 0.5701804757118225, + "learning_rate": 7.298743854823882e-05, + "loss": 1.8723, + "step": 6439 + }, + { + "epoch": 0.35895435037065937, + "grad_norm": 0.5361306667327881, + "learning_rate": 7.297953078541274e-05, + "loss": 1.518, + "step": 6440 + }, + { + "epoch": 0.35901008862382255, + "grad_norm": 0.5895618796348572, + "learning_rate": 7.297162229380994e-05, + "loss": 1.8528, + "step": 6441 + }, + { + "epoch": 0.3590658268769857, + "grad_norm": 0.5555623173713684, + "learning_rate": 7.29637130736812e-05, + "loss": 1.6619, + "step": 6442 + }, + { + "epoch": 0.3591215651301488, + "grad_norm": 0.5527105331420898, + "learning_rate": 7.295580312527739e-05, + "loss": 1.8209, + "step": 6443 + }, + { + "epoch": 0.35917730338331194, + "grad_norm": 0.5717308521270752, + "learning_rate": 7.294789244884932e-05, + "loss": 1.6109, + "step": 6444 + }, + { + "epoch": 0.3592330416364751, + "grad_norm": 0.5484607815742493, + "learning_rate": 7.293998104464792e-05, + "loss": 1.7449, + "step": 6445 + }, + { + "epoch": 0.35928877988963825, + "grad_norm": 0.5548183917999268, + "learning_rate": 7.293206891292405e-05, + "loss": 1.7952, + "step": 6446 + }, + { + "epoch": 0.3593445181428014, + "grad_norm": 0.5666037201881409, + "learning_rate": 7.292415605392867e-05, + "loss": 1.8784, + "step": 6447 + }, + { + "epoch": 0.35940025639596457, + "grad_norm": 0.5922662615776062, + "learning_rate": 7.291624246791272e-05, + "loss": 1.8764, + "step": 6448 + }, + { + "epoch": 0.3594559946491277, + "grad_norm": 0.5456053018569946, + "learning_rate": 7.290832815512716e-05, + "loss": 1.7389, + "step": 6449 + }, + { + "epoch": 0.3595117329022908, + "grad_norm": 0.5417848229408264, + "learning_rate": 7.290041311582301e-05, + "loss": 1.591, + "step": 6450 + }, + { + "epoch": 0.359567471155454, + "grad_norm": 0.5787496566772461, + "learning_rate": 7.289249735025127e-05, + "loss": 1.765, + "step": 6451 + }, + { + "epoch": 0.35962320940861714, + "grad_norm": 0.5513389110565186, + "learning_rate": 7.288458085866298e-05, + "loss": 1.6685, + "step": 6452 + }, + { + "epoch": 0.35967894766178027, + "grad_norm": 0.5737441182136536, + "learning_rate": 7.287666364130921e-05, + "loss": 1.6956, + "step": 6453 + }, + { + "epoch": 0.35973468591494345, + "grad_norm": 0.6044551134109497, + "learning_rate": 7.286874569844106e-05, + "loss": 1.7829, + "step": 6454 + }, + { + "epoch": 0.3597904241681066, + "grad_norm": 0.5688374638557434, + "learning_rate": 7.286082703030961e-05, + "loss": 1.8747, + "step": 6455 + }, + { + "epoch": 0.3598461624212697, + "grad_norm": 0.5276156067848206, + "learning_rate": 7.285290763716604e-05, + "loss": 1.5944, + "step": 6456 + }, + { + "epoch": 0.35990190067443284, + "grad_norm": 0.5913518667221069, + "learning_rate": 7.284498751926147e-05, + "loss": 1.6307, + "step": 6457 + }, + { + "epoch": 0.359957638927596, + "grad_norm": 0.5470561981201172, + "learning_rate": 7.283706667684709e-05, + "loss": 1.6096, + "step": 6458 + }, + { + "epoch": 0.36001337718075915, + "grad_norm": 0.5165275931358337, + "learning_rate": 7.28291451101741e-05, + "loss": 1.6963, + "step": 6459 + }, + { + "epoch": 0.3600691154339223, + "grad_norm": 0.552894651889801, + "learning_rate": 7.282122281949374e-05, + "loss": 1.7304, + "step": 6460 + }, + { + "epoch": 0.36012485368708547, + "grad_norm": 0.573884129524231, + "learning_rate": 7.281329980505724e-05, + "loss": 1.8304, + "step": 6461 + }, + { + "epoch": 0.3601805919402486, + "grad_norm": 0.5113431811332703, + "learning_rate": 7.280537606711589e-05, + "loss": 1.509, + "step": 6462 + }, + { + "epoch": 0.3602363301934117, + "grad_norm": 0.54507976770401, + "learning_rate": 7.279745160592097e-05, + "loss": 1.765, + "step": 6463 + }, + { + "epoch": 0.3602920684465749, + "grad_norm": 0.5524507761001587, + "learning_rate": 7.278952642172381e-05, + "loss": 1.6604, + "step": 6464 + }, + { + "epoch": 0.36034780669973804, + "grad_norm": 0.5713779926300049, + "learning_rate": 7.278160051477574e-05, + "loss": 1.6273, + "step": 6465 + }, + { + "epoch": 0.36040354495290117, + "grad_norm": 0.5713092684745789, + "learning_rate": 7.277367388532812e-05, + "loss": 1.7693, + "step": 6466 + }, + { + "epoch": 0.3604592832060643, + "grad_norm": 0.5316145420074463, + "learning_rate": 7.276574653363236e-05, + "loss": 1.6402, + "step": 6467 + }, + { + "epoch": 0.3605150214592275, + "grad_norm": 0.5453936457633972, + "learning_rate": 7.275781845993983e-05, + "loss": 1.9642, + "step": 6468 + }, + { + "epoch": 0.3605707597123906, + "grad_norm": 0.5773400068283081, + "learning_rate": 7.274988966450201e-05, + "loss": 1.8417, + "step": 6469 + }, + { + "epoch": 0.36062649796555374, + "grad_norm": 0.5517837405204773, + "learning_rate": 7.274196014757032e-05, + "loss": 1.6307, + "step": 6470 + }, + { + "epoch": 0.3606822362187169, + "grad_norm": 0.5454963445663452, + "learning_rate": 7.273402990939626e-05, + "loss": 1.7725, + "step": 6471 + }, + { + "epoch": 0.36073797447188005, + "grad_norm": 0.5993366837501526, + "learning_rate": 7.272609895023129e-05, + "loss": 1.831, + "step": 6472 + }, + { + "epoch": 0.3607937127250432, + "grad_norm": 0.5621082186698914, + "learning_rate": 7.2718167270327e-05, + "loss": 1.4942, + "step": 6473 + }, + { + "epoch": 0.36084945097820637, + "grad_norm": 0.5455790758132935, + "learning_rate": 7.271023486993488e-05, + "loss": 1.722, + "step": 6474 + }, + { + "epoch": 0.3609051892313695, + "grad_norm": 0.5093836784362793, + "learning_rate": 7.270230174930653e-05, + "loss": 1.5921, + "step": 6475 + }, + { + "epoch": 0.3609609274845326, + "grad_norm": 0.5746651887893677, + "learning_rate": 7.269436790869352e-05, + "loss": 1.7303, + "step": 6476 + }, + { + "epoch": 0.3610166657376958, + "grad_norm": 0.5042871832847595, + "learning_rate": 7.268643334834748e-05, + "loss": 1.4386, + "step": 6477 + }, + { + "epoch": 0.36107240399085894, + "grad_norm": 0.6014384627342224, + "learning_rate": 7.267849806852005e-05, + "loss": 1.7803, + "step": 6478 + }, + { + "epoch": 0.36112814224402207, + "grad_norm": 0.49684464931488037, + "learning_rate": 7.267056206946289e-05, + "loss": 1.6513, + "step": 6479 + }, + { + "epoch": 0.3611838804971852, + "grad_norm": 0.6013120412826538, + "learning_rate": 7.266262535142767e-05, + "loss": 1.718, + "step": 6480 + }, + { + "epoch": 0.3612396187503484, + "grad_norm": 0.5482946038246155, + "learning_rate": 7.26546879146661e-05, + "loss": 1.8295, + "step": 6481 + }, + { + "epoch": 0.3612953570035115, + "grad_norm": 0.5593370199203491, + "learning_rate": 7.264674975942994e-05, + "loss": 1.8042, + "step": 6482 + }, + { + "epoch": 0.36135109525667464, + "grad_norm": 0.5430756211280823, + "learning_rate": 7.26388108859709e-05, + "loss": 1.6976, + "step": 6483 + }, + { + "epoch": 0.3614068335098378, + "grad_norm": 0.5408653020858765, + "learning_rate": 7.263087129454078e-05, + "loss": 1.5425, + "step": 6484 + }, + { + "epoch": 0.36146257176300095, + "grad_norm": 0.5399406552314758, + "learning_rate": 7.262293098539134e-05, + "loss": 1.7552, + "step": 6485 + }, + { + "epoch": 0.3615183100161641, + "grad_norm": 0.5077804923057556, + "learning_rate": 7.261498995877447e-05, + "loss": 1.5728, + "step": 6486 + }, + { + "epoch": 0.36157404826932726, + "grad_norm": 0.5409159660339355, + "learning_rate": 7.260704821494196e-05, + "loss": 1.7926, + "step": 6487 + }, + { + "epoch": 0.3616297865224904, + "grad_norm": 0.4922293424606323, + "learning_rate": 7.259910575414569e-05, + "loss": 1.46, + "step": 6488 + }, + { + "epoch": 0.3616855247756535, + "grad_norm": 0.530104398727417, + "learning_rate": 7.259116257663753e-05, + "loss": 1.4995, + "step": 6489 + }, + { + "epoch": 0.36174126302881665, + "grad_norm": 0.5683631896972656, + "learning_rate": 7.258321868266943e-05, + "loss": 1.6736, + "step": 6490 + }, + { + "epoch": 0.36179700128197984, + "grad_norm": 0.5562074184417725, + "learning_rate": 7.25752740724933e-05, + "loss": 1.6224, + "step": 6491 + }, + { + "epoch": 0.36185273953514296, + "grad_norm": 0.6077651381492615, + "learning_rate": 7.256732874636109e-05, + "loss": 1.7814, + "step": 6492 + }, + { + "epoch": 0.3619084777883061, + "grad_norm": 0.5739646553993225, + "learning_rate": 7.255938270452479e-05, + "loss": 1.7024, + "step": 6493 + }, + { + "epoch": 0.3619642160414693, + "grad_norm": 0.5540484189987183, + "learning_rate": 7.25514359472364e-05, + "loss": 1.5576, + "step": 6494 + }, + { + "epoch": 0.3620199542946324, + "grad_norm": 0.5674034953117371, + "learning_rate": 7.254348847474797e-05, + "loss": 1.8389, + "step": 6495 + }, + { + "epoch": 0.36207569254779554, + "grad_norm": 0.5664230585098267, + "learning_rate": 7.253554028731148e-05, + "loss": 1.7194, + "step": 6496 + }, + { + "epoch": 0.3621314308009587, + "grad_norm": 0.5525626540184021, + "learning_rate": 7.252759138517909e-05, + "loss": 1.3394, + "step": 6497 + }, + { + "epoch": 0.36218716905412185, + "grad_norm": 0.5549319982528687, + "learning_rate": 7.251964176860281e-05, + "loss": 1.6234, + "step": 6498 + }, + { + "epoch": 0.362242907307285, + "grad_norm": 0.5454506874084473, + "learning_rate": 7.25116914378348e-05, + "loss": 1.8937, + "step": 6499 + }, + { + "epoch": 0.36229864556044816, + "grad_norm": 0.5178475379943848, + "learning_rate": 7.25037403931272e-05, + "loss": 1.5599, + "step": 6500 + }, + { + "epoch": 0.3623543838136113, + "grad_norm": 0.5836609601974487, + "learning_rate": 7.249578863473216e-05, + "loss": 1.8547, + "step": 6501 + }, + { + "epoch": 0.3624101220667744, + "grad_norm": 0.5162068605422974, + "learning_rate": 7.248783616290186e-05, + "loss": 1.4538, + "step": 6502 + }, + { + "epoch": 0.36246586031993755, + "grad_norm": 0.5959255695343018, + "learning_rate": 7.24798829778885e-05, + "loss": 1.8237, + "step": 6503 + }, + { + "epoch": 0.36252159857310073, + "grad_norm": 0.5471253395080566, + "learning_rate": 7.247192907994433e-05, + "loss": 1.5705, + "step": 6504 + }, + { + "epoch": 0.36257733682626386, + "grad_norm": 0.5264948010444641, + "learning_rate": 7.246397446932159e-05, + "loss": 1.6597, + "step": 6505 + }, + { + "epoch": 0.362633075079427, + "grad_norm": 0.5829636454582214, + "learning_rate": 7.245601914627255e-05, + "loss": 1.9137, + "step": 6506 + }, + { + "epoch": 0.3626888133325902, + "grad_norm": 0.5371459722518921, + "learning_rate": 7.244806311104952e-05, + "loss": 1.5883, + "step": 6507 + }, + { + "epoch": 0.3627445515857533, + "grad_norm": 0.6225298643112183, + "learning_rate": 7.24401063639048e-05, + "loss": 1.9112, + "step": 6508 + }, + { + "epoch": 0.36280028983891643, + "grad_norm": 0.5452820062637329, + "learning_rate": 7.243214890509073e-05, + "loss": 1.6557, + "step": 6509 + }, + { + "epoch": 0.3628560280920796, + "grad_norm": 0.5052100419998169, + "learning_rate": 7.24241907348597e-05, + "loss": 1.4815, + "step": 6510 + }, + { + "epoch": 0.36291176634524275, + "grad_norm": 0.5527931451797485, + "learning_rate": 7.241623185346409e-05, + "loss": 1.6867, + "step": 6511 + }, + { + "epoch": 0.3629675045984059, + "grad_norm": 0.5412555932998657, + "learning_rate": 7.240827226115629e-05, + "loss": 1.5461, + "step": 6512 + }, + { + "epoch": 0.363023242851569, + "grad_norm": 0.5910593271255493, + "learning_rate": 7.240031195818874e-05, + "loss": 1.7713, + "step": 6513 + }, + { + "epoch": 0.3630789811047322, + "grad_norm": 0.5672844052314758, + "learning_rate": 7.239235094481391e-05, + "loss": 1.3757, + "step": 6514 + }, + { + "epoch": 0.3631347193578953, + "grad_norm": 0.580847442150116, + "learning_rate": 7.238438922128425e-05, + "loss": 1.9571, + "step": 6515 + }, + { + "epoch": 0.36319045761105845, + "grad_norm": 0.642082691192627, + "learning_rate": 7.237642678785228e-05, + "loss": 1.9311, + "step": 6516 + }, + { + "epoch": 0.36324619586422163, + "grad_norm": 0.49659648537635803, + "learning_rate": 7.236846364477052e-05, + "loss": 1.6393, + "step": 6517 + }, + { + "epoch": 0.36330193411738476, + "grad_norm": 0.5082789063453674, + "learning_rate": 7.23604997922915e-05, + "loss": 1.5183, + "step": 6518 + }, + { + "epoch": 0.3633576723705479, + "grad_norm": 0.5978274941444397, + "learning_rate": 7.235253523066781e-05, + "loss": 1.8529, + "step": 6519 + }, + { + "epoch": 0.3634134106237111, + "grad_norm": 0.5323169231414795, + "learning_rate": 7.234456996015202e-05, + "loss": 1.6463, + "step": 6520 + }, + { + "epoch": 0.3634691488768742, + "grad_norm": 0.5250840187072754, + "learning_rate": 7.233660398099675e-05, + "loss": 1.4439, + "step": 6521 + }, + { + "epoch": 0.36352488713003733, + "grad_norm": 0.566667914390564, + "learning_rate": 7.232863729345464e-05, + "loss": 1.5871, + "step": 6522 + }, + { + "epoch": 0.3635806253832005, + "grad_norm": 0.5944371223449707, + "learning_rate": 7.232066989777833e-05, + "loss": 1.978, + "step": 6523 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 0.556692361831665, + "learning_rate": 7.231270179422051e-05, + "loss": 1.5579, + "step": 6524 + }, + { + "epoch": 0.3636921018895268, + "grad_norm": 0.5578793883323669, + "learning_rate": 7.230473298303388e-05, + "loss": 1.6899, + "step": 6525 + }, + { + "epoch": 0.3637478401426899, + "grad_norm": 0.672099232673645, + "learning_rate": 7.229676346447117e-05, + "loss": 2.1324, + "step": 6526 + }, + { + "epoch": 0.3638035783958531, + "grad_norm": 0.5312888622283936, + "learning_rate": 7.228879323878512e-05, + "loss": 1.453, + "step": 6527 + }, + { + "epoch": 0.3638593166490162, + "grad_norm": 0.5675061345100403, + "learning_rate": 7.22808223062285e-05, + "loss": 1.8623, + "step": 6528 + }, + { + "epoch": 0.36391505490217935, + "grad_norm": 0.49803319573402405, + "learning_rate": 7.227285066705412e-05, + "loss": 1.41, + "step": 6529 + }, + { + "epoch": 0.36397079315534253, + "grad_norm": 0.5489189028739929, + "learning_rate": 7.226487832151476e-05, + "loss": 1.6551, + "step": 6530 + }, + { + "epoch": 0.36402653140850566, + "grad_norm": 0.5769960284233093, + "learning_rate": 7.225690526986326e-05, + "loss": 1.7853, + "step": 6531 + }, + { + "epoch": 0.3640822696616688, + "grad_norm": 0.5400393605232239, + "learning_rate": 7.224893151235251e-05, + "loss": 1.5544, + "step": 6532 + }, + { + "epoch": 0.364138007914832, + "grad_norm": 0.5720942616462708, + "learning_rate": 7.224095704923537e-05, + "loss": 1.9824, + "step": 6533 + }, + { + "epoch": 0.3641937461679951, + "grad_norm": 0.5403176546096802, + "learning_rate": 7.223298188076475e-05, + "loss": 1.5978, + "step": 6534 + }, + { + "epoch": 0.36424948442115823, + "grad_norm": 0.5350765585899353, + "learning_rate": 7.222500600719356e-05, + "loss": 1.5945, + "step": 6535 + }, + { + "epoch": 0.36430522267432136, + "grad_norm": 0.542413592338562, + "learning_rate": 7.221702942877477e-05, + "loss": 1.717, + "step": 6536 + }, + { + "epoch": 0.36436096092748455, + "grad_norm": 0.5472922921180725, + "learning_rate": 7.220905214576134e-05, + "loss": 1.6535, + "step": 6537 + }, + { + "epoch": 0.3644166991806477, + "grad_norm": 0.5479559302330017, + "learning_rate": 7.220107415840626e-05, + "loss": 1.5444, + "step": 6538 + }, + { + "epoch": 0.3644724374338108, + "grad_norm": 0.5131190419197083, + "learning_rate": 7.219309546696255e-05, + "loss": 1.3543, + "step": 6539 + }, + { + "epoch": 0.364528175686974, + "grad_norm": 0.5852962732315063, + "learning_rate": 7.218511607168326e-05, + "loss": 1.7422, + "step": 6540 + }, + { + "epoch": 0.3645839139401371, + "grad_norm": 0.5998173356056213, + "learning_rate": 7.21771359728214e-05, + "loss": 1.9125, + "step": 6541 + }, + { + "epoch": 0.36463965219330025, + "grad_norm": 0.5412231683731079, + "learning_rate": 7.216915517063012e-05, + "loss": 1.8743, + "step": 6542 + }, + { + "epoch": 0.36469539044646343, + "grad_norm": 0.5305824279785156, + "learning_rate": 7.216117366536249e-05, + "loss": 1.476, + "step": 6543 + }, + { + "epoch": 0.36475112869962656, + "grad_norm": 0.586646556854248, + "learning_rate": 7.215319145727161e-05, + "loss": 1.7591, + "step": 6544 + }, + { + "epoch": 0.3648068669527897, + "grad_norm": 0.5975636839866638, + "learning_rate": 7.214520854661071e-05, + "loss": 1.9996, + "step": 6545 + }, + { + "epoch": 0.3648626052059529, + "grad_norm": 0.543255627155304, + "learning_rate": 7.213722493363288e-05, + "loss": 1.6909, + "step": 6546 + }, + { + "epoch": 0.364918343459116, + "grad_norm": 0.5423970222473145, + "learning_rate": 7.212924061859135e-05, + "loss": 1.6429, + "step": 6547 + }, + { + "epoch": 0.36497408171227913, + "grad_norm": 0.5859336256980896, + "learning_rate": 7.212125560173933e-05, + "loss": 1.9055, + "step": 6548 + }, + { + "epoch": 0.36502981996544226, + "grad_norm": 0.5425530672073364, + "learning_rate": 7.211326988333006e-05, + "loss": 1.7073, + "step": 6549 + }, + { + "epoch": 0.36508555821860544, + "grad_norm": 0.580575168132782, + "learning_rate": 7.210528346361678e-05, + "loss": 1.6739, + "step": 6550 + }, + { + "epoch": 0.3651412964717686, + "grad_norm": 0.599686324596405, + "learning_rate": 7.209729634285282e-05, + "loss": 1.7179, + "step": 6551 + }, + { + "epoch": 0.3651970347249317, + "grad_norm": 0.5199704170227051, + "learning_rate": 7.208930852129143e-05, + "loss": 1.7484, + "step": 6552 + }, + { + "epoch": 0.3652527729780949, + "grad_norm": 0.5557204484939575, + "learning_rate": 7.208131999918599e-05, + "loss": 1.6169, + "step": 6553 + }, + { + "epoch": 0.365308511231258, + "grad_norm": 0.5307885408401489, + "learning_rate": 7.20733307767898e-05, + "loss": 1.4029, + "step": 6554 + }, + { + "epoch": 0.36536424948442114, + "grad_norm": 0.5462751388549805, + "learning_rate": 7.206534085435625e-05, + "loss": 1.6399, + "step": 6555 + }, + { + "epoch": 0.36541998773758433, + "grad_norm": 0.5815526247024536, + "learning_rate": 7.205735023213877e-05, + "loss": 1.7943, + "step": 6556 + }, + { + "epoch": 0.36547572599074746, + "grad_norm": 0.5783229470252991, + "learning_rate": 7.204935891039071e-05, + "loss": 1.7919, + "step": 6557 + }, + { + "epoch": 0.3655314642439106, + "grad_norm": 0.5671087503433228, + "learning_rate": 7.204136688936556e-05, + "loss": 1.8125, + "step": 6558 + }, + { + "epoch": 0.3655872024970737, + "grad_norm": 0.5661280751228333, + "learning_rate": 7.203337416931675e-05, + "loss": 1.6377, + "step": 6559 + }, + { + "epoch": 0.3656429407502369, + "grad_norm": 0.5448043942451477, + "learning_rate": 7.202538075049781e-05, + "loss": 1.6929, + "step": 6560 + }, + { + "epoch": 0.36569867900340003, + "grad_norm": 0.5517578125, + "learning_rate": 7.201738663316217e-05, + "loss": 1.7849, + "step": 6561 + }, + { + "epoch": 0.36575441725656316, + "grad_norm": 0.5554426312446594, + "learning_rate": 7.200939181756341e-05, + "loss": 1.3314, + "step": 6562 + }, + { + "epoch": 0.36581015550972634, + "grad_norm": 0.5693673491477966, + "learning_rate": 7.200139630395507e-05, + "loss": 1.8517, + "step": 6563 + }, + { + "epoch": 0.36586589376288947, + "grad_norm": 0.5405746698379517, + "learning_rate": 7.199340009259072e-05, + "loss": 1.58, + "step": 6564 + }, + { + "epoch": 0.3659216320160526, + "grad_norm": 0.633660078048706, + "learning_rate": 7.198540318372394e-05, + "loss": 1.9478, + "step": 6565 + }, + { + "epoch": 0.3659773702692158, + "grad_norm": 0.5665812492370605, + "learning_rate": 7.197740557760834e-05, + "loss": 1.6334, + "step": 6566 + }, + { + "epoch": 0.3660331085223789, + "grad_norm": 0.549199104309082, + "learning_rate": 7.196940727449759e-05, + "loss": 1.4779, + "step": 6567 + }, + { + "epoch": 0.36608884677554204, + "grad_norm": 0.49754953384399414, + "learning_rate": 7.196140827464533e-05, + "loss": 1.5101, + "step": 6568 + }, + { + "epoch": 0.3661445850287052, + "grad_norm": 0.5829338431358337, + "learning_rate": 7.195340857830524e-05, + "loss": 1.7219, + "step": 6569 + }, + { + "epoch": 0.36620032328186836, + "grad_norm": 0.5498637557029724, + "learning_rate": 7.194540818573103e-05, + "loss": 1.6491, + "step": 6570 + }, + { + "epoch": 0.3662560615350315, + "grad_norm": 0.5562663674354553, + "learning_rate": 7.193740709717643e-05, + "loss": 1.7846, + "step": 6571 + }, + { + "epoch": 0.3663117997881946, + "grad_norm": 0.5268211364746094, + "learning_rate": 7.192940531289517e-05, + "loss": 1.5521, + "step": 6572 + }, + { + "epoch": 0.3663675380413578, + "grad_norm": 0.5425642132759094, + "learning_rate": 7.192140283314104e-05, + "loss": 1.7968, + "step": 6573 + }, + { + "epoch": 0.3664232762945209, + "grad_norm": 0.5653149485588074, + "learning_rate": 7.191339965816781e-05, + "loss": 1.6086, + "step": 6574 + }, + { + "epoch": 0.36647901454768406, + "grad_norm": 0.5728870630264282, + "learning_rate": 7.190539578822932e-05, + "loss": 1.8264, + "step": 6575 + }, + { + "epoch": 0.36653475280084724, + "grad_norm": 0.5501007437705994, + "learning_rate": 7.189739122357939e-05, + "loss": 1.8426, + "step": 6576 + }, + { + "epoch": 0.36659049105401037, + "grad_norm": 0.5318872332572937, + "learning_rate": 7.188938596447188e-05, + "loss": 1.7968, + "step": 6577 + }, + { + "epoch": 0.3666462293071735, + "grad_norm": 0.5750231146812439, + "learning_rate": 7.188138001116065e-05, + "loss": 1.6745, + "step": 6578 + }, + { + "epoch": 0.3667019675603367, + "grad_norm": 0.6171157956123352, + "learning_rate": 7.187337336389966e-05, + "loss": 2.0264, + "step": 6579 + }, + { + "epoch": 0.3667577058134998, + "grad_norm": 0.5361387133598328, + "learning_rate": 7.186536602294278e-05, + "loss": 1.5105, + "step": 6580 + }, + { + "epoch": 0.36681344406666294, + "grad_norm": 0.5726244449615479, + "learning_rate": 7.185735798854396e-05, + "loss": 1.6055, + "step": 6581 + }, + { + "epoch": 0.36686918231982607, + "grad_norm": 0.5350404381752014, + "learning_rate": 7.184934926095721e-05, + "loss": 1.7493, + "step": 6582 + }, + { + "epoch": 0.36692492057298925, + "grad_norm": 0.5755828022956848, + "learning_rate": 7.184133984043646e-05, + "loss": 1.6443, + "step": 6583 + }, + { + "epoch": 0.3669806588261524, + "grad_norm": 0.5558964610099792, + "learning_rate": 7.183332972723578e-05, + "loss": 1.816, + "step": 6584 + }, + { + "epoch": 0.3670363970793155, + "grad_norm": 0.5483201146125793, + "learning_rate": 7.182531892160917e-05, + "loss": 1.6545, + "step": 6585 + }, + { + "epoch": 0.3670921353324787, + "grad_norm": 0.5599815249443054, + "learning_rate": 7.18173074238107e-05, + "loss": 1.634, + "step": 6586 + }, + { + "epoch": 0.3671478735856418, + "grad_norm": 0.5529213547706604, + "learning_rate": 7.180929523409443e-05, + "loss": 1.7378, + "step": 6587 + }, + { + "epoch": 0.36720361183880496, + "grad_norm": 0.5131180286407471, + "learning_rate": 7.180128235271449e-05, + "loss": 1.5528, + "step": 6588 + }, + { + "epoch": 0.36725935009196814, + "grad_norm": 0.591602623462677, + "learning_rate": 7.179326877992497e-05, + "loss": 1.7482, + "step": 6589 + }, + { + "epoch": 0.36731508834513127, + "grad_norm": 0.4902382791042328, + "learning_rate": 7.178525451598003e-05, + "loss": 1.4865, + "step": 6590 + }, + { + "epoch": 0.3673708265982944, + "grad_norm": 0.5887609720230103, + "learning_rate": 7.177723956113383e-05, + "loss": 1.9031, + "step": 6591 + }, + { + "epoch": 0.3674265648514576, + "grad_norm": 0.5403375625610352, + "learning_rate": 7.176922391564056e-05, + "loss": 1.6702, + "step": 6592 + }, + { + "epoch": 0.3674823031046207, + "grad_norm": 0.5793707370758057, + "learning_rate": 7.176120757975444e-05, + "loss": 1.6571, + "step": 6593 + }, + { + "epoch": 0.36753804135778384, + "grad_norm": 0.5770851373672485, + "learning_rate": 7.175319055372969e-05, + "loss": 1.7841, + "step": 6594 + }, + { + "epoch": 0.36759377961094697, + "grad_norm": 0.5472514629364014, + "learning_rate": 7.174517283782058e-05, + "loss": 1.6785, + "step": 6595 + }, + { + "epoch": 0.36764951786411015, + "grad_norm": 0.5961628556251526, + "learning_rate": 7.173715443228133e-05, + "loss": 1.6604, + "step": 6596 + }, + { + "epoch": 0.3677052561172733, + "grad_norm": 0.5890954732894897, + "learning_rate": 7.172913533736632e-05, + "loss": 1.7003, + "step": 6597 + }, + { + "epoch": 0.3677609943704364, + "grad_norm": 0.6537253260612488, + "learning_rate": 7.17211155533298e-05, + "loss": 1.9955, + "step": 6598 + }, + { + "epoch": 0.3678167326235996, + "grad_norm": 0.5514366030693054, + "learning_rate": 7.171309508042615e-05, + "loss": 1.5601, + "step": 6599 + }, + { + "epoch": 0.3678724708767627, + "grad_norm": 0.6790293455123901, + "learning_rate": 7.170507391890972e-05, + "loss": 2.1675, + "step": 6600 + }, + { + "epoch": 0.36792820912992585, + "grad_norm": 0.5294934511184692, + "learning_rate": 7.16970520690349e-05, + "loss": 1.6509, + "step": 6601 + }, + { + "epoch": 0.36798394738308904, + "grad_norm": 0.5617215037345886, + "learning_rate": 7.168902953105608e-05, + "loss": 1.7301, + "step": 6602 + }, + { + "epoch": 0.36803968563625217, + "grad_norm": 0.5187042355537415, + "learning_rate": 7.16810063052277e-05, + "loss": 1.4945, + "step": 6603 + }, + { + "epoch": 0.3680954238894153, + "grad_norm": 0.5646756291389465, + "learning_rate": 7.16729823918042e-05, + "loss": 1.8281, + "step": 6604 + }, + { + "epoch": 0.3681511621425784, + "grad_norm": 0.5496782064437866, + "learning_rate": 7.166495779104007e-05, + "loss": 1.6996, + "step": 6605 + }, + { + "epoch": 0.3682069003957416, + "grad_norm": 0.6056029796600342, + "learning_rate": 7.16569325031898e-05, + "loss": 1.9787, + "step": 6606 + }, + { + "epoch": 0.36826263864890474, + "grad_norm": 0.5624659061431885, + "learning_rate": 7.164890652850789e-05, + "loss": 1.7931, + "step": 6607 + }, + { + "epoch": 0.36831837690206787, + "grad_norm": 0.5342402458190918, + "learning_rate": 7.16408798672489e-05, + "loss": 1.664, + "step": 6608 + }, + { + "epoch": 0.36837411515523105, + "grad_norm": 0.5402200818061829, + "learning_rate": 7.163285251966736e-05, + "loss": 1.6754, + "step": 6609 + }, + { + "epoch": 0.3684298534083942, + "grad_norm": 0.5262821316719055, + "learning_rate": 7.162482448601789e-05, + "loss": 1.5501, + "step": 6610 + }, + { + "epoch": 0.3684855916615573, + "grad_norm": 0.5371507406234741, + "learning_rate": 7.161679576655503e-05, + "loss": 1.6168, + "step": 6611 + }, + { + "epoch": 0.3685413299147205, + "grad_norm": 0.5895312428474426, + "learning_rate": 7.160876636153349e-05, + "loss": 1.8576, + "step": 6612 + }, + { + "epoch": 0.3685970681678836, + "grad_norm": 0.5309399962425232, + "learning_rate": 7.160073627120784e-05, + "loss": 1.5803, + "step": 6613 + }, + { + "epoch": 0.36865280642104675, + "grad_norm": 0.564697265625, + "learning_rate": 7.159270549583278e-05, + "loss": 1.2999, + "step": 6614 + }, + { + "epoch": 0.36870854467420994, + "grad_norm": 0.5483527183532715, + "learning_rate": 7.158467403566299e-05, + "loss": 1.559, + "step": 6615 + }, + { + "epoch": 0.36876428292737307, + "grad_norm": 0.47662925720214844, + "learning_rate": 7.15766418909532e-05, + "loss": 1.2871, + "step": 6616 + }, + { + "epoch": 0.3688200211805362, + "grad_norm": 0.5505543947219849, + "learning_rate": 7.156860906195811e-05, + "loss": 1.717, + "step": 6617 + }, + { + "epoch": 0.3688757594336993, + "grad_norm": 0.5837799310684204, + "learning_rate": 7.156057554893251e-05, + "loss": 1.8828, + "step": 6618 + }, + { + "epoch": 0.3689314976868625, + "grad_norm": 0.6020135283470154, + "learning_rate": 7.155254135213117e-05, + "loss": 1.6727, + "step": 6619 + }, + { + "epoch": 0.36898723594002564, + "grad_norm": 0.5805865526199341, + "learning_rate": 7.154450647180886e-05, + "loss": 1.7273, + "step": 6620 + }, + { + "epoch": 0.36904297419318877, + "grad_norm": 0.5338916182518005, + "learning_rate": 7.153647090822043e-05, + "loss": 1.5732, + "step": 6621 + }, + { + "epoch": 0.36909871244635195, + "grad_norm": 0.5388802886009216, + "learning_rate": 7.152843466162069e-05, + "loss": 1.5612, + "step": 6622 + }, + { + "epoch": 0.3691544506995151, + "grad_norm": 0.5497878789901733, + "learning_rate": 7.152039773226456e-05, + "loss": 1.6601, + "step": 6623 + }, + { + "epoch": 0.3692101889526782, + "grad_norm": 0.5147888660430908, + "learning_rate": 7.151236012040685e-05, + "loss": 1.6467, + "step": 6624 + }, + { + "epoch": 0.3692659272058414, + "grad_norm": 0.5906471014022827, + "learning_rate": 7.150432182630252e-05, + "loss": 1.6429, + "step": 6625 + }, + { + "epoch": 0.3693216654590045, + "grad_norm": 0.5193469524383545, + "learning_rate": 7.149628285020648e-05, + "loss": 1.7369, + "step": 6626 + }, + { + "epoch": 0.36937740371216765, + "grad_norm": 0.5903412699699402, + "learning_rate": 7.148824319237367e-05, + "loss": 1.7329, + "step": 6627 + }, + { + "epoch": 0.3694331419653308, + "grad_norm": 0.5230131149291992, + "learning_rate": 7.148020285305907e-05, + "loss": 1.5495, + "step": 6628 + }, + { + "epoch": 0.36948888021849396, + "grad_norm": 0.5554400086402893, + "learning_rate": 7.147216183251768e-05, + "loss": 1.7592, + "step": 6629 + }, + { + "epoch": 0.3695446184716571, + "grad_norm": 0.4992237985134125, + "learning_rate": 7.146412013100451e-05, + "loss": 1.5094, + "step": 6630 + }, + { + "epoch": 0.3696003567248202, + "grad_norm": 0.6239908933639526, + "learning_rate": 7.14560777487746e-05, + "loss": 1.9804, + "step": 6631 + }, + { + "epoch": 0.3696560949779834, + "grad_norm": 0.49736112356185913, + "learning_rate": 7.144803468608298e-05, + "loss": 1.4165, + "step": 6632 + }, + { + "epoch": 0.36971183323114654, + "grad_norm": 0.5291538834571838, + "learning_rate": 7.143999094318477e-05, + "loss": 1.6362, + "step": 6633 + }, + { + "epoch": 0.36976757148430967, + "grad_norm": 0.5881434679031372, + "learning_rate": 7.143194652033505e-05, + "loss": 1.8459, + "step": 6634 + }, + { + "epoch": 0.36982330973747285, + "grad_norm": 0.5663610100746155, + "learning_rate": 7.142390141778895e-05, + "loss": 1.655, + "step": 6635 + }, + { + "epoch": 0.369879047990636, + "grad_norm": 0.6780499219894409, + "learning_rate": 7.141585563580158e-05, + "loss": 1.8284, + "step": 6636 + }, + { + "epoch": 0.3699347862437991, + "grad_norm": 0.544389009475708, + "learning_rate": 7.140780917462814e-05, + "loss": 1.6024, + "step": 6637 + }, + { + "epoch": 0.3699905244969623, + "grad_norm": 0.5259643197059631, + "learning_rate": 7.139976203452383e-05, + "loss": 1.6143, + "step": 6638 + }, + { + "epoch": 0.3700462627501254, + "grad_norm": 0.5904932022094727, + "learning_rate": 7.139171421574383e-05, + "loss": 1.7714, + "step": 6639 + }, + { + "epoch": 0.37010200100328855, + "grad_norm": 0.5398536920547485, + "learning_rate": 7.138366571854338e-05, + "loss": 1.5943, + "step": 6640 + }, + { + "epoch": 0.3701577392564517, + "grad_norm": 0.5698688626289368, + "learning_rate": 7.137561654317772e-05, + "loss": 1.7892, + "step": 6641 + }, + { + "epoch": 0.37021347750961486, + "grad_norm": 0.5498561859130859, + "learning_rate": 7.136756668990213e-05, + "loss": 1.7051, + "step": 6642 + }, + { + "epoch": 0.370269215762778, + "grad_norm": 0.5418841242790222, + "learning_rate": 7.13595161589719e-05, + "loss": 1.6284, + "step": 6643 + }, + { + "epoch": 0.3703249540159411, + "grad_norm": 0.5735422968864441, + "learning_rate": 7.135146495064236e-05, + "loss": 1.5837, + "step": 6644 + }, + { + "epoch": 0.3703806922691043, + "grad_norm": 0.593471348285675, + "learning_rate": 7.134341306516885e-05, + "loss": 1.891, + "step": 6645 + }, + { + "epoch": 0.37043643052226743, + "grad_norm": 0.519626796245575, + "learning_rate": 7.13353605028067e-05, + "loss": 1.676, + "step": 6646 + }, + { + "epoch": 0.37049216877543056, + "grad_norm": 0.59029620885849, + "learning_rate": 7.132730726381134e-05, + "loss": 1.8638, + "step": 6647 + }, + { + "epoch": 0.37054790702859375, + "grad_norm": 0.6374014019966125, + "learning_rate": 7.13192533484381e-05, + "loss": 2.0887, + "step": 6648 + }, + { + "epoch": 0.3706036452817569, + "grad_norm": 0.5250412821769714, + "learning_rate": 7.131119875694246e-05, + "loss": 1.5408, + "step": 6649 + }, + { + "epoch": 0.37065938353492, + "grad_norm": 0.5467897653579712, + "learning_rate": 7.130314348957986e-05, + "loss": 1.4246, + "step": 6650 + }, + { + "epoch": 0.37071512178808314, + "grad_norm": 0.5109268426895142, + "learning_rate": 7.129508754660575e-05, + "loss": 1.4972, + "step": 6651 + }, + { + "epoch": 0.3707708600412463, + "grad_norm": 0.5759547352790833, + "learning_rate": 7.128703092827562e-05, + "loss": 1.9089, + "step": 6652 + }, + { + "epoch": 0.37082659829440945, + "grad_norm": 0.6243898272514343, + "learning_rate": 7.127897363484497e-05, + "loss": 1.9196, + "step": 6653 + }, + { + "epoch": 0.3708823365475726, + "grad_norm": 0.5852481722831726, + "learning_rate": 7.127091566656936e-05, + "loss": 1.7842, + "step": 6654 + }, + { + "epoch": 0.37093807480073576, + "grad_norm": 0.5579434037208557, + "learning_rate": 7.12628570237043e-05, + "loss": 1.6261, + "step": 6655 + }, + { + "epoch": 0.3709938130538989, + "grad_norm": 0.5315961837768555, + "learning_rate": 7.125479770650539e-05, + "loss": 1.6085, + "step": 6656 + }, + { + "epoch": 0.371049551307062, + "grad_norm": 0.5678053498268127, + "learning_rate": 7.124673771522824e-05, + "loss": 1.905, + "step": 6657 + }, + { + "epoch": 0.3711052895602252, + "grad_norm": 0.5308210849761963, + "learning_rate": 7.123867705012843e-05, + "loss": 1.5081, + "step": 6658 + }, + { + "epoch": 0.37116102781338833, + "grad_norm": 0.5750522017478943, + "learning_rate": 7.123061571146161e-05, + "loss": 1.6793, + "step": 6659 + }, + { + "epoch": 0.37121676606655146, + "grad_norm": 0.5785144567489624, + "learning_rate": 7.122255369948346e-05, + "loss": 1.6402, + "step": 6660 + }, + { + "epoch": 0.37127250431971465, + "grad_norm": 0.5107117891311646, + "learning_rate": 7.121449101444964e-05, + "loss": 1.6232, + "step": 6661 + }, + { + "epoch": 0.3713282425728778, + "grad_norm": 0.5365573763847351, + "learning_rate": 7.120642765661584e-05, + "loss": 1.7163, + "step": 6662 + }, + { + "epoch": 0.3713839808260409, + "grad_norm": 0.5924217104911804, + "learning_rate": 7.119836362623781e-05, + "loss": 1.9706, + "step": 6663 + }, + { + "epoch": 0.37143971907920403, + "grad_norm": 0.5683318972587585, + "learning_rate": 7.119029892357128e-05, + "loss": 1.9116, + "step": 6664 + }, + { + "epoch": 0.3714954573323672, + "grad_norm": 0.524502694606781, + "learning_rate": 7.118223354887201e-05, + "loss": 1.5862, + "step": 6665 + }, + { + "epoch": 0.37155119558553035, + "grad_norm": 0.5245027542114258, + "learning_rate": 7.11741675023958e-05, + "loss": 1.5945, + "step": 6666 + }, + { + "epoch": 0.3716069338386935, + "grad_norm": 0.5658608675003052, + "learning_rate": 7.116610078439845e-05, + "loss": 1.5802, + "step": 6667 + }, + { + "epoch": 0.37166267209185666, + "grad_norm": 0.5938420295715332, + "learning_rate": 7.115803339513578e-05, + "loss": 2.005, + "step": 6668 + }, + { + "epoch": 0.3717184103450198, + "grad_norm": 0.5456317663192749, + "learning_rate": 7.114996533486366e-05, + "loss": 1.5013, + "step": 6669 + }, + { + "epoch": 0.3717741485981829, + "grad_norm": 0.5922924280166626, + "learning_rate": 7.114189660383794e-05, + "loss": 2.0418, + "step": 6670 + }, + { + "epoch": 0.3718298868513461, + "grad_norm": 0.5821951627731323, + "learning_rate": 7.113382720231454e-05, + "loss": 1.7955, + "step": 6671 + }, + { + "epoch": 0.37188562510450923, + "grad_norm": 0.5134814381599426, + "learning_rate": 7.112575713054936e-05, + "loss": 1.4315, + "step": 6672 + }, + { + "epoch": 0.37194136335767236, + "grad_norm": 0.5751433968544006, + "learning_rate": 7.111768638879833e-05, + "loss": 1.566, + "step": 6673 + }, + { + "epoch": 0.3719971016108355, + "grad_norm": 0.5614348649978638, + "learning_rate": 7.110961497731742e-05, + "loss": 1.8572, + "step": 6674 + }, + { + "epoch": 0.3720528398639987, + "grad_norm": 0.5680375099182129, + "learning_rate": 7.110154289636259e-05, + "loss": 2.0372, + "step": 6675 + }, + { + "epoch": 0.3721085781171618, + "grad_norm": 0.5367892980575562, + "learning_rate": 7.109347014618985e-05, + "loss": 1.6665, + "step": 6676 + }, + { + "epoch": 0.37216431637032493, + "grad_norm": 0.563017725944519, + "learning_rate": 7.108539672705523e-05, + "loss": 1.747, + "step": 6677 + }, + { + "epoch": 0.3722200546234881, + "grad_norm": 0.5716055631637573, + "learning_rate": 7.107732263921475e-05, + "loss": 1.4182, + "step": 6678 + }, + { + "epoch": 0.37227579287665125, + "grad_norm": 0.514310896396637, + "learning_rate": 7.106924788292448e-05, + "loss": 1.6223, + "step": 6679 + }, + { + "epoch": 0.3723315311298144, + "grad_norm": 0.5039160251617432, + "learning_rate": 7.106117245844054e-05, + "loss": 1.5979, + "step": 6680 + }, + { + "epoch": 0.37238726938297756, + "grad_norm": 0.5815281867980957, + "learning_rate": 7.105309636601898e-05, + "loss": 1.9983, + "step": 6681 + }, + { + "epoch": 0.3724430076361407, + "grad_norm": 0.5450384616851807, + "learning_rate": 7.104501960591595e-05, + "loss": 1.5488, + "step": 6682 + }, + { + "epoch": 0.3724987458893038, + "grad_norm": 0.5386560559272766, + "learning_rate": 7.103694217838761e-05, + "loss": 1.6376, + "step": 6683 + }, + { + "epoch": 0.372554484142467, + "grad_norm": 0.5220578908920288, + "learning_rate": 7.102886408369012e-05, + "loss": 1.4654, + "step": 6684 + }, + { + "epoch": 0.37261022239563013, + "grad_norm": 0.5630038976669312, + "learning_rate": 7.102078532207966e-05, + "loss": 1.7554, + "step": 6685 + }, + { + "epoch": 0.37266596064879326, + "grad_norm": 0.5405006408691406, + "learning_rate": 7.101270589381245e-05, + "loss": 1.8247, + "step": 6686 + }, + { + "epoch": 0.3727216989019564, + "grad_norm": 0.5460960865020752, + "learning_rate": 7.100462579914474e-05, + "loss": 1.7902, + "step": 6687 + }, + { + "epoch": 0.3727774371551196, + "grad_norm": 0.5519078969955444, + "learning_rate": 7.099654503833273e-05, + "loss": 1.7138, + "step": 6688 + }, + { + "epoch": 0.3728331754082827, + "grad_norm": 0.5574856400489807, + "learning_rate": 7.098846361163273e-05, + "loss": 1.6607, + "step": 6689 + }, + { + "epoch": 0.37288891366144583, + "grad_norm": 0.5525651574134827, + "learning_rate": 7.098038151930107e-05, + "loss": 1.8834, + "step": 6690 + }, + { + "epoch": 0.372944651914609, + "grad_norm": 0.5278156399726868, + "learning_rate": 7.097229876159401e-05, + "loss": 1.67, + "step": 6691 + }, + { + "epoch": 0.37300039016777214, + "grad_norm": 0.5362699627876282, + "learning_rate": 7.096421533876792e-05, + "loss": 1.6881, + "step": 6692 + }, + { + "epoch": 0.3730561284209353, + "grad_norm": 0.522748589515686, + "learning_rate": 7.095613125107915e-05, + "loss": 1.6077, + "step": 6693 + }, + { + "epoch": 0.37311186667409846, + "grad_norm": 0.5335802435874939, + "learning_rate": 7.094804649878407e-05, + "loss": 1.6124, + "step": 6694 + }, + { + "epoch": 0.3731676049272616, + "grad_norm": 0.5322664976119995, + "learning_rate": 7.093996108213909e-05, + "loss": 1.735, + "step": 6695 + }, + { + "epoch": 0.3732233431804247, + "grad_norm": 0.5863260626792908, + "learning_rate": 7.093187500140064e-05, + "loss": 1.9465, + "step": 6696 + }, + { + "epoch": 0.37327908143358784, + "grad_norm": 0.5546720623970032, + "learning_rate": 7.092378825682517e-05, + "loss": 1.6817, + "step": 6697 + }, + { + "epoch": 0.37333481968675103, + "grad_norm": 0.5397077798843384, + "learning_rate": 7.091570084866909e-05, + "loss": 1.7072, + "step": 6698 + }, + { + "epoch": 0.37339055793991416, + "grad_norm": 0.5567345023155212, + "learning_rate": 7.090761277718897e-05, + "loss": 1.7315, + "step": 6699 + }, + { + "epoch": 0.3734462961930773, + "grad_norm": 0.5560916662216187, + "learning_rate": 7.089952404264126e-05, + "loss": 1.5599, + "step": 6700 + }, + { + "epoch": 0.37350203444624047, + "grad_norm": 0.5497678518295288, + "learning_rate": 7.089143464528249e-05, + "loss": 1.6328, + "step": 6701 + }, + { + "epoch": 0.3735577726994036, + "grad_norm": 0.5806947946548462, + "learning_rate": 7.088334458536921e-05, + "loss": 1.8025, + "step": 6702 + }, + { + "epoch": 0.37361351095256673, + "grad_norm": 0.6178561449050903, + "learning_rate": 7.087525386315802e-05, + "loss": 1.6715, + "step": 6703 + }, + { + "epoch": 0.3736692492057299, + "grad_norm": 0.5702304244041443, + "learning_rate": 7.086716247890548e-05, + "loss": 1.7321, + "step": 6704 + }, + { + "epoch": 0.37372498745889304, + "grad_norm": 0.5194035172462463, + "learning_rate": 7.08590704328682e-05, + "loss": 1.5648, + "step": 6705 + }, + { + "epoch": 0.37378072571205617, + "grad_norm": 0.5901757478713989, + "learning_rate": 7.085097772530283e-05, + "loss": 1.9348, + "step": 6706 + }, + { + "epoch": 0.37383646396521936, + "grad_norm": 0.7031030654907227, + "learning_rate": 7.084288435646603e-05, + "loss": 1.5634, + "step": 6707 + }, + { + "epoch": 0.3738922022183825, + "grad_norm": 0.5556403398513794, + "learning_rate": 7.083479032661445e-05, + "loss": 1.6525, + "step": 6708 + }, + { + "epoch": 0.3739479404715456, + "grad_norm": 0.5691899061203003, + "learning_rate": 7.082669563600478e-05, + "loss": 1.885, + "step": 6709 + }, + { + "epoch": 0.37400367872470874, + "grad_norm": 0.5547059774398804, + "learning_rate": 7.081860028489377e-05, + "loss": 1.8645, + "step": 6710 + }, + { + "epoch": 0.37405941697787193, + "grad_norm": 0.5635570287704468, + "learning_rate": 7.081050427353814e-05, + "loss": 1.8752, + "step": 6711 + }, + { + "epoch": 0.37411515523103506, + "grad_norm": 0.5423487424850464, + "learning_rate": 7.080240760219465e-05, + "loss": 1.5953, + "step": 6712 + }, + { + "epoch": 0.3741708934841982, + "grad_norm": 0.5141568183898926, + "learning_rate": 7.079431027112006e-05, + "loss": 1.4812, + "step": 6713 + }, + { + "epoch": 0.37422663173736137, + "grad_norm": 0.5988462567329407, + "learning_rate": 7.078621228057121e-05, + "loss": 1.8588, + "step": 6714 + }, + { + "epoch": 0.3742823699905245, + "grad_norm": 0.5320055484771729, + "learning_rate": 7.077811363080489e-05, + "loss": 1.745, + "step": 6715 + }, + { + "epoch": 0.37433810824368763, + "grad_norm": 0.5388814806938171, + "learning_rate": 7.077001432207795e-05, + "loss": 1.511, + "step": 6716 + }, + { + "epoch": 0.3743938464968508, + "grad_norm": 0.537324070930481, + "learning_rate": 7.076191435464725e-05, + "loss": 1.6644, + "step": 6717 + }, + { + "epoch": 0.37444958475001394, + "grad_norm": 0.533687174320221, + "learning_rate": 7.075381372876967e-05, + "loss": 1.73, + "step": 6718 + }, + { + "epoch": 0.37450532300317707, + "grad_norm": 0.5057275295257568, + "learning_rate": 7.074571244470214e-05, + "loss": 1.6284, + "step": 6719 + }, + { + "epoch": 0.3745610612563402, + "grad_norm": 0.6067156195640564, + "learning_rate": 7.073761050270156e-05, + "loss": 1.84, + "step": 6720 + }, + { + "epoch": 0.3746167995095034, + "grad_norm": 0.5253334641456604, + "learning_rate": 7.072950790302487e-05, + "loss": 1.4598, + "step": 6721 + }, + { + "epoch": 0.3746725377626665, + "grad_norm": 0.521193265914917, + "learning_rate": 7.072140464592907e-05, + "loss": 1.5442, + "step": 6722 + }, + { + "epoch": 0.37472827601582964, + "grad_norm": 0.5262565612792969, + "learning_rate": 7.071330073167112e-05, + "loss": 1.6898, + "step": 6723 + }, + { + "epoch": 0.3747840142689928, + "grad_norm": 0.6259338855743408, + "learning_rate": 7.070519616050804e-05, + "loss": 1.731, + "step": 6724 + }, + { + "epoch": 0.37483975252215596, + "grad_norm": 0.5520288348197937, + "learning_rate": 7.069709093269687e-05, + "loss": 1.796, + "step": 6725 + }, + { + "epoch": 0.3748954907753191, + "grad_norm": 0.5660863518714905, + "learning_rate": 7.068898504849462e-05, + "loss": 1.656, + "step": 6726 + }, + { + "epoch": 0.37495122902848227, + "grad_norm": 0.5522897839546204, + "learning_rate": 7.06808785081584e-05, + "loss": 1.6656, + "step": 6727 + }, + { + "epoch": 0.3750069672816454, + "grad_norm": 0.6100639700889587, + "learning_rate": 7.067277131194529e-05, + "loss": 1.7658, + "step": 6728 + }, + { + "epoch": 0.3750627055348085, + "grad_norm": 0.5829086899757385, + "learning_rate": 7.066466346011242e-05, + "loss": 1.6342, + "step": 6729 + }, + { + "epoch": 0.3751184437879717, + "grad_norm": 0.6315231323242188, + "learning_rate": 7.06565549529169e-05, + "loss": 1.7829, + "step": 6730 + }, + { + "epoch": 0.37517418204113484, + "grad_norm": 0.6006489992141724, + "learning_rate": 7.064844579061588e-05, + "loss": 1.8819, + "step": 6731 + }, + { + "epoch": 0.37522992029429797, + "grad_norm": 0.5952304005622864, + "learning_rate": 7.064033597346658e-05, + "loss": 1.6654, + "step": 6732 + }, + { + "epoch": 0.3752856585474611, + "grad_norm": 0.5768652558326721, + "learning_rate": 7.063222550172612e-05, + "loss": 1.6577, + "step": 6733 + }, + { + "epoch": 0.3753413968006243, + "grad_norm": 0.5706788301467896, + "learning_rate": 7.062411437565179e-05, + "loss": 1.7532, + "step": 6734 + }, + { + "epoch": 0.3753971350537874, + "grad_norm": 0.6298890113830566, + "learning_rate": 7.06160025955008e-05, + "loss": 1.7744, + "step": 6735 + }, + { + "epoch": 0.37545287330695054, + "grad_norm": 0.5873239636421204, + "learning_rate": 7.06078901615304e-05, + "loss": 1.9847, + "step": 6736 + }, + { + "epoch": 0.3755086115601137, + "grad_norm": 0.5103023648262024, + "learning_rate": 7.059977707399787e-05, + "loss": 1.4559, + "step": 6737 + }, + { + "epoch": 0.37556434981327685, + "grad_norm": 0.521653950214386, + "learning_rate": 7.059166333316054e-05, + "loss": 1.6796, + "step": 6738 + }, + { + "epoch": 0.37562008806644, + "grad_norm": 0.5209727883338928, + "learning_rate": 7.058354893927568e-05, + "loss": 1.5015, + "step": 6739 + }, + { + "epoch": 0.37567582631960317, + "grad_norm": 0.6425443887710571, + "learning_rate": 7.057543389260068e-05, + "loss": 1.8178, + "step": 6740 + }, + { + "epoch": 0.3757315645727663, + "grad_norm": 0.5647505521774292, + "learning_rate": 7.056731819339286e-05, + "loss": 1.7513, + "step": 6741 + }, + { + "epoch": 0.3757873028259294, + "grad_norm": 0.5992183089256287, + "learning_rate": 7.055920184190964e-05, + "loss": 1.6351, + "step": 6742 + }, + { + "epoch": 0.37584304107909255, + "grad_norm": 0.5495748519897461, + "learning_rate": 7.055108483840839e-05, + "loss": 1.6854, + "step": 6743 + }, + { + "epoch": 0.37589877933225574, + "grad_norm": 0.5780972242355347, + "learning_rate": 7.054296718314656e-05, + "loss": 1.7937, + "step": 6744 + }, + { + "epoch": 0.37595451758541887, + "grad_norm": 0.5518954992294312, + "learning_rate": 7.053484887638158e-05, + "loss": 1.6708, + "step": 6745 + }, + { + "epoch": 0.376010255838582, + "grad_norm": 0.5211352109909058, + "learning_rate": 7.052672991837093e-05, + "loss": 1.6565, + "step": 6746 + }, + { + "epoch": 0.3760659940917452, + "grad_norm": 0.5192275643348694, + "learning_rate": 7.051861030937207e-05, + "loss": 1.5376, + "step": 6747 + }, + { + "epoch": 0.3761217323449083, + "grad_norm": 0.5492019057273865, + "learning_rate": 7.051049004964254e-05, + "loss": 1.7518, + "step": 6748 + }, + { + "epoch": 0.37617747059807144, + "grad_norm": 0.5412474274635315, + "learning_rate": 7.050236913943984e-05, + "loss": 1.5384, + "step": 6749 + }, + { + "epoch": 0.3762332088512346, + "grad_norm": 0.5172974467277527, + "learning_rate": 7.049424757902153e-05, + "loss": 1.6072, + "step": 6750 + }, + { + "epoch": 0.37628894710439775, + "grad_norm": 0.5415205955505371, + "learning_rate": 7.048612536864517e-05, + "loss": 1.8342, + "step": 6751 + }, + { + "epoch": 0.3763446853575609, + "grad_norm": 0.5428817868232727, + "learning_rate": 7.047800250856837e-05, + "loss": 1.6988, + "step": 6752 + }, + { + "epoch": 0.37640042361072407, + "grad_norm": 0.5195114612579346, + "learning_rate": 7.046987899904871e-05, + "loss": 1.4647, + "step": 6753 + }, + { + "epoch": 0.3764561618638872, + "grad_norm": 0.5440792441368103, + "learning_rate": 7.046175484034384e-05, + "loss": 1.5224, + "step": 6754 + }, + { + "epoch": 0.3765119001170503, + "grad_norm": 0.5353301763534546, + "learning_rate": 7.045363003271141e-05, + "loss": 1.664, + "step": 6755 + }, + { + "epoch": 0.37656763837021345, + "grad_norm": 0.5722842812538147, + "learning_rate": 7.044550457640909e-05, + "loss": 1.6422, + "step": 6756 + }, + { + "epoch": 0.37662337662337664, + "grad_norm": 0.5732778906822205, + "learning_rate": 7.043737847169455e-05, + "loss": 2.0161, + "step": 6757 + }, + { + "epoch": 0.37667911487653977, + "grad_norm": 0.5180158019065857, + "learning_rate": 7.042925171882557e-05, + "loss": 1.6388, + "step": 6758 + }, + { + "epoch": 0.3767348531297029, + "grad_norm": 0.530694305896759, + "learning_rate": 7.042112431805979e-05, + "loss": 1.546, + "step": 6759 + }, + { + "epoch": 0.3767905913828661, + "grad_norm": 0.5620813965797424, + "learning_rate": 7.041299626965503e-05, + "loss": 1.6727, + "step": 6760 + }, + { + "epoch": 0.3768463296360292, + "grad_norm": 0.5627542734146118, + "learning_rate": 7.040486757386904e-05, + "loss": 1.7527, + "step": 6761 + }, + { + "epoch": 0.37690206788919234, + "grad_norm": 0.588291347026825, + "learning_rate": 7.039673823095963e-05, + "loss": 1.9415, + "step": 6762 + }, + { + "epoch": 0.3769578061423555, + "grad_norm": 0.5307551026344299, + "learning_rate": 7.03886082411846e-05, + "loss": 1.6316, + "step": 6763 + }, + { + "epoch": 0.37701354439551865, + "grad_norm": 0.5484150648117065, + "learning_rate": 7.038047760480179e-05, + "loss": 1.6363, + "step": 6764 + }, + { + "epoch": 0.3770692826486818, + "grad_norm": 0.5301684737205505, + "learning_rate": 7.037234632206905e-05, + "loss": 1.7243, + "step": 6765 + }, + { + "epoch": 0.3771250209018449, + "grad_norm": 0.5907619595527649, + "learning_rate": 7.036421439324427e-05, + "loss": 1.6807, + "step": 6766 + }, + { + "epoch": 0.3771807591550081, + "grad_norm": 0.5170425772666931, + "learning_rate": 7.035608181858533e-05, + "loss": 1.6641, + "step": 6767 + }, + { + "epoch": 0.3772364974081712, + "grad_norm": 0.5344756245613098, + "learning_rate": 7.034794859835016e-05, + "loss": 1.8226, + "step": 6768 + }, + { + "epoch": 0.37729223566133435, + "grad_norm": 0.5386238098144531, + "learning_rate": 7.033981473279672e-05, + "loss": 1.6291, + "step": 6769 + }, + { + "epoch": 0.37734797391449754, + "grad_norm": 0.5417985916137695, + "learning_rate": 7.033168022218292e-05, + "loss": 1.5797, + "step": 6770 + }, + { + "epoch": 0.37740371216766067, + "grad_norm": 0.5583431124687195, + "learning_rate": 7.032354506676678e-05, + "loss": 1.7165, + "step": 6771 + }, + { + "epoch": 0.3774594504208238, + "grad_norm": 0.5974751114845276, + "learning_rate": 7.031540926680627e-05, + "loss": 1.9454, + "step": 6772 + }, + { + "epoch": 0.377515188673987, + "grad_norm": 0.5629299283027649, + "learning_rate": 7.030727282255944e-05, + "loss": 1.9527, + "step": 6773 + }, + { + "epoch": 0.3775709269271501, + "grad_norm": 0.49648937582969666, + "learning_rate": 7.02991357342843e-05, + "loss": 1.4055, + "step": 6774 + }, + { + "epoch": 0.37762666518031324, + "grad_norm": 0.5776923298835754, + "learning_rate": 7.029099800223895e-05, + "loss": 1.5683, + "step": 6775 + }, + { + "epoch": 0.3776824034334764, + "grad_norm": 0.5667086839675903, + "learning_rate": 7.028285962668144e-05, + "loss": 1.6576, + "step": 6776 + }, + { + "epoch": 0.37773814168663955, + "grad_norm": 0.51173996925354, + "learning_rate": 7.027472060786988e-05, + "loss": 1.6046, + "step": 6777 + }, + { + "epoch": 0.3777938799398027, + "grad_norm": 0.6762179732322693, + "learning_rate": 7.026658094606238e-05, + "loss": 1.8251, + "step": 6778 + }, + { + "epoch": 0.3778496181929658, + "grad_norm": 0.6333464980125427, + "learning_rate": 7.02584406415171e-05, + "loss": 1.9974, + "step": 6779 + }, + { + "epoch": 0.377905356446129, + "grad_norm": 0.5379152297973633, + "learning_rate": 7.02502996944922e-05, + "loss": 1.5211, + "step": 6780 + }, + { + "epoch": 0.3779610946992921, + "grad_norm": 0.5208351016044617, + "learning_rate": 7.024215810524586e-05, + "loss": 1.7317, + "step": 6781 + }, + { + "epoch": 0.37801683295245525, + "grad_norm": 0.5434418320655823, + "learning_rate": 7.023401587403629e-05, + "loss": 1.6749, + "step": 6782 + }, + { + "epoch": 0.37807257120561844, + "grad_norm": 0.5639735460281372, + "learning_rate": 7.022587300112171e-05, + "loss": 1.7105, + "step": 6783 + }, + { + "epoch": 0.37812830945878156, + "grad_norm": 0.600032389163971, + "learning_rate": 7.021772948676037e-05, + "loss": 1.8057, + "step": 6784 + }, + { + "epoch": 0.3781840477119447, + "grad_norm": 0.5152847766876221, + "learning_rate": 7.020958533121051e-05, + "loss": 1.6275, + "step": 6785 + }, + { + "epoch": 0.3782397859651079, + "grad_norm": 0.5553915500640869, + "learning_rate": 7.020144053473044e-05, + "loss": 1.786, + "step": 6786 + }, + { + "epoch": 0.378295524218271, + "grad_norm": 0.5452811121940613, + "learning_rate": 7.019329509757845e-05, + "loss": 1.6452, + "step": 6787 + }, + { + "epoch": 0.37835126247143414, + "grad_norm": 0.5100104212760925, + "learning_rate": 7.01851490200129e-05, + "loss": 1.5128, + "step": 6788 + }, + { + "epoch": 0.37840700072459726, + "grad_norm": 0.6309191584587097, + "learning_rate": 7.017700230229208e-05, + "loss": 1.4683, + "step": 6789 + }, + { + "epoch": 0.37846273897776045, + "grad_norm": 0.5344750881195068, + "learning_rate": 7.01688549446744e-05, + "loss": 1.6131, + "step": 6790 + }, + { + "epoch": 0.3785184772309236, + "grad_norm": 0.5286291837692261, + "learning_rate": 7.016070694741824e-05, + "loss": 1.6499, + "step": 6791 + }, + { + "epoch": 0.3785742154840867, + "grad_norm": 0.5597365498542786, + "learning_rate": 7.015255831078201e-05, + "loss": 1.6677, + "step": 6792 + }, + { + "epoch": 0.3786299537372499, + "grad_norm": 0.5482022166252136, + "learning_rate": 7.01444090350241e-05, + "loss": 1.6498, + "step": 6793 + }, + { + "epoch": 0.378685691990413, + "grad_norm": 0.6198036670684814, + "learning_rate": 7.0136259120403e-05, + "loss": 1.8393, + "step": 6794 + }, + { + "epoch": 0.37874143024357615, + "grad_norm": 0.555736243724823, + "learning_rate": 7.012810856717717e-05, + "loss": 1.5817, + "step": 6795 + }, + { + "epoch": 0.37879716849673933, + "grad_norm": 0.5894885659217834, + "learning_rate": 7.011995737560507e-05, + "loss": 1.736, + "step": 6796 + }, + { + "epoch": 0.37885290674990246, + "grad_norm": 0.5784539580345154, + "learning_rate": 7.011180554594525e-05, + "loss": 1.7195, + "step": 6797 + }, + { + "epoch": 0.3789086450030656, + "grad_norm": 0.5761838555335999, + "learning_rate": 7.010365307845621e-05, + "loss": 1.5784, + "step": 6798 + }, + { + "epoch": 0.3789643832562288, + "grad_norm": 0.5359389185905457, + "learning_rate": 7.00954999733965e-05, + "loss": 1.4703, + "step": 6799 + }, + { + "epoch": 0.3790201215093919, + "grad_norm": 0.5606504678726196, + "learning_rate": 7.008734623102471e-05, + "loss": 1.7026, + "step": 6800 + }, + { + "epoch": 0.37907585976255503, + "grad_norm": 0.5452861785888672, + "learning_rate": 7.007919185159942e-05, + "loss": 1.6358, + "step": 6801 + }, + { + "epoch": 0.37913159801571816, + "grad_norm": 0.533334493637085, + "learning_rate": 7.007103683537922e-05, + "loss": 1.5224, + "step": 6802 + }, + { + "epoch": 0.37918733626888135, + "grad_norm": 0.5216323137283325, + "learning_rate": 7.006288118262277e-05, + "loss": 1.5611, + "step": 6803 + }, + { + "epoch": 0.3792430745220445, + "grad_norm": 0.6083248853683472, + "learning_rate": 7.005472489358868e-05, + "loss": 1.9112, + "step": 6804 + }, + { + "epoch": 0.3792988127752076, + "grad_norm": 0.5337701439857483, + "learning_rate": 7.004656796853565e-05, + "loss": 1.678, + "step": 6805 + }, + { + "epoch": 0.3793545510283708, + "grad_norm": 0.5296239256858826, + "learning_rate": 7.003841040772237e-05, + "loss": 1.6372, + "step": 6806 + }, + { + "epoch": 0.3794102892815339, + "grad_norm": 0.5512758493423462, + "learning_rate": 7.003025221140754e-05, + "loss": 1.7838, + "step": 6807 + }, + { + "epoch": 0.37946602753469705, + "grad_norm": 0.5666672587394714, + "learning_rate": 7.00220933798499e-05, + "loss": 1.8518, + "step": 6808 + }, + { + "epoch": 0.37952176578786023, + "grad_norm": 0.5516249537467957, + "learning_rate": 7.001393391330819e-05, + "loss": 1.4928, + "step": 6809 + }, + { + "epoch": 0.37957750404102336, + "grad_norm": 0.5139819979667664, + "learning_rate": 7.000577381204118e-05, + "loss": 1.5464, + "step": 6810 + }, + { + "epoch": 0.3796332422941865, + "grad_norm": 0.5297854542732239, + "learning_rate": 6.999761307630767e-05, + "loss": 1.4929, + "step": 6811 + }, + { + "epoch": 0.3796889805473496, + "grad_norm": 0.5862724184989929, + "learning_rate": 6.998945170636647e-05, + "loss": 1.8435, + "step": 6812 + }, + { + "epoch": 0.3797447188005128, + "grad_norm": 0.5517110228538513, + "learning_rate": 6.998128970247641e-05, + "loss": 1.5962, + "step": 6813 + }, + { + "epoch": 0.37980045705367593, + "grad_norm": 0.5306249260902405, + "learning_rate": 6.997312706489634e-05, + "loss": 1.4978, + "step": 6814 + }, + { + "epoch": 0.37985619530683906, + "grad_norm": 0.5715779662132263, + "learning_rate": 6.996496379388512e-05, + "loss": 1.7663, + "step": 6815 + }, + { + "epoch": 0.37991193356000225, + "grad_norm": 0.5692317485809326, + "learning_rate": 6.995679988970167e-05, + "loss": 1.9011, + "step": 6816 + }, + { + "epoch": 0.3799676718131654, + "grad_norm": 0.5604211091995239, + "learning_rate": 6.994863535260488e-05, + "loss": 1.5928, + "step": 6817 + }, + { + "epoch": 0.3800234100663285, + "grad_norm": 0.5591232776641846, + "learning_rate": 6.994047018285368e-05, + "loss": 1.6347, + "step": 6818 + }, + { + "epoch": 0.3800791483194917, + "grad_norm": 0.515835702419281, + "learning_rate": 6.993230438070702e-05, + "loss": 1.4441, + "step": 6819 + }, + { + "epoch": 0.3801348865726548, + "grad_norm": 0.5194911360740662, + "learning_rate": 6.99241379464239e-05, + "loss": 1.6603, + "step": 6820 + }, + { + "epoch": 0.38019062482581795, + "grad_norm": 0.49259036779403687, + "learning_rate": 6.991597088026327e-05, + "loss": 1.5785, + "step": 6821 + }, + { + "epoch": 0.38024636307898113, + "grad_norm": 0.5865880846977234, + "learning_rate": 6.990780318248416e-05, + "loss": 1.7017, + "step": 6822 + }, + { + "epoch": 0.38030210133214426, + "grad_norm": 0.532753050327301, + "learning_rate": 6.989963485334562e-05, + "loss": 1.7205, + "step": 6823 + }, + { + "epoch": 0.3803578395853074, + "grad_norm": 0.6024113297462463, + "learning_rate": 6.989146589310667e-05, + "loss": 1.8499, + "step": 6824 + }, + { + "epoch": 0.3804135778384705, + "grad_norm": 0.5912168622016907, + "learning_rate": 6.988329630202641e-05, + "loss": 1.7783, + "step": 6825 + }, + { + "epoch": 0.3804693160916337, + "grad_norm": 0.5647505521774292, + "learning_rate": 6.98751260803639e-05, + "loss": 1.6106, + "step": 6826 + }, + { + "epoch": 0.38052505434479683, + "grad_norm": 0.5149972438812256, + "learning_rate": 6.98669552283783e-05, + "loss": 1.652, + "step": 6827 + }, + { + "epoch": 0.38058079259795996, + "grad_norm": 0.5642407536506653, + "learning_rate": 6.98587837463287e-05, + "loss": 1.6075, + "step": 6828 + }, + { + "epoch": 0.38063653085112314, + "grad_norm": 0.6054338812828064, + "learning_rate": 6.985061163447426e-05, + "loss": 1.7205, + "step": 6829 + }, + { + "epoch": 0.3806922691042863, + "grad_norm": 0.5490162372589111, + "learning_rate": 6.984243889307415e-05, + "loss": 1.605, + "step": 6830 + }, + { + "epoch": 0.3807480073574494, + "grad_norm": 0.5481693744659424, + "learning_rate": 6.983426552238756e-05, + "loss": 1.6532, + "step": 6831 + }, + { + "epoch": 0.3808037456106126, + "grad_norm": 0.5470540523529053, + "learning_rate": 6.982609152267374e-05, + "loss": 1.856, + "step": 6832 + }, + { + "epoch": 0.3808594838637757, + "grad_norm": 0.5047014355659485, + "learning_rate": 6.981791689419186e-05, + "loss": 1.5632, + "step": 6833 + }, + { + "epoch": 0.38091522211693885, + "grad_norm": 0.5213363766670227, + "learning_rate": 6.980974163720123e-05, + "loss": 1.648, + "step": 6834 + }, + { + "epoch": 0.380970960370102, + "grad_norm": 0.5108797550201416, + "learning_rate": 6.980156575196107e-05, + "loss": 1.7048, + "step": 6835 + }, + { + "epoch": 0.38102669862326516, + "grad_norm": 0.5571927428245544, + "learning_rate": 6.979338923873073e-05, + "loss": 1.7984, + "step": 6836 + }, + { + "epoch": 0.3810824368764283, + "grad_norm": 0.5656031966209412, + "learning_rate": 6.978521209776945e-05, + "loss": 1.6214, + "step": 6837 + }, + { + "epoch": 0.3811381751295914, + "grad_norm": 0.5520498752593994, + "learning_rate": 6.977703432933661e-05, + "loss": 1.5048, + "step": 6838 + }, + { + "epoch": 0.3811939133827546, + "grad_norm": 0.5377273559570312, + "learning_rate": 6.976885593369155e-05, + "loss": 1.4111, + "step": 6839 + }, + { + "epoch": 0.38124965163591773, + "grad_norm": 0.5396257042884827, + "learning_rate": 6.976067691109365e-05, + "loss": 1.6715, + "step": 6840 + }, + { + "epoch": 0.38130538988908086, + "grad_norm": 0.5259842872619629, + "learning_rate": 6.975249726180227e-05, + "loss": 1.586, + "step": 6841 + }, + { + "epoch": 0.38136112814224404, + "grad_norm": 0.5793870091438293, + "learning_rate": 6.974431698607686e-05, + "loss": 1.8532, + "step": 6842 + }, + { + "epoch": 0.3814168663954072, + "grad_norm": 0.6075243353843689, + "learning_rate": 6.973613608417683e-05, + "loss": 1.8658, + "step": 6843 + }, + { + "epoch": 0.3814726046485703, + "grad_norm": 0.5244048833847046, + "learning_rate": 6.972795455636163e-05, + "loss": 1.5298, + "step": 6844 + }, + { + "epoch": 0.3815283429017335, + "grad_norm": 0.5625903010368347, + "learning_rate": 6.971977240289073e-05, + "loss": 1.7494, + "step": 6845 + }, + { + "epoch": 0.3815840811548966, + "grad_norm": 0.5776612758636475, + "learning_rate": 6.971158962402362e-05, + "loss": 1.9495, + "step": 6846 + }, + { + "epoch": 0.38163981940805974, + "grad_norm": 0.5811514258384705, + "learning_rate": 6.970340622001983e-05, + "loss": 1.6167, + "step": 6847 + }, + { + "epoch": 0.3816955576612229, + "grad_norm": 0.5879440307617188, + "learning_rate": 6.969522219113886e-05, + "loss": 1.7636, + "step": 6848 + }, + { + "epoch": 0.38175129591438606, + "grad_norm": 0.6386079788208008, + "learning_rate": 6.968703753764027e-05, + "loss": 1.779, + "step": 6849 + }, + { + "epoch": 0.3818070341675492, + "grad_norm": 0.5324746966362, + "learning_rate": 6.967885225978365e-05, + "loss": 1.5693, + "step": 6850 + }, + { + "epoch": 0.3818627724207123, + "grad_norm": 0.6155705451965332, + "learning_rate": 6.967066635782855e-05, + "loss": 1.8075, + "step": 6851 + }, + { + "epoch": 0.3819185106738755, + "grad_norm": 0.5880451202392578, + "learning_rate": 6.966247983203462e-05, + "loss": 1.8192, + "step": 6852 + }, + { + "epoch": 0.38197424892703863, + "grad_norm": 0.5279741287231445, + "learning_rate": 6.965429268266147e-05, + "loss": 1.5787, + "step": 6853 + }, + { + "epoch": 0.38202998718020176, + "grad_norm": 0.5816035270690918, + "learning_rate": 6.964610490996874e-05, + "loss": 1.7935, + "step": 6854 + }, + { + "epoch": 0.38208572543336494, + "grad_norm": 0.5708805918693542, + "learning_rate": 6.963791651421612e-05, + "loss": 1.6204, + "step": 6855 + }, + { + "epoch": 0.38214146368652807, + "grad_norm": 0.5362871885299683, + "learning_rate": 6.962972749566326e-05, + "loss": 1.6198, + "step": 6856 + }, + { + "epoch": 0.3821972019396912, + "grad_norm": 0.5008870363235474, + "learning_rate": 6.962153785456991e-05, + "loss": 1.3949, + "step": 6857 + }, + { + "epoch": 0.38225294019285433, + "grad_norm": 0.5772041082382202, + "learning_rate": 6.961334759119577e-05, + "loss": 1.7137, + "step": 6858 + }, + { + "epoch": 0.3823086784460175, + "grad_norm": 0.5443426966667175, + "learning_rate": 6.960515670580061e-05, + "loss": 1.809, + "step": 6859 + }, + { + "epoch": 0.38236441669918064, + "grad_norm": 0.6082087755203247, + "learning_rate": 6.959696519864418e-05, + "loss": 1.8777, + "step": 6860 + }, + { + "epoch": 0.38242015495234377, + "grad_norm": 0.5430213809013367, + "learning_rate": 6.958877306998627e-05, + "loss": 1.7168, + "step": 6861 + }, + { + "epoch": 0.38247589320550696, + "grad_norm": 0.5611394047737122, + "learning_rate": 6.95805803200867e-05, + "loss": 1.7136, + "step": 6862 + }, + { + "epoch": 0.3825316314586701, + "grad_norm": 0.5467121005058289, + "learning_rate": 6.957238694920527e-05, + "loss": 1.7348, + "step": 6863 + }, + { + "epoch": 0.3825873697118332, + "grad_norm": 0.5907519459724426, + "learning_rate": 6.956419295760184e-05, + "loss": 1.8087, + "step": 6864 + }, + { + "epoch": 0.3826431079649964, + "grad_norm": 0.4940342307090759, + "learning_rate": 6.95559983455363e-05, + "loss": 1.226, + "step": 6865 + }, + { + "epoch": 0.3826988462181595, + "grad_norm": 0.525205135345459, + "learning_rate": 6.954780311326849e-05, + "loss": 1.6166, + "step": 6866 + }, + { + "epoch": 0.38275458447132266, + "grad_norm": 0.5510271191596985, + "learning_rate": 6.953960726105835e-05, + "loss": 1.6143, + "step": 6867 + }, + { + "epoch": 0.38281032272448584, + "grad_norm": 0.5778586268424988, + "learning_rate": 6.953141078916578e-05, + "loss": 1.8417, + "step": 6868 + }, + { + "epoch": 0.38286606097764897, + "grad_norm": 0.5931724309921265, + "learning_rate": 6.952321369785075e-05, + "loss": 1.6908, + "step": 6869 + }, + { + "epoch": 0.3829217992308121, + "grad_norm": 0.5995519161224365, + "learning_rate": 6.951501598737318e-05, + "loss": 1.9328, + "step": 6870 + }, + { + "epoch": 0.38297753748397523, + "grad_norm": 0.5441159009933472, + "learning_rate": 6.95068176579931e-05, + "loss": 1.7226, + "step": 6871 + }, + { + "epoch": 0.3830332757371384, + "grad_norm": 0.5795645117759705, + "learning_rate": 6.94986187099705e-05, + "loss": 1.8162, + "step": 6872 + }, + { + "epoch": 0.38308901399030154, + "grad_norm": 0.5668213367462158, + "learning_rate": 6.949041914356541e-05, + "loss": 1.5981, + "step": 6873 + }, + { + "epoch": 0.38314475224346467, + "grad_norm": 0.6034721732139587, + "learning_rate": 6.948221895903784e-05, + "loss": 1.688, + "step": 6874 + }, + { + "epoch": 0.38320049049662785, + "grad_norm": 0.5386607050895691, + "learning_rate": 6.94740181566479e-05, + "loss": 1.6411, + "step": 6875 + }, + { + "epoch": 0.383256228749791, + "grad_norm": 0.5482555627822876, + "learning_rate": 6.946581673665561e-05, + "loss": 1.3411, + "step": 6876 + }, + { + "epoch": 0.3833119670029541, + "grad_norm": 0.5288286805152893, + "learning_rate": 6.945761469932114e-05, + "loss": 1.5896, + "step": 6877 + }, + { + "epoch": 0.3833677052561173, + "grad_norm": 0.5721820592880249, + "learning_rate": 6.944941204490456e-05, + "loss": 1.7555, + "step": 6878 + }, + { + "epoch": 0.3834234435092804, + "grad_norm": 0.5338029861450195, + "learning_rate": 6.944120877366604e-05, + "loss": 1.8117, + "step": 6879 + }, + { + "epoch": 0.38347918176244356, + "grad_norm": 0.5430106520652771, + "learning_rate": 6.943300488586572e-05, + "loss": 1.5363, + "step": 6880 + }, + { + "epoch": 0.3835349200156067, + "grad_norm": 0.5485236644744873, + "learning_rate": 6.942480038176379e-05, + "loss": 1.4549, + "step": 6881 + }, + { + "epoch": 0.38359065826876987, + "grad_norm": 0.5767553448677063, + "learning_rate": 6.941659526162045e-05, + "loss": 1.5041, + "step": 6882 + }, + { + "epoch": 0.383646396521933, + "grad_norm": 0.5788490176200867, + "learning_rate": 6.940838952569589e-05, + "loss": 1.8509, + "step": 6883 + }, + { + "epoch": 0.3837021347750961, + "grad_norm": 0.5562904477119446, + "learning_rate": 6.94001831742504e-05, + "loss": 1.6337, + "step": 6884 + }, + { + "epoch": 0.3837578730282593, + "grad_norm": 0.5514802932739258, + "learning_rate": 6.939197620754419e-05, + "loss": 1.6887, + "step": 6885 + }, + { + "epoch": 0.38381361128142244, + "grad_norm": 0.6278872489929199, + "learning_rate": 6.938376862583757e-05, + "loss": 1.6762, + "step": 6886 + }, + { + "epoch": 0.38386934953458557, + "grad_norm": 0.5348507761955261, + "learning_rate": 6.937556042939083e-05, + "loss": 1.5778, + "step": 6887 + }, + { + "epoch": 0.38392508778774875, + "grad_norm": 0.555674135684967, + "learning_rate": 6.936735161846429e-05, + "loss": 1.6806, + "step": 6888 + }, + { + "epoch": 0.3839808260409119, + "grad_norm": 0.5161069631576538, + "learning_rate": 6.935914219331825e-05, + "loss": 1.5607, + "step": 6889 + }, + { + "epoch": 0.384036564294075, + "grad_norm": 0.5375397205352783, + "learning_rate": 6.93509321542131e-05, + "loss": 1.6835, + "step": 6890 + }, + { + "epoch": 0.3840923025472382, + "grad_norm": 0.4695841073989868, + "learning_rate": 6.934272150140921e-05, + "loss": 1.3228, + "step": 6891 + }, + { + "epoch": 0.3841480408004013, + "grad_norm": 0.5479111075401306, + "learning_rate": 6.933451023516697e-05, + "loss": 1.6331, + "step": 6892 + }, + { + "epoch": 0.38420377905356445, + "grad_norm": 0.5705395936965942, + "learning_rate": 6.932629835574679e-05, + "loss": 1.7666, + "step": 6893 + }, + { + "epoch": 0.3842595173067276, + "grad_norm": 0.5568275451660156, + "learning_rate": 6.93180858634091e-05, + "loss": 1.5809, + "step": 6894 + }, + { + "epoch": 0.38431525555989077, + "grad_norm": 0.6088882088661194, + "learning_rate": 6.930987275841439e-05, + "loss": 1.7695, + "step": 6895 + }, + { + "epoch": 0.3843709938130539, + "grad_norm": 0.5949798822402954, + "learning_rate": 6.930165904102305e-05, + "loss": 1.8917, + "step": 6896 + }, + { + "epoch": 0.384426732066217, + "grad_norm": 0.557823657989502, + "learning_rate": 6.929344471149566e-05, + "loss": 1.8922, + "step": 6897 + }, + { + "epoch": 0.3844824703193802, + "grad_norm": 0.5406614542007446, + "learning_rate": 6.928522977009268e-05, + "loss": 1.6488, + "step": 6898 + }, + { + "epoch": 0.38453820857254334, + "grad_norm": 0.5692750811576843, + "learning_rate": 6.927701421707466e-05, + "loss": 1.6886, + "step": 6899 + }, + { + "epoch": 0.38459394682570647, + "grad_norm": 0.5827295780181885, + "learning_rate": 6.926879805270212e-05, + "loss": 1.6532, + "step": 6900 + }, + { + "epoch": 0.38464968507886965, + "grad_norm": 0.5955531001091003, + "learning_rate": 6.926058127723568e-05, + "loss": 1.6202, + "step": 6901 + }, + { + "epoch": 0.3847054233320328, + "grad_norm": 0.5544630885124207, + "learning_rate": 6.925236389093588e-05, + "loss": 1.1835, + "step": 6902 + }, + { + "epoch": 0.3847611615851959, + "grad_norm": 0.6354855298995972, + "learning_rate": 6.924414589406335e-05, + "loss": 1.9214, + "step": 6903 + }, + { + "epoch": 0.38481689983835904, + "grad_norm": 0.6088757514953613, + "learning_rate": 6.923592728687871e-05, + "loss": 1.8236, + "step": 6904 + }, + { + "epoch": 0.3848726380915222, + "grad_norm": 0.5689512491226196, + "learning_rate": 6.922770806964263e-05, + "loss": 1.5128, + "step": 6905 + }, + { + "epoch": 0.38492837634468535, + "grad_norm": 0.5286409854888916, + "learning_rate": 6.921948824261573e-05, + "loss": 1.5956, + "step": 6906 + }, + { + "epoch": 0.3849841145978485, + "grad_norm": 0.5316895842552185, + "learning_rate": 6.921126780605873e-05, + "loss": 1.5846, + "step": 6907 + }, + { + "epoch": 0.38503985285101167, + "grad_norm": 0.5461425185203552, + "learning_rate": 6.920304676023233e-05, + "loss": 1.6645, + "step": 6908 + }, + { + "epoch": 0.3850955911041748, + "grad_norm": 0.5628203749656677, + "learning_rate": 6.919482510539723e-05, + "loss": 1.6028, + "step": 6909 + }, + { + "epoch": 0.3851513293573379, + "grad_norm": 0.5715482234954834, + "learning_rate": 6.918660284181421e-05, + "loss": 1.7378, + "step": 6910 + }, + { + "epoch": 0.3852070676105011, + "grad_norm": 0.6020052433013916, + "learning_rate": 6.9178379969744e-05, + "loss": 1.8591, + "step": 6911 + }, + { + "epoch": 0.38526280586366424, + "grad_norm": 0.5738694071769714, + "learning_rate": 6.917015648944741e-05, + "loss": 1.527, + "step": 6912 + }, + { + "epoch": 0.38531854411682737, + "grad_norm": 0.5757240653038025, + "learning_rate": 6.916193240118522e-05, + "loss": 1.6982, + "step": 6913 + }, + { + "epoch": 0.38537428236999055, + "grad_norm": 0.5647144913673401, + "learning_rate": 6.915370770521825e-05, + "loss": 1.6709, + "step": 6914 + }, + { + "epoch": 0.3854300206231537, + "grad_norm": 0.5539698004722595, + "learning_rate": 6.914548240180736e-05, + "loss": 1.8178, + "step": 6915 + }, + { + "epoch": 0.3854857588763168, + "grad_norm": 0.5621739625930786, + "learning_rate": 6.913725649121337e-05, + "loss": 1.8038, + "step": 6916 + }, + { + "epoch": 0.38554149712947994, + "grad_norm": 0.5707613229751587, + "learning_rate": 6.91290299736972e-05, + "loss": 1.7155, + "step": 6917 + }, + { + "epoch": 0.3855972353826431, + "grad_norm": 0.5707844495773315, + "learning_rate": 6.912080284951972e-05, + "loss": 1.7316, + "step": 6918 + }, + { + "epoch": 0.38565297363580625, + "grad_norm": 0.5531010627746582, + "learning_rate": 6.911257511894188e-05, + "loss": 1.7607, + "step": 6919 + }, + { + "epoch": 0.3857087118889694, + "grad_norm": 0.6005899906158447, + "learning_rate": 6.910434678222457e-05, + "loss": 1.8731, + "step": 6920 + }, + { + "epoch": 0.38576445014213256, + "grad_norm": 0.5527727603912354, + "learning_rate": 6.909611783962877e-05, + "loss": 1.3704, + "step": 6921 + }, + { + "epoch": 0.3858201883952957, + "grad_norm": 0.5586572885513306, + "learning_rate": 6.908788829141544e-05, + "loss": 1.6253, + "step": 6922 + }, + { + "epoch": 0.3858759266484588, + "grad_norm": 0.6035952568054199, + "learning_rate": 6.907965813784558e-05, + "loss": 1.9226, + "step": 6923 + }, + { + "epoch": 0.385931664901622, + "grad_norm": 0.5370834469795227, + "learning_rate": 6.907142737918023e-05, + "loss": 1.5934, + "step": 6924 + }, + { + "epoch": 0.38598740315478514, + "grad_norm": 0.5954363346099854, + "learning_rate": 6.906319601568038e-05, + "loss": 1.8197, + "step": 6925 + }, + { + "epoch": 0.38604314140794826, + "grad_norm": 0.5880860686302185, + "learning_rate": 6.90549640476071e-05, + "loss": 1.9775, + "step": 6926 + }, + { + "epoch": 0.3860988796611114, + "grad_norm": 0.6047815084457397, + "learning_rate": 6.904673147522147e-05, + "loss": 1.9008, + "step": 6927 + }, + { + "epoch": 0.3861546179142746, + "grad_norm": 0.6101181507110596, + "learning_rate": 6.903849829878457e-05, + "loss": 1.9632, + "step": 6928 + }, + { + "epoch": 0.3862103561674377, + "grad_norm": 0.5670501589775085, + "learning_rate": 6.903026451855748e-05, + "loss": 1.7489, + "step": 6929 + }, + { + "epoch": 0.38626609442060084, + "grad_norm": 0.6123764514923096, + "learning_rate": 6.902203013480137e-05, + "loss": 1.7719, + "step": 6930 + }, + { + "epoch": 0.386321832673764, + "grad_norm": 0.53583824634552, + "learning_rate": 6.901379514777739e-05, + "loss": 1.5504, + "step": 6931 + }, + { + "epoch": 0.38637757092692715, + "grad_norm": 0.5257768630981445, + "learning_rate": 6.900555955774666e-05, + "loss": 1.6045, + "step": 6932 + }, + { + "epoch": 0.3864333091800903, + "grad_norm": 0.5276762843132019, + "learning_rate": 6.899732336497038e-05, + "loss": 1.7366, + "step": 6933 + }, + { + "epoch": 0.38648904743325346, + "grad_norm": 0.555980384349823, + "learning_rate": 6.898908656970979e-05, + "loss": 1.3954, + "step": 6934 + }, + { + "epoch": 0.3865447856864166, + "grad_norm": 0.5937703847885132, + "learning_rate": 6.898084917222609e-05, + "loss": 1.791, + "step": 6935 + }, + { + "epoch": 0.3866005239395797, + "grad_norm": 0.5324926376342773, + "learning_rate": 6.89726111727805e-05, + "loss": 1.7835, + "step": 6936 + }, + { + "epoch": 0.3866562621927429, + "grad_norm": 0.569644033908844, + "learning_rate": 6.896437257163432e-05, + "loss": 1.651, + "step": 6937 + }, + { + "epoch": 0.38671200044590603, + "grad_norm": 0.5893319249153137, + "learning_rate": 6.89561333690488e-05, + "loss": 1.8836, + "step": 6938 + }, + { + "epoch": 0.38676773869906916, + "grad_norm": 0.5247541666030884, + "learning_rate": 6.894789356528526e-05, + "loss": 1.5643, + "step": 6939 + }, + { + "epoch": 0.3868234769522323, + "grad_norm": 0.5343844890594482, + "learning_rate": 6.893965316060501e-05, + "loss": 1.6483, + "step": 6940 + }, + { + "epoch": 0.3868792152053955, + "grad_norm": 0.5714672803878784, + "learning_rate": 6.893141215526938e-05, + "loss": 1.5949, + "step": 6941 + }, + { + "epoch": 0.3869349534585586, + "grad_norm": 0.5850149989128113, + "learning_rate": 6.892317054953975e-05, + "loss": 1.7971, + "step": 6942 + }, + { + "epoch": 0.38699069171172173, + "grad_norm": 0.570669412612915, + "learning_rate": 6.891492834367746e-05, + "loss": 1.8339, + "step": 6943 + }, + { + "epoch": 0.3870464299648849, + "grad_norm": 0.5296490788459778, + "learning_rate": 6.890668553794392e-05, + "loss": 1.6175, + "step": 6944 + }, + { + "epoch": 0.38710216821804805, + "grad_norm": 0.5491392612457275, + "learning_rate": 6.889844213260057e-05, + "loss": 1.7679, + "step": 6945 + }, + { + "epoch": 0.3871579064712112, + "grad_norm": 0.5886465907096863, + "learning_rate": 6.88901981279088e-05, + "loss": 1.5769, + "step": 6946 + }, + { + "epoch": 0.38721364472437436, + "grad_norm": 0.5220004916191101, + "learning_rate": 6.88819535241301e-05, + "loss": 1.4678, + "step": 6947 + }, + { + "epoch": 0.3872693829775375, + "grad_norm": 0.5555586814880371, + "learning_rate": 6.887370832152592e-05, + "loss": 1.6784, + "step": 6948 + }, + { + "epoch": 0.3873251212307006, + "grad_norm": 0.5332651138305664, + "learning_rate": 6.886546252035775e-05, + "loss": 1.6139, + "step": 6949 + }, + { + "epoch": 0.38738085948386375, + "grad_norm": 0.5473794341087341, + "learning_rate": 6.88572161208871e-05, + "loss": 1.8137, + "step": 6950 + }, + { + "epoch": 0.38743659773702693, + "grad_norm": 0.5803813934326172, + "learning_rate": 6.88489691233755e-05, + "loss": 1.5237, + "step": 6951 + }, + { + "epoch": 0.38749233599019006, + "grad_norm": 0.5329601168632507, + "learning_rate": 6.884072152808451e-05, + "loss": 1.686, + "step": 6952 + }, + { + "epoch": 0.3875480742433532, + "grad_norm": 0.5633809566497803, + "learning_rate": 6.883247333527567e-05, + "loss": 1.9771, + "step": 6953 + }, + { + "epoch": 0.3876038124965164, + "grad_norm": 0.6174986958503723, + "learning_rate": 6.882422454521058e-05, + "loss": 1.7549, + "step": 6954 + }, + { + "epoch": 0.3876595507496795, + "grad_norm": 0.5496551394462585, + "learning_rate": 6.881597515815084e-05, + "loss": 1.7045, + "step": 6955 + }, + { + "epoch": 0.38771528900284263, + "grad_norm": 0.5577127933502197, + "learning_rate": 6.880772517435807e-05, + "loss": 1.5901, + "step": 6956 + }, + { + "epoch": 0.3877710272560058, + "grad_norm": 0.5230315327644348, + "learning_rate": 6.879947459409393e-05, + "loss": 1.5849, + "step": 6957 + }, + { + "epoch": 0.38782676550916895, + "grad_norm": 0.5241686105728149, + "learning_rate": 6.879122341762003e-05, + "loss": 1.8152, + "step": 6958 + }, + { + "epoch": 0.3878825037623321, + "grad_norm": 0.5810775756835938, + "learning_rate": 6.878297164519812e-05, + "loss": 1.7573, + "step": 6959 + }, + { + "epoch": 0.38793824201549526, + "grad_norm": 0.5543670058250427, + "learning_rate": 6.877471927708985e-05, + "loss": 1.7487, + "step": 6960 + }, + { + "epoch": 0.3879939802686584, + "grad_norm": 0.5780448317527771, + "learning_rate": 6.876646631355693e-05, + "loss": 1.8512, + "step": 6961 + }, + { + "epoch": 0.3880497185218215, + "grad_norm": 0.6595468521118164, + "learning_rate": 6.875821275486113e-05, + "loss": 2.1185, + "step": 6962 + }, + { + "epoch": 0.38810545677498465, + "grad_norm": 0.5663919448852539, + "learning_rate": 6.874995860126419e-05, + "loss": 1.6607, + "step": 6963 + }, + { + "epoch": 0.38816119502814783, + "grad_norm": 0.6084817051887512, + "learning_rate": 6.874170385302789e-05, + "loss": 1.4841, + "step": 6964 + }, + { + "epoch": 0.38821693328131096, + "grad_norm": 0.5507417321205139, + "learning_rate": 6.8733448510414e-05, + "loss": 1.7557, + "step": 6965 + }, + { + "epoch": 0.3882726715344741, + "grad_norm": 0.5766531825065613, + "learning_rate": 6.872519257368437e-05, + "loss": 1.7722, + "step": 6966 + }, + { + "epoch": 0.3883284097876373, + "grad_norm": 0.5653195381164551, + "learning_rate": 6.871693604310077e-05, + "loss": 1.8058, + "step": 6967 + }, + { + "epoch": 0.3883841480408004, + "grad_norm": 0.6037474274635315, + "learning_rate": 6.87086789189251e-05, + "loss": 1.8542, + "step": 6968 + }, + { + "epoch": 0.38843988629396353, + "grad_norm": 0.5463787317276001, + "learning_rate": 6.870042120141923e-05, + "loss": 1.7221, + "step": 6969 + }, + { + "epoch": 0.3884956245471267, + "grad_norm": 0.5135644674301147, + "learning_rate": 6.869216289084503e-05, + "loss": 1.5492, + "step": 6970 + }, + { + "epoch": 0.38855136280028985, + "grad_norm": 0.5640287399291992, + "learning_rate": 6.86839039874644e-05, + "loss": 1.4507, + "step": 6971 + }, + { + "epoch": 0.388607101053453, + "grad_norm": 0.5661764144897461, + "learning_rate": 6.867564449153925e-05, + "loss": 1.7683, + "step": 6972 + }, + { + "epoch": 0.3886628393066161, + "grad_norm": 0.5671542882919312, + "learning_rate": 6.866738440333157e-05, + "loss": 1.7076, + "step": 6973 + }, + { + "epoch": 0.3887185775597793, + "grad_norm": 0.5259964466094971, + "learning_rate": 6.865912372310328e-05, + "loss": 1.542, + "step": 6974 + }, + { + "epoch": 0.3887743158129424, + "grad_norm": 0.5321882963180542, + "learning_rate": 6.865086245111638e-05, + "loss": 1.6909, + "step": 6975 + }, + { + "epoch": 0.38883005406610555, + "grad_norm": 0.5812041759490967, + "learning_rate": 6.864260058763286e-05, + "loss": 1.8409, + "step": 6976 + }, + { + "epoch": 0.38888579231926873, + "grad_norm": 0.5516645312309265, + "learning_rate": 6.863433813291477e-05, + "loss": 1.5931, + "step": 6977 + }, + { + "epoch": 0.38894153057243186, + "grad_norm": 0.612776517868042, + "learning_rate": 6.86260750872241e-05, + "loss": 1.7741, + "step": 6978 + }, + { + "epoch": 0.388997268825595, + "grad_norm": 0.5400133728981018, + "learning_rate": 6.861781145082293e-05, + "loss": 1.6731, + "step": 6979 + }, + { + "epoch": 0.3890530070787582, + "grad_norm": 0.5253887176513672, + "learning_rate": 6.860954722397332e-05, + "loss": 1.6809, + "step": 6980 + }, + { + "epoch": 0.3891087453319213, + "grad_norm": 0.5338975191116333, + "learning_rate": 6.860128240693737e-05, + "loss": 1.7078, + "step": 6981 + }, + { + "epoch": 0.38916448358508443, + "grad_norm": 0.6083932518959045, + "learning_rate": 6.85930169999772e-05, + "loss": 1.7694, + "step": 6982 + }, + { + "epoch": 0.3892202218382476, + "grad_norm": 0.5741243958473206, + "learning_rate": 6.858475100335496e-05, + "loss": 1.7516, + "step": 6983 + }, + { + "epoch": 0.38927596009141074, + "grad_norm": 0.5835102200508118, + "learning_rate": 6.857648441733275e-05, + "loss": 1.7409, + "step": 6984 + }, + { + "epoch": 0.3893316983445739, + "grad_norm": 0.5485714673995972, + "learning_rate": 6.856821724217276e-05, + "loss": 1.7237, + "step": 6985 + }, + { + "epoch": 0.389387436597737, + "grad_norm": 0.5908092856407166, + "learning_rate": 6.855994947813719e-05, + "loss": 1.8842, + "step": 6986 + }, + { + "epoch": 0.3894431748509002, + "grad_norm": 0.5635112524032593, + "learning_rate": 6.855168112548823e-05, + "loss": 1.8356, + "step": 6987 + }, + { + "epoch": 0.3894989131040633, + "grad_norm": 0.6175239086151123, + "learning_rate": 6.85434121844881e-05, + "loss": 2.1173, + "step": 6988 + }, + { + "epoch": 0.38955465135722644, + "grad_norm": 0.5377556085586548, + "learning_rate": 6.853514265539907e-05, + "loss": 1.6531, + "step": 6989 + }, + { + "epoch": 0.38961038961038963, + "grad_norm": 0.5529573559761047, + "learning_rate": 6.852687253848337e-05, + "loss": 1.7125, + "step": 6990 + }, + { + "epoch": 0.38966612786355276, + "grad_norm": 0.5733687877655029, + "learning_rate": 6.85186018340033e-05, + "loss": 1.8723, + "step": 6991 + }, + { + "epoch": 0.3897218661167159, + "grad_norm": 0.5605233311653137, + "learning_rate": 6.851033054222115e-05, + "loss": 1.9066, + "step": 6992 + }, + { + "epoch": 0.38977760436987907, + "grad_norm": 0.5196309089660645, + "learning_rate": 6.850205866339923e-05, + "loss": 1.6027, + "step": 6993 + }, + { + "epoch": 0.3898333426230422, + "grad_norm": 0.5691904425621033, + "learning_rate": 6.849378619779989e-05, + "loss": 1.7806, + "step": 6994 + }, + { + "epoch": 0.38988908087620533, + "grad_norm": 0.5791077017784119, + "learning_rate": 6.848551314568548e-05, + "loss": 1.8153, + "step": 6995 + }, + { + "epoch": 0.38994481912936846, + "grad_norm": 0.5611302256584167, + "learning_rate": 6.847723950731837e-05, + "loss": 1.7705, + "step": 6996 + }, + { + "epoch": 0.39000055738253164, + "grad_norm": 0.6004642248153687, + "learning_rate": 6.846896528296094e-05, + "loss": 1.6717, + "step": 6997 + }, + { + "epoch": 0.39005629563569477, + "grad_norm": 0.5229793787002563, + "learning_rate": 6.846069047287562e-05, + "loss": 1.6567, + "step": 6998 + }, + { + "epoch": 0.3901120338888579, + "grad_norm": 0.5206711888313293, + "learning_rate": 6.845241507732483e-05, + "loss": 1.3903, + "step": 6999 + }, + { + "epoch": 0.3901677721420211, + "grad_norm": 0.6022440791130066, + "learning_rate": 6.844413909657104e-05, + "loss": 1.8607, + "step": 7000 + }, + { + "epoch": 0.3902235103951842, + "grad_norm": 0.5634634494781494, + "learning_rate": 6.843586253087666e-05, + "loss": 1.6199, + "step": 7001 + }, + { + "epoch": 0.39027924864834734, + "grad_norm": 0.5622709393501282, + "learning_rate": 6.842758538050422e-05, + "loss": 1.5923, + "step": 7002 + }, + { + "epoch": 0.39033498690151053, + "grad_norm": 0.5336858034133911, + "learning_rate": 6.841930764571623e-05, + "loss": 1.6086, + "step": 7003 + }, + { + "epoch": 0.39039072515467366, + "grad_norm": 0.6216438412666321, + "learning_rate": 6.841102932677517e-05, + "loss": 1.8973, + "step": 7004 + }, + { + "epoch": 0.3904464634078368, + "grad_norm": 0.5596641898155212, + "learning_rate": 6.840275042394363e-05, + "loss": 1.4897, + "step": 7005 + }, + { + "epoch": 0.39050220166099997, + "grad_norm": 0.5638755559921265, + "learning_rate": 6.839447093748413e-05, + "loss": 1.7267, + "step": 7006 + }, + { + "epoch": 0.3905579399141631, + "grad_norm": 0.5759851932525635, + "learning_rate": 6.838619086765925e-05, + "loss": 1.9025, + "step": 7007 + }, + { + "epoch": 0.39061367816732623, + "grad_norm": 0.5657535791397095, + "learning_rate": 6.83779102147316e-05, + "loss": 1.6509, + "step": 7008 + }, + { + "epoch": 0.39066941642048936, + "grad_norm": 0.5276607275009155, + "learning_rate": 6.83696289789638e-05, + "loss": 1.6244, + "step": 7009 + }, + { + "epoch": 0.39072515467365254, + "grad_norm": 0.6091243624687195, + "learning_rate": 6.836134716061845e-05, + "loss": 1.7403, + "step": 7010 + }, + { + "epoch": 0.39078089292681567, + "grad_norm": 0.5518734455108643, + "learning_rate": 6.835306475995823e-05, + "loss": 1.6201, + "step": 7011 + }, + { + "epoch": 0.3908366311799788, + "grad_norm": 0.5169443488121033, + "learning_rate": 6.834478177724581e-05, + "loss": 1.5593, + "step": 7012 + }, + { + "epoch": 0.390892369433142, + "grad_norm": 0.5405734181404114, + "learning_rate": 6.833649821274386e-05, + "loss": 1.6275, + "step": 7013 + }, + { + "epoch": 0.3909481076863051, + "grad_norm": 0.639498233795166, + "learning_rate": 6.83282140667151e-05, + "loss": 1.9288, + "step": 7014 + }, + { + "epoch": 0.39100384593946824, + "grad_norm": 0.5509902238845825, + "learning_rate": 6.831992933942225e-05, + "loss": 1.6756, + "step": 7015 + }, + { + "epoch": 0.3910595841926314, + "grad_norm": 0.6026686429977417, + "learning_rate": 6.831164403112806e-05, + "loss": 1.8422, + "step": 7016 + }, + { + "epoch": 0.39111532244579456, + "grad_norm": 0.4942910969257355, + "learning_rate": 6.830335814209527e-05, + "loss": 1.407, + "step": 7017 + }, + { + "epoch": 0.3911710606989577, + "grad_norm": 0.5921064615249634, + "learning_rate": 6.829507167258671e-05, + "loss": 1.7507, + "step": 7018 + }, + { + "epoch": 0.3912267989521208, + "grad_norm": 0.5901893377304077, + "learning_rate": 6.828678462286511e-05, + "loss": 1.9612, + "step": 7019 + }, + { + "epoch": 0.391282537205284, + "grad_norm": 0.5834552049636841, + "learning_rate": 6.827849699319333e-05, + "loss": 1.8656, + "step": 7020 + }, + { + "epoch": 0.3913382754584471, + "grad_norm": 0.5791158080101013, + "learning_rate": 6.827020878383418e-05, + "loss": 1.6849, + "step": 7021 + }, + { + "epoch": 0.39139401371161026, + "grad_norm": 0.6698895692825317, + "learning_rate": 6.826191999505056e-05, + "loss": 1.9619, + "step": 7022 + }, + { + "epoch": 0.39144975196477344, + "grad_norm": 0.5854638814926147, + "learning_rate": 6.82536306271053e-05, + "loss": 1.6066, + "step": 7023 + }, + { + "epoch": 0.39150549021793657, + "grad_norm": 0.5511733293533325, + "learning_rate": 6.82453406802613e-05, + "loss": 1.8761, + "step": 7024 + }, + { + "epoch": 0.3915612284710997, + "grad_norm": 0.5574920177459717, + "learning_rate": 6.823705015478148e-05, + "loss": 1.494, + "step": 7025 + }, + { + "epoch": 0.3916169667242629, + "grad_norm": 0.5293987989425659, + "learning_rate": 6.822875905092876e-05, + "loss": 1.4918, + "step": 7026 + }, + { + "epoch": 0.391672704977426, + "grad_norm": 0.5626353621482849, + "learning_rate": 6.822046736896607e-05, + "loss": 1.7521, + "step": 7027 + }, + { + "epoch": 0.39172844323058914, + "grad_norm": 0.5664160847663879, + "learning_rate": 6.821217510915639e-05, + "loss": 1.5782, + "step": 7028 + }, + { + "epoch": 0.3917841814837523, + "grad_norm": 0.5288576483726501, + "learning_rate": 6.820388227176271e-05, + "loss": 1.4754, + "step": 7029 + }, + { + "epoch": 0.39183991973691545, + "grad_norm": 0.5488860607147217, + "learning_rate": 6.819558885704801e-05, + "loss": 1.6245, + "step": 7030 + }, + { + "epoch": 0.3918956579900786, + "grad_norm": 0.5747123956680298, + "learning_rate": 6.818729486527533e-05, + "loss": 1.7134, + "step": 7031 + }, + { + "epoch": 0.3919513962432417, + "grad_norm": 0.5334782600402832, + "learning_rate": 6.817900029670769e-05, + "loss": 1.6473, + "step": 7032 + }, + { + "epoch": 0.3920071344964049, + "grad_norm": 0.5332539081573486, + "learning_rate": 6.817070515160815e-05, + "loss": 1.4961, + "step": 7033 + }, + { + "epoch": 0.392062872749568, + "grad_norm": 0.5700680017471313, + "learning_rate": 6.816240943023977e-05, + "loss": 1.8336, + "step": 7034 + }, + { + "epoch": 0.39211861100273115, + "grad_norm": 0.5893431901931763, + "learning_rate": 6.815411313286568e-05, + "loss": 1.8517, + "step": 7035 + }, + { + "epoch": 0.39217434925589434, + "grad_norm": 0.5954105854034424, + "learning_rate": 6.814581625974897e-05, + "loss": 1.8405, + "step": 7036 + }, + { + "epoch": 0.39223008750905747, + "grad_norm": 0.5694375038146973, + "learning_rate": 6.813751881115275e-05, + "loss": 1.7636, + "step": 7037 + }, + { + "epoch": 0.3922858257622206, + "grad_norm": 0.6035060286521912, + "learning_rate": 6.812922078734019e-05, + "loss": 1.8142, + "step": 7038 + }, + { + "epoch": 0.3923415640153838, + "grad_norm": 0.6111207008361816, + "learning_rate": 6.812092218857444e-05, + "loss": 1.7048, + "step": 7039 + }, + { + "epoch": 0.3923973022685469, + "grad_norm": 0.5596774220466614, + "learning_rate": 6.811262301511869e-05, + "loss": 1.652, + "step": 7040 + }, + { + "epoch": 0.39245304052171004, + "grad_norm": 0.5244095921516418, + "learning_rate": 6.810432326723615e-05, + "loss": 1.325, + "step": 7041 + }, + { + "epoch": 0.39250877877487317, + "grad_norm": 0.5797486305236816, + "learning_rate": 6.809602294519004e-05, + "loss": 1.7832, + "step": 7042 + }, + { + "epoch": 0.39256451702803635, + "grad_norm": 0.5226321816444397, + "learning_rate": 6.808772204924357e-05, + "loss": 1.6449, + "step": 7043 + }, + { + "epoch": 0.3926202552811995, + "grad_norm": 0.5220246911048889, + "learning_rate": 6.807942057966003e-05, + "loss": 1.6308, + "step": 7044 + }, + { + "epoch": 0.3926759935343626, + "grad_norm": 0.7185441255569458, + "learning_rate": 6.807111853670268e-05, + "loss": 1.6675, + "step": 7045 + }, + { + "epoch": 0.3927317317875258, + "grad_norm": 0.6072642803192139, + "learning_rate": 6.806281592063481e-05, + "loss": 1.8951, + "step": 7046 + }, + { + "epoch": 0.3927874700406889, + "grad_norm": 0.5583004355430603, + "learning_rate": 6.805451273171972e-05, + "loss": 1.686, + "step": 7047 + }, + { + "epoch": 0.39284320829385205, + "grad_norm": 0.5066385865211487, + "learning_rate": 6.804620897022076e-05, + "loss": 1.407, + "step": 7048 + }, + { + "epoch": 0.39289894654701524, + "grad_norm": 0.5519012212753296, + "learning_rate": 6.803790463640127e-05, + "loss": 1.8137, + "step": 7049 + }, + { + "epoch": 0.39295468480017837, + "grad_norm": 0.5573792457580566, + "learning_rate": 6.802959973052461e-05, + "loss": 1.7861, + "step": 7050 + }, + { + "epoch": 0.3930104230533415, + "grad_norm": 0.5672924518585205, + "learning_rate": 6.802129425285417e-05, + "loss": 1.6572, + "step": 7051 + }, + { + "epoch": 0.3930661613065047, + "grad_norm": 0.5737549066543579, + "learning_rate": 6.801298820365333e-05, + "loss": 1.7467, + "step": 7052 + }, + { + "epoch": 0.3931218995596678, + "grad_norm": 0.5474954843521118, + "learning_rate": 6.800468158318554e-05, + "loss": 1.7429, + "step": 7053 + }, + { + "epoch": 0.39317763781283094, + "grad_norm": 0.549497127532959, + "learning_rate": 6.799637439171424e-05, + "loss": 1.764, + "step": 7054 + }, + { + "epoch": 0.39323337606599407, + "grad_norm": 0.5415019392967224, + "learning_rate": 6.798806662950286e-05, + "loss": 1.4691, + "step": 7055 + }, + { + "epoch": 0.39328911431915725, + "grad_norm": 0.5431099534034729, + "learning_rate": 6.797975829681487e-05, + "loss": 1.5577, + "step": 7056 + }, + { + "epoch": 0.3933448525723204, + "grad_norm": 0.549314558506012, + "learning_rate": 6.79714493939138e-05, + "loss": 1.7471, + "step": 7057 + }, + { + "epoch": 0.3934005908254835, + "grad_norm": 0.5444470047950745, + "learning_rate": 6.796313992106313e-05, + "loss": 1.765, + "step": 7058 + }, + { + "epoch": 0.3934563290786467, + "grad_norm": 0.57083660364151, + "learning_rate": 6.795482987852638e-05, + "loss": 1.9101, + "step": 7059 + }, + { + "epoch": 0.3935120673318098, + "grad_norm": 0.5475842952728271, + "learning_rate": 6.794651926656711e-05, + "loss": 1.8193, + "step": 7060 + }, + { + "epoch": 0.39356780558497295, + "grad_norm": 0.5259652733802795, + "learning_rate": 6.793820808544891e-05, + "loss": 1.3794, + "step": 7061 + }, + { + "epoch": 0.39362354383813614, + "grad_norm": 0.5105850100517273, + "learning_rate": 6.792989633543531e-05, + "loss": 1.5634, + "step": 7062 + }, + { + "epoch": 0.39367928209129927, + "grad_norm": 0.5771433711051941, + "learning_rate": 6.792158401678994e-05, + "loss": 1.6858, + "step": 7063 + }, + { + "epoch": 0.3937350203444624, + "grad_norm": 0.5675138235092163, + "learning_rate": 6.791327112977644e-05, + "loss": 1.8272, + "step": 7064 + }, + { + "epoch": 0.3937907585976255, + "grad_norm": 0.5633112788200378, + "learning_rate": 6.790495767465839e-05, + "loss": 1.7226, + "step": 7065 + }, + { + "epoch": 0.3938464968507887, + "grad_norm": 0.5350648760795593, + "learning_rate": 6.789664365169947e-05, + "loss": 1.5082, + "step": 7066 + }, + { + "epoch": 0.39390223510395184, + "grad_norm": 0.5656428337097168, + "learning_rate": 6.788832906116338e-05, + "loss": 1.4914, + "step": 7067 + }, + { + "epoch": 0.39395797335711497, + "grad_norm": 0.5312878489494324, + "learning_rate": 6.78800139033138e-05, + "loss": 1.5864, + "step": 7068 + }, + { + "epoch": 0.39401371161027815, + "grad_norm": 0.6321331262588501, + "learning_rate": 6.787169817841442e-05, + "loss": 1.9452, + "step": 7069 + }, + { + "epoch": 0.3940694498634413, + "grad_norm": 0.5593883991241455, + "learning_rate": 6.786338188672896e-05, + "loss": 1.7637, + "step": 7070 + }, + { + "epoch": 0.3941251881166044, + "grad_norm": 0.5405465960502625, + "learning_rate": 6.785506502852118e-05, + "loss": 1.6875, + "step": 7071 + }, + { + "epoch": 0.3941809263697676, + "grad_norm": 0.5527162551879883, + "learning_rate": 6.784674760405482e-05, + "loss": 1.6496, + "step": 7072 + }, + { + "epoch": 0.3942366646229307, + "grad_norm": 0.5357568264007568, + "learning_rate": 6.78384296135937e-05, + "loss": 1.7234, + "step": 7073 + }, + { + "epoch": 0.39429240287609385, + "grad_norm": 0.5588380694389343, + "learning_rate": 6.783011105740162e-05, + "loss": 1.9166, + "step": 7074 + }, + { + "epoch": 0.39434814112925703, + "grad_norm": 0.7392244338989258, + "learning_rate": 6.782179193574234e-05, + "loss": 1.6746, + "step": 7075 + }, + { + "epoch": 0.39440387938242016, + "grad_norm": 0.5365987420082092, + "learning_rate": 6.781347224887974e-05, + "loss": 1.6615, + "step": 7076 + }, + { + "epoch": 0.3944596176355833, + "grad_norm": 0.5493837594985962, + "learning_rate": 6.780515199707766e-05, + "loss": 1.7271, + "step": 7077 + }, + { + "epoch": 0.3945153558887464, + "grad_norm": 0.5309239029884338, + "learning_rate": 6.779683118059997e-05, + "loss": 1.5172, + "step": 7078 + }, + { + "epoch": 0.3945710941419096, + "grad_norm": 0.5167561769485474, + "learning_rate": 6.778850979971057e-05, + "loss": 1.5777, + "step": 7079 + }, + { + "epoch": 0.39462683239507274, + "grad_norm": 0.5119823217391968, + "learning_rate": 6.778018785467332e-05, + "loss": 1.5685, + "step": 7080 + }, + { + "epoch": 0.39468257064823586, + "grad_norm": 0.5578561425209045, + "learning_rate": 6.777186534575222e-05, + "loss": 1.6626, + "step": 7081 + }, + { + "epoch": 0.39473830890139905, + "grad_norm": 0.535065233707428, + "learning_rate": 6.776354227321114e-05, + "loss": 1.5554, + "step": 7082 + }, + { + "epoch": 0.3947940471545622, + "grad_norm": 0.5996119976043701, + "learning_rate": 6.775521863731408e-05, + "loss": 1.613, + "step": 7083 + }, + { + "epoch": 0.3948497854077253, + "grad_norm": 0.5490982532501221, + "learning_rate": 6.7746894438325e-05, + "loss": 1.6554, + "step": 7084 + }, + { + "epoch": 0.3949055236608885, + "grad_norm": 0.5607420802116394, + "learning_rate": 6.773856967650789e-05, + "loss": 1.7542, + "step": 7085 + }, + { + "epoch": 0.3949612619140516, + "grad_norm": 0.594559907913208, + "learning_rate": 6.773024435212678e-05, + "loss": 1.8008, + "step": 7086 + }, + { + "epoch": 0.39501700016721475, + "grad_norm": 0.5436771512031555, + "learning_rate": 6.77219184654457e-05, + "loss": 1.6853, + "step": 7087 + }, + { + "epoch": 0.3950727384203779, + "grad_norm": 0.6430955529212952, + "learning_rate": 6.771359201672868e-05, + "loss": 1.877, + "step": 7088 + }, + { + "epoch": 0.39512847667354106, + "grad_norm": 0.5667055249214172, + "learning_rate": 6.770526500623982e-05, + "loss": 1.5347, + "step": 7089 + }, + { + "epoch": 0.3951842149267042, + "grad_norm": 0.5299628376960754, + "learning_rate": 6.769693743424317e-05, + "loss": 1.6611, + "step": 7090 + }, + { + "epoch": 0.3952399531798673, + "grad_norm": 0.6088326573371887, + "learning_rate": 6.768860930100285e-05, + "loss": 1.991, + "step": 7091 + }, + { + "epoch": 0.3952956914330305, + "grad_norm": 0.5899388790130615, + "learning_rate": 6.768028060678296e-05, + "loss": 1.8402, + "step": 7092 + }, + { + "epoch": 0.39535142968619363, + "grad_norm": 0.5693525075912476, + "learning_rate": 6.767195135184765e-05, + "loss": 1.6969, + "step": 7093 + }, + { + "epoch": 0.39540716793935676, + "grad_norm": 0.5347588658332825, + "learning_rate": 6.766362153646111e-05, + "loss": 1.6525, + "step": 7094 + }, + { + "epoch": 0.39546290619251995, + "grad_norm": 0.5795377492904663, + "learning_rate": 6.765529116088745e-05, + "loss": 1.7744, + "step": 7095 + }, + { + "epoch": 0.3955186444456831, + "grad_norm": 0.5230005979537964, + "learning_rate": 6.764696022539091e-05, + "loss": 1.6068, + "step": 7096 + }, + { + "epoch": 0.3955743826988462, + "grad_norm": 0.5676483511924744, + "learning_rate": 6.763862873023567e-05, + "loss": 1.6501, + "step": 7097 + }, + { + "epoch": 0.3956301209520094, + "grad_norm": 0.5104279518127441, + "learning_rate": 6.763029667568597e-05, + "loss": 1.5805, + "step": 7098 + }, + { + "epoch": 0.3956858592051725, + "grad_norm": 0.575018048286438, + "learning_rate": 6.762196406200604e-05, + "loss": 1.7185, + "step": 7099 + }, + { + "epoch": 0.39574159745833565, + "grad_norm": 0.5459030270576477, + "learning_rate": 6.761363088946017e-05, + "loss": 1.7264, + "step": 7100 + }, + { + "epoch": 0.3957973357114988, + "grad_norm": 0.5303768515586853, + "learning_rate": 6.760529715831262e-05, + "loss": 1.6626, + "step": 7101 + }, + { + "epoch": 0.39585307396466196, + "grad_norm": 0.5729551911354065, + "learning_rate": 6.759696286882769e-05, + "loss": 1.827, + "step": 7102 + }, + { + "epoch": 0.3959088122178251, + "grad_norm": 0.578536331653595, + "learning_rate": 6.758862802126969e-05, + "loss": 1.8003, + "step": 7103 + }, + { + "epoch": 0.3959645504709882, + "grad_norm": 0.5476341247558594, + "learning_rate": 6.758029261590296e-05, + "loss": 1.7641, + "step": 7104 + }, + { + "epoch": 0.3960202887241514, + "grad_norm": 0.5585542917251587, + "learning_rate": 6.757195665299186e-05, + "loss": 1.6907, + "step": 7105 + }, + { + "epoch": 0.39607602697731453, + "grad_norm": 0.5314999222755432, + "learning_rate": 6.756362013280072e-05, + "loss": 1.5457, + "step": 7106 + }, + { + "epoch": 0.39613176523047766, + "grad_norm": 0.5275375247001648, + "learning_rate": 6.755528305559398e-05, + "loss": 1.6021, + "step": 7107 + }, + { + "epoch": 0.39618750348364085, + "grad_norm": 0.5544595122337341, + "learning_rate": 6.7546945421636e-05, + "loss": 1.5837, + "step": 7108 + }, + { + "epoch": 0.396243241736804, + "grad_norm": 0.6334085464477539, + "learning_rate": 6.753860723119122e-05, + "loss": 2.096, + "step": 7109 + }, + { + "epoch": 0.3962989799899671, + "grad_norm": 0.5980644822120667, + "learning_rate": 6.753026848452407e-05, + "loss": 1.9298, + "step": 7110 + }, + { + "epoch": 0.39635471824313023, + "grad_norm": 0.5179347991943359, + "learning_rate": 6.752192918189902e-05, + "loss": 1.702, + "step": 7111 + }, + { + "epoch": 0.3964104564962934, + "grad_norm": 0.5576172471046448, + "learning_rate": 6.751358932358052e-05, + "loss": 1.6217, + "step": 7112 + }, + { + "epoch": 0.39646619474945655, + "grad_norm": 0.5886361002922058, + "learning_rate": 6.750524890983309e-05, + "loss": 1.9734, + "step": 7113 + }, + { + "epoch": 0.3965219330026197, + "grad_norm": 0.573229193687439, + "learning_rate": 6.749690794092125e-05, + "loss": 1.9415, + "step": 7114 + }, + { + "epoch": 0.39657767125578286, + "grad_norm": 1.0474965572357178, + "learning_rate": 6.748856641710948e-05, + "loss": 2.0009, + "step": 7115 + }, + { + "epoch": 0.396633409508946, + "grad_norm": 0.5304273366928101, + "learning_rate": 6.748022433866236e-05, + "loss": 1.7601, + "step": 7116 + }, + { + "epoch": 0.3966891477621091, + "grad_norm": 0.5350653529167175, + "learning_rate": 6.747188170584444e-05, + "loss": 1.7173, + "step": 7117 + }, + { + "epoch": 0.3967448860152723, + "grad_norm": 0.5216551423072815, + "learning_rate": 6.746353851892028e-05, + "loss": 1.7054, + "step": 7118 + }, + { + "epoch": 0.39680062426843543, + "grad_norm": 0.5482343435287476, + "learning_rate": 6.745519477815451e-05, + "loss": 1.6456, + "step": 7119 + }, + { + "epoch": 0.39685636252159856, + "grad_norm": 0.5794587135314941, + "learning_rate": 6.744685048381174e-05, + "loss": 1.7264, + "step": 7120 + }, + { + "epoch": 0.39691210077476174, + "grad_norm": 0.5834348797798157, + "learning_rate": 6.743850563615659e-05, + "loss": 1.7025, + "step": 7121 + }, + { + "epoch": 0.3969678390279249, + "grad_norm": 0.5380405187606812, + "learning_rate": 6.743016023545373e-05, + "loss": 1.5742, + "step": 7122 + }, + { + "epoch": 0.397023577281088, + "grad_norm": 0.5725619792938232, + "learning_rate": 6.742181428196777e-05, + "loss": 1.8845, + "step": 7123 + }, + { + "epoch": 0.39707931553425113, + "grad_norm": 0.5491376519203186, + "learning_rate": 6.741346777596347e-05, + "loss": 1.6998, + "step": 7124 + }, + { + "epoch": 0.3971350537874143, + "grad_norm": 0.5111629962921143, + "learning_rate": 6.74051207177055e-05, + "loss": 1.4712, + "step": 7125 + }, + { + "epoch": 0.39719079204057745, + "grad_norm": 0.5327715277671814, + "learning_rate": 6.739677310745856e-05, + "loss": 1.4259, + "step": 7126 + }, + { + "epoch": 0.3972465302937406, + "grad_norm": 0.585437536239624, + "learning_rate": 6.738842494548742e-05, + "loss": 1.6437, + "step": 7127 + }, + { + "epoch": 0.39730226854690376, + "grad_norm": 0.4905366599559784, + "learning_rate": 6.738007623205682e-05, + "loss": 1.537, + "step": 7128 + }, + { + "epoch": 0.3973580068000669, + "grad_norm": 0.578807532787323, + "learning_rate": 6.737172696743155e-05, + "loss": 1.7359, + "step": 7129 + }, + { + "epoch": 0.39741374505323, + "grad_norm": 0.5269452333450317, + "learning_rate": 6.736337715187638e-05, + "loss": 1.632, + "step": 7130 + }, + { + "epoch": 0.3974694833063932, + "grad_norm": 0.6212645769119263, + "learning_rate": 6.735502678565611e-05, + "loss": 1.6633, + "step": 7131 + }, + { + "epoch": 0.39752522155955633, + "grad_norm": 0.5281040668487549, + "learning_rate": 6.734667586903557e-05, + "loss": 1.6349, + "step": 7132 + }, + { + "epoch": 0.39758095981271946, + "grad_norm": 0.6241141557693481, + "learning_rate": 6.733832440227963e-05, + "loss": 1.8522, + "step": 7133 + }, + { + "epoch": 0.3976366980658826, + "grad_norm": 0.5351576805114746, + "learning_rate": 6.732997238565311e-05, + "loss": 1.8608, + "step": 7134 + }, + { + "epoch": 0.3976924363190458, + "grad_norm": 0.6173853278160095, + "learning_rate": 6.732161981942093e-05, + "loss": 1.7628, + "step": 7135 + }, + { + "epoch": 0.3977481745722089, + "grad_norm": 0.5938517451286316, + "learning_rate": 6.731326670384794e-05, + "loss": 1.7216, + "step": 7136 + }, + { + "epoch": 0.39780391282537203, + "grad_norm": 0.5863813161849976, + "learning_rate": 6.730491303919907e-05, + "loss": 1.6816, + "step": 7137 + }, + { + "epoch": 0.3978596510785352, + "grad_norm": 0.6825369596481323, + "learning_rate": 6.729655882573928e-05, + "loss": 1.9808, + "step": 7138 + }, + { + "epoch": 0.39791538933169834, + "grad_norm": 0.5284822583198547, + "learning_rate": 6.728820406373346e-05, + "loss": 1.8237, + "step": 7139 + }, + { + "epoch": 0.3979711275848615, + "grad_norm": 0.554270327091217, + "learning_rate": 6.727984875344663e-05, + "loss": 1.61, + "step": 7140 + }, + { + "epoch": 0.39802686583802466, + "grad_norm": 0.6326965093612671, + "learning_rate": 6.727149289514373e-05, + "loss": 2.1011, + "step": 7141 + }, + { + "epoch": 0.3980826040911878, + "grad_norm": 0.5701342225074768, + "learning_rate": 6.72631364890898e-05, + "loss": 1.6724, + "step": 7142 + }, + { + "epoch": 0.3981383423443509, + "grad_norm": 0.5414735078811646, + "learning_rate": 6.725477953554979e-05, + "loss": 1.5425, + "step": 7143 + }, + { + "epoch": 0.3981940805975141, + "grad_norm": 0.5954646468162537, + "learning_rate": 6.72464220347888e-05, + "loss": 1.6308, + "step": 7144 + }, + { + "epoch": 0.39824981885067723, + "grad_norm": 0.6013423204421997, + "learning_rate": 6.723806398707185e-05, + "loss": 1.8022, + "step": 7145 + }, + { + "epoch": 0.39830555710384036, + "grad_norm": 0.5645208954811096, + "learning_rate": 6.722970539266403e-05, + "loss": 1.4448, + "step": 7146 + }, + { + "epoch": 0.3983612953570035, + "grad_norm": 0.6153306365013123, + "learning_rate": 6.72213462518304e-05, + "loss": 1.7358, + "step": 7147 + }, + { + "epoch": 0.39841703361016667, + "grad_norm": 0.5638027191162109, + "learning_rate": 6.721298656483608e-05, + "loss": 1.4709, + "step": 7148 + }, + { + "epoch": 0.3984727718633298, + "grad_norm": 0.5619633197784424, + "learning_rate": 6.720462633194618e-05, + "loss": 1.6085, + "step": 7149 + }, + { + "epoch": 0.39852851011649293, + "grad_norm": 0.5597891211509705, + "learning_rate": 6.719626555342585e-05, + "loss": 1.8059, + "step": 7150 + }, + { + "epoch": 0.3985842483696561, + "grad_norm": 0.5170794725418091, + "learning_rate": 6.718790422954021e-05, + "loss": 1.7492, + "step": 7151 + }, + { + "epoch": 0.39863998662281924, + "grad_norm": 0.5071738362312317, + "learning_rate": 6.717954236055449e-05, + "loss": 1.6074, + "step": 7152 + }, + { + "epoch": 0.39869572487598237, + "grad_norm": 0.5328095555305481, + "learning_rate": 6.717117994673384e-05, + "loss": 1.3657, + "step": 7153 + }, + { + "epoch": 0.39875146312914556, + "grad_norm": 0.5484116673469543, + "learning_rate": 6.716281698834346e-05, + "loss": 1.6112, + "step": 7154 + }, + { + "epoch": 0.3988072013823087, + "grad_norm": 0.5871725678443909, + "learning_rate": 6.715445348564862e-05, + "loss": 1.9087, + "step": 7155 + }, + { + "epoch": 0.3988629396354718, + "grad_norm": 0.5913428068161011, + "learning_rate": 6.714608943891452e-05, + "loss": 2.0278, + "step": 7156 + }, + { + "epoch": 0.39891867788863494, + "grad_norm": 0.5644116997718811, + "learning_rate": 6.713772484840645e-05, + "loss": 1.63, + "step": 7157 + }, + { + "epoch": 0.3989744161417981, + "grad_norm": 0.5353809595108032, + "learning_rate": 6.712935971438962e-05, + "loss": 1.6313, + "step": 7158 + }, + { + "epoch": 0.39903015439496126, + "grad_norm": 0.5755419731140137, + "learning_rate": 6.712099403712942e-05, + "loss": 1.7367, + "step": 7159 + }, + { + "epoch": 0.3990858926481244, + "grad_norm": 0.5571795105934143, + "learning_rate": 6.711262781689109e-05, + "loss": 1.8337, + "step": 7160 + }, + { + "epoch": 0.39914163090128757, + "grad_norm": 0.5910276174545288, + "learning_rate": 6.710426105394e-05, + "loss": 1.8474, + "step": 7161 + }, + { + "epoch": 0.3991973691544507, + "grad_norm": 0.5713383555412292, + "learning_rate": 6.709589374854144e-05, + "loss": 1.4712, + "step": 7162 + }, + { + "epoch": 0.3992531074076138, + "grad_norm": 0.6179262399673462, + "learning_rate": 6.708752590096082e-05, + "loss": 1.6399, + "step": 7163 + }, + { + "epoch": 0.399308845660777, + "grad_norm": 0.5618530511856079, + "learning_rate": 6.707915751146351e-05, + "loss": 1.6822, + "step": 7164 + }, + { + "epoch": 0.39936458391394014, + "grad_norm": 0.5299525260925293, + "learning_rate": 6.70707885803149e-05, + "loss": 1.4796, + "step": 7165 + }, + { + "epoch": 0.39942032216710327, + "grad_norm": 0.5534185767173767, + "learning_rate": 6.706241910778041e-05, + "loss": 1.844, + "step": 7166 + }, + { + "epoch": 0.39947606042026645, + "grad_norm": 0.5665568709373474, + "learning_rate": 6.705404909412547e-05, + "loss": 1.787, + "step": 7167 + }, + { + "epoch": 0.3995317986734296, + "grad_norm": 0.6122377514839172, + "learning_rate": 6.704567853961552e-05, + "loss": 1.7695, + "step": 7168 + }, + { + "epoch": 0.3995875369265927, + "grad_norm": 0.5161054730415344, + "learning_rate": 6.703730744451601e-05, + "loss": 1.5939, + "step": 7169 + }, + { + "epoch": 0.39964327517975584, + "grad_norm": 0.569864809513092, + "learning_rate": 6.702893580909247e-05, + "loss": 1.7385, + "step": 7170 + }, + { + "epoch": 0.399699013432919, + "grad_norm": 0.5484759211540222, + "learning_rate": 6.702056363361036e-05, + "loss": 1.6495, + "step": 7171 + }, + { + "epoch": 0.39975475168608215, + "grad_norm": 0.5385055541992188, + "learning_rate": 6.701219091833522e-05, + "loss": 1.8867, + "step": 7172 + }, + { + "epoch": 0.3998104899392453, + "grad_norm": 0.5519033074378967, + "learning_rate": 6.700381766353255e-05, + "loss": 1.7746, + "step": 7173 + }, + { + "epoch": 0.39986622819240847, + "grad_norm": 0.6148980259895325, + "learning_rate": 6.699544386946795e-05, + "loss": 1.8656, + "step": 7174 + }, + { + "epoch": 0.3999219664455716, + "grad_norm": 0.569527268409729, + "learning_rate": 6.698706953640693e-05, + "loss": 1.6071, + "step": 7175 + }, + { + "epoch": 0.3999777046987347, + "grad_norm": 0.5626715421676636, + "learning_rate": 6.697869466461513e-05, + "loss": 1.8849, + "step": 7176 + }, + { + "epoch": 0.4000334429518979, + "grad_norm": 0.5838245153427124, + "learning_rate": 6.69703192543581e-05, + "loss": 1.7764, + "step": 7177 + }, + { + "epoch": 0.40008918120506104, + "grad_norm": 0.552139937877655, + "learning_rate": 6.696194330590151e-05, + "loss": 1.6598, + "step": 7178 + }, + { + "epoch": 0.40014491945822417, + "grad_norm": 0.5443406105041504, + "learning_rate": 6.695356681951099e-05, + "loss": 1.6139, + "step": 7179 + }, + { + "epoch": 0.4002006577113873, + "grad_norm": 0.5214937329292297, + "learning_rate": 6.694518979545214e-05, + "loss": 1.6783, + "step": 7180 + }, + { + "epoch": 0.4002563959645505, + "grad_norm": 0.5553892254829407, + "learning_rate": 6.69368122339907e-05, + "loss": 1.6699, + "step": 7181 + }, + { + "epoch": 0.4003121342177136, + "grad_norm": 0.5150647163391113, + "learning_rate": 6.692843413539229e-05, + "loss": 1.532, + "step": 7182 + }, + { + "epoch": 0.40036787247087674, + "grad_norm": 0.5763303637504578, + "learning_rate": 6.692005549992268e-05, + "loss": 1.9554, + "step": 7183 + }, + { + "epoch": 0.4004236107240399, + "grad_norm": 0.5533180832862854, + "learning_rate": 6.691167632784754e-05, + "loss": 1.4465, + "step": 7184 + }, + { + "epoch": 0.40047934897720305, + "grad_norm": 0.5495351552963257, + "learning_rate": 6.690329661943265e-05, + "loss": 1.6263, + "step": 7185 + }, + { + "epoch": 0.4005350872303662, + "grad_norm": 0.5440528988838196, + "learning_rate": 6.689491637494371e-05, + "loss": 1.8053, + "step": 7186 + }, + { + "epoch": 0.40059082548352937, + "grad_norm": 0.5240649580955505, + "learning_rate": 6.688653559464655e-05, + "loss": 1.6647, + "step": 7187 + }, + { + "epoch": 0.4006465637366925, + "grad_norm": 0.5496859550476074, + "learning_rate": 6.687815427880694e-05, + "loss": 1.7904, + "step": 7188 + }, + { + "epoch": 0.4007023019898556, + "grad_norm": 0.5740963816642761, + "learning_rate": 6.686977242769067e-05, + "loss": 1.8628, + "step": 7189 + }, + { + "epoch": 0.4007580402430188, + "grad_norm": 0.5899214148521423, + "learning_rate": 6.686139004156358e-05, + "loss": 1.6146, + "step": 7190 + }, + { + "epoch": 0.40081377849618194, + "grad_norm": 0.5265205502510071, + "learning_rate": 6.68530071206915e-05, + "loss": 1.683, + "step": 7191 + }, + { + "epoch": 0.40086951674934507, + "grad_norm": 0.560076892375946, + "learning_rate": 6.684462366534032e-05, + "loss": 1.6757, + "step": 7192 + }, + { + "epoch": 0.4009252550025082, + "grad_norm": 0.5472216010093689, + "learning_rate": 6.683623967577586e-05, + "loss": 1.7725, + "step": 7193 + }, + { + "epoch": 0.4009809932556714, + "grad_norm": 0.5014883875846863, + "learning_rate": 6.682785515226407e-05, + "loss": 1.4681, + "step": 7194 + }, + { + "epoch": 0.4010367315088345, + "grad_norm": 0.5076844692230225, + "learning_rate": 6.681947009507079e-05, + "loss": 1.4126, + "step": 7195 + }, + { + "epoch": 0.40109246976199764, + "grad_norm": 0.5327789187431335, + "learning_rate": 6.681108450446202e-05, + "loss": 1.6593, + "step": 7196 + }, + { + "epoch": 0.4011482080151608, + "grad_norm": 0.6164959073066711, + "learning_rate": 6.680269838070364e-05, + "loss": 1.9668, + "step": 7197 + }, + { + "epoch": 0.40120394626832395, + "grad_norm": 0.5150039792060852, + "learning_rate": 6.679431172406163e-05, + "loss": 1.4285, + "step": 7198 + }, + { + "epoch": 0.4012596845214871, + "grad_norm": 0.5839514136314392, + "learning_rate": 6.678592453480198e-05, + "loss": 1.8469, + "step": 7199 + }, + { + "epoch": 0.40131542277465027, + "grad_norm": 0.6449024677276611, + "learning_rate": 6.677753681319066e-05, + "loss": 2.1511, + "step": 7200 + }, + { + "epoch": 0.4013711610278134, + "grad_norm": 0.5425246357917786, + "learning_rate": 6.676914855949372e-05, + "loss": 1.8045, + "step": 7201 + }, + { + "epoch": 0.4014268992809765, + "grad_norm": 0.5886958241462708, + "learning_rate": 6.676075977397715e-05, + "loss": 1.7844, + "step": 7202 + }, + { + "epoch": 0.40148263753413965, + "grad_norm": 0.5560657382011414, + "learning_rate": 6.675237045690699e-05, + "loss": 1.7289, + "step": 7203 + }, + { + "epoch": 0.40153837578730284, + "grad_norm": 0.5133156776428223, + "learning_rate": 6.674398060854931e-05, + "loss": 1.4584, + "step": 7204 + }, + { + "epoch": 0.40159411404046597, + "grad_norm": 0.5923200845718384, + "learning_rate": 6.67355902291702e-05, + "loss": 1.8035, + "step": 7205 + }, + { + "epoch": 0.4016498522936291, + "grad_norm": 0.5706618428230286, + "learning_rate": 6.672719931903574e-05, + "loss": 1.781, + "step": 7206 + }, + { + "epoch": 0.4017055905467923, + "grad_norm": 0.548729419708252, + "learning_rate": 6.671880787841204e-05, + "loss": 1.7033, + "step": 7207 + }, + { + "epoch": 0.4017613287999554, + "grad_norm": 0.5980433225631714, + "learning_rate": 6.671041590756524e-05, + "loss": 1.7048, + "step": 7208 + }, + { + "epoch": 0.40181706705311854, + "grad_norm": 0.5054447054862976, + "learning_rate": 6.670202340676149e-05, + "loss": 1.6601, + "step": 7209 + }, + { + "epoch": 0.4018728053062817, + "grad_norm": 0.5414553880691528, + "learning_rate": 6.669363037626689e-05, + "loss": 1.619, + "step": 7210 + }, + { + "epoch": 0.40192854355944485, + "grad_norm": 0.5375347137451172, + "learning_rate": 6.66852368163477e-05, + "loss": 1.6898, + "step": 7211 + }, + { + "epoch": 0.401984281812608, + "grad_norm": 0.5620880722999573, + "learning_rate": 6.667684272727007e-05, + "loss": 1.4842, + "step": 7212 + }, + { + "epoch": 0.40204002006577116, + "grad_norm": 0.5257782936096191, + "learning_rate": 6.666844810930021e-05, + "loss": 1.5747, + "step": 7213 + }, + { + "epoch": 0.4020957583189343, + "grad_norm": 0.586007297039032, + "learning_rate": 6.666005296270439e-05, + "loss": 1.9183, + "step": 7214 + }, + { + "epoch": 0.4021514965720974, + "grad_norm": 0.5531460642814636, + "learning_rate": 6.66516572877488e-05, + "loss": 1.708, + "step": 7215 + }, + { + "epoch": 0.40220723482526055, + "grad_norm": 0.544386625289917, + "learning_rate": 6.664326108469974e-05, + "loss": 1.5666, + "step": 7216 + }, + { + "epoch": 0.40226297307842374, + "grad_norm": 0.5806384682655334, + "learning_rate": 6.663486435382347e-05, + "loss": 1.8389, + "step": 7217 + }, + { + "epoch": 0.40231871133158686, + "grad_norm": 0.6060808300971985, + "learning_rate": 6.66264670953863e-05, + "loss": 1.91, + "step": 7218 + }, + { + "epoch": 0.40237444958475, + "grad_norm": 0.5704980492591858, + "learning_rate": 6.661806930965452e-05, + "loss": 1.6892, + "step": 7219 + }, + { + "epoch": 0.4024301878379132, + "grad_norm": 0.5570072531700134, + "learning_rate": 6.660967099689448e-05, + "loss": 1.6718, + "step": 7220 + }, + { + "epoch": 0.4024859260910763, + "grad_norm": 0.5326122641563416, + "learning_rate": 6.66012721573725e-05, + "loss": 1.7055, + "step": 7221 + }, + { + "epoch": 0.40254166434423944, + "grad_norm": 0.5099365711212158, + "learning_rate": 6.659287279135499e-05, + "loss": 1.6732, + "step": 7222 + }, + { + "epoch": 0.4025974025974026, + "grad_norm": 0.5786659717559814, + "learning_rate": 6.658447289910827e-05, + "loss": 1.4223, + "step": 7223 + }, + { + "epoch": 0.40265314085056575, + "grad_norm": 0.5925951600074768, + "learning_rate": 6.657607248089879e-05, + "loss": 1.8696, + "step": 7224 + }, + { + "epoch": 0.4027088791037289, + "grad_norm": 0.5589519739151001, + "learning_rate": 6.65676715369929e-05, + "loss": 1.5046, + "step": 7225 + }, + { + "epoch": 0.402764617356892, + "grad_norm": 0.5450175404548645, + "learning_rate": 6.655927006765709e-05, + "loss": 1.6517, + "step": 7226 + }, + { + "epoch": 0.4028203556100552, + "grad_norm": 0.563928484916687, + "learning_rate": 6.655086807315778e-05, + "loss": 1.8544, + "step": 7227 + }, + { + "epoch": 0.4028760938632183, + "grad_norm": 0.5899096131324768, + "learning_rate": 6.654246555376144e-05, + "loss": 1.7556, + "step": 7228 + }, + { + "epoch": 0.40293183211638145, + "grad_norm": 0.5601338744163513, + "learning_rate": 6.653406250973451e-05, + "loss": 1.7469, + "step": 7229 + }, + { + "epoch": 0.40298757036954463, + "grad_norm": 0.5789577960968018, + "learning_rate": 6.652565894134355e-05, + "loss": 1.6428, + "step": 7230 + }, + { + "epoch": 0.40304330862270776, + "grad_norm": 0.5229625701904297, + "learning_rate": 6.651725484885503e-05, + "loss": 1.4699, + "step": 7231 + }, + { + "epoch": 0.4030990468758709, + "grad_norm": 0.5528407096862793, + "learning_rate": 6.650885023253548e-05, + "loss": 1.8881, + "step": 7232 + }, + { + "epoch": 0.4031547851290341, + "grad_norm": 0.5682995319366455, + "learning_rate": 6.650044509265147e-05, + "loss": 1.8263, + "step": 7233 + }, + { + "epoch": 0.4032105233821972, + "grad_norm": 0.5219863057136536, + "learning_rate": 6.649203942946954e-05, + "loss": 1.5232, + "step": 7234 + }, + { + "epoch": 0.40326626163536033, + "grad_norm": 0.5359931588172913, + "learning_rate": 6.648363324325627e-05, + "loss": 1.5617, + "step": 7235 + }, + { + "epoch": 0.4033219998885235, + "grad_norm": 0.5631711483001709, + "learning_rate": 6.647522653427825e-05, + "loss": 1.7428, + "step": 7236 + }, + { + "epoch": 0.40337773814168665, + "grad_norm": 0.5994919538497925, + "learning_rate": 6.646681930280211e-05, + "loss": 1.5538, + "step": 7237 + }, + { + "epoch": 0.4034334763948498, + "grad_norm": 0.5310835242271423, + "learning_rate": 6.645841154909448e-05, + "loss": 1.5501, + "step": 7238 + }, + { + "epoch": 0.4034892146480129, + "grad_norm": 0.7443162798881531, + "learning_rate": 6.6450003273422e-05, + "loss": 1.7322, + "step": 7239 + }, + { + "epoch": 0.4035449529011761, + "grad_norm": 0.5354825258255005, + "learning_rate": 6.644159447605131e-05, + "loss": 1.6913, + "step": 7240 + }, + { + "epoch": 0.4036006911543392, + "grad_norm": 0.5255858898162842, + "learning_rate": 6.64331851572491e-05, + "loss": 1.6574, + "step": 7241 + }, + { + "epoch": 0.40365642940750235, + "grad_norm": 0.531148374080658, + "learning_rate": 6.642477531728207e-05, + "loss": 1.5934, + "step": 7242 + }, + { + "epoch": 0.40371216766066553, + "grad_norm": 0.5981380939483643, + "learning_rate": 6.641636495641694e-05, + "loss": 1.8274, + "step": 7243 + }, + { + "epoch": 0.40376790591382866, + "grad_norm": 0.5403674840927124, + "learning_rate": 6.640795407492043e-05, + "loss": 1.4047, + "step": 7244 + }, + { + "epoch": 0.4038236441669918, + "grad_norm": 0.5610218048095703, + "learning_rate": 6.639954267305928e-05, + "loss": 1.8228, + "step": 7245 + }, + { + "epoch": 0.403879382420155, + "grad_norm": 0.5543003678321838, + "learning_rate": 6.639113075110025e-05, + "loss": 1.8899, + "step": 7246 + }, + { + "epoch": 0.4039351206733181, + "grad_norm": 0.5696173906326294, + "learning_rate": 6.63827183093101e-05, + "loss": 1.6491, + "step": 7247 + }, + { + "epoch": 0.40399085892648123, + "grad_norm": 0.5595298409461975, + "learning_rate": 6.637430534795567e-05, + "loss": 1.7502, + "step": 7248 + }, + { + "epoch": 0.40404659717964436, + "grad_norm": 0.5707483291625977, + "learning_rate": 6.636589186730373e-05, + "loss": 1.6643, + "step": 7249 + }, + { + "epoch": 0.40410233543280755, + "grad_norm": 0.5698502063751221, + "learning_rate": 6.635747786762113e-05, + "loss": 1.5516, + "step": 7250 + }, + { + "epoch": 0.4041580736859707, + "grad_norm": 0.5298511385917664, + "learning_rate": 6.63490633491747e-05, + "loss": 1.5581, + "step": 7251 + }, + { + "epoch": 0.4042138119391338, + "grad_norm": 0.5572474598884583, + "learning_rate": 6.63406483122313e-05, + "loss": 1.7449, + "step": 7252 + }, + { + "epoch": 0.404269550192297, + "grad_norm": 0.5807195901870728, + "learning_rate": 6.633223275705781e-05, + "loss": 1.6806, + "step": 7253 + }, + { + "epoch": 0.4043252884454601, + "grad_norm": 0.5467732548713684, + "learning_rate": 6.632381668392111e-05, + "loss": 1.742, + "step": 7254 + }, + { + "epoch": 0.40438102669862325, + "grad_norm": 0.5687143206596375, + "learning_rate": 6.631540009308813e-05, + "loss": 1.7586, + "step": 7255 + }, + { + "epoch": 0.40443676495178643, + "grad_norm": 0.5853325128555298, + "learning_rate": 6.630698298482578e-05, + "loss": 1.8601, + "step": 7256 + }, + { + "epoch": 0.40449250320494956, + "grad_norm": 0.5176242589950562, + "learning_rate": 6.629856535940101e-05, + "loss": 1.5131, + "step": 7257 + }, + { + "epoch": 0.4045482414581127, + "grad_norm": 0.5749338865280151, + "learning_rate": 6.629014721708076e-05, + "loss": 1.6167, + "step": 7258 + }, + { + "epoch": 0.4046039797112759, + "grad_norm": 0.6350910663604736, + "learning_rate": 6.628172855813203e-05, + "loss": 1.6698, + "step": 7259 + }, + { + "epoch": 0.404659717964439, + "grad_norm": 0.538773238658905, + "learning_rate": 6.627330938282182e-05, + "loss": 1.7449, + "step": 7260 + }, + { + "epoch": 0.40471545621760213, + "grad_norm": 0.5643429160118103, + "learning_rate": 6.62648896914171e-05, + "loss": 1.6906, + "step": 7261 + }, + { + "epoch": 0.40477119447076526, + "grad_norm": 0.5482378005981445, + "learning_rate": 6.62564694841849e-05, + "loss": 1.651, + "step": 7262 + }, + { + "epoch": 0.40482693272392845, + "grad_norm": 0.556492805480957, + "learning_rate": 6.624804876139227e-05, + "loss": 1.6232, + "step": 7263 + }, + { + "epoch": 0.4048826709770916, + "grad_norm": 0.5243347883224487, + "learning_rate": 6.623962752330627e-05, + "loss": 1.5745, + "step": 7264 + }, + { + "epoch": 0.4049384092302547, + "grad_norm": 0.5533580780029297, + "learning_rate": 6.623120577019396e-05, + "loss": 1.621, + "step": 7265 + }, + { + "epoch": 0.4049941474834179, + "grad_norm": 0.6168079376220703, + "learning_rate": 6.622278350232246e-05, + "loss": 1.8571, + "step": 7266 + }, + { + "epoch": 0.405049885736581, + "grad_norm": 0.5359664559364319, + "learning_rate": 6.621436071995884e-05, + "loss": 1.5815, + "step": 7267 + }, + { + "epoch": 0.40510562398974415, + "grad_norm": 0.6080171465873718, + "learning_rate": 6.620593742337022e-05, + "loss": 1.7069, + "step": 7268 + }, + { + "epoch": 0.40516136224290733, + "grad_norm": 0.5019293427467346, + "learning_rate": 6.619751361282377e-05, + "loss": 1.5408, + "step": 7269 + }, + { + "epoch": 0.40521710049607046, + "grad_norm": 0.5557806491851807, + "learning_rate": 6.618908928858663e-05, + "loss": 1.7405, + "step": 7270 + }, + { + "epoch": 0.4052728387492336, + "grad_norm": 0.5392197370529175, + "learning_rate": 6.618066445092595e-05, + "loss": 1.5968, + "step": 7271 + }, + { + "epoch": 0.4053285770023967, + "grad_norm": 0.621353030204773, + "learning_rate": 6.617223910010896e-05, + "loss": 1.8194, + "step": 7272 + }, + { + "epoch": 0.4053843152555599, + "grad_norm": 0.5642111301422119, + "learning_rate": 6.61638132364028e-05, + "loss": 1.4983, + "step": 7273 + }, + { + "epoch": 0.40544005350872303, + "grad_norm": 0.5767485499382019, + "learning_rate": 6.615538686007476e-05, + "loss": 1.6838, + "step": 7274 + }, + { + "epoch": 0.40549579176188616, + "grad_norm": 0.5635485649108887, + "learning_rate": 6.614695997139202e-05, + "loss": 1.87, + "step": 7275 + }, + { + "epoch": 0.40555153001504934, + "grad_norm": 0.617825448513031, + "learning_rate": 6.613853257062186e-05, + "loss": 1.839, + "step": 7276 + }, + { + "epoch": 0.4056072682682125, + "grad_norm": 0.5892661213874817, + "learning_rate": 6.613010465803153e-05, + "loss": 1.7833, + "step": 7277 + }, + { + "epoch": 0.4056630065213756, + "grad_norm": 0.6038499474525452, + "learning_rate": 6.612167623388834e-05, + "loss": 1.8361, + "step": 7278 + }, + { + "epoch": 0.4057187447745388, + "grad_norm": 0.5470013618469238, + "learning_rate": 6.611324729845958e-05, + "loss": 1.8218, + "step": 7279 + }, + { + "epoch": 0.4057744830277019, + "grad_norm": 0.5531765818595886, + "learning_rate": 6.610481785201254e-05, + "loss": 1.6214, + "step": 7280 + }, + { + "epoch": 0.40583022128086504, + "grad_norm": 0.5488517880439758, + "learning_rate": 6.60963878948146e-05, + "loss": 1.5644, + "step": 7281 + }, + { + "epoch": 0.40588595953402823, + "grad_norm": 0.5389445424079895, + "learning_rate": 6.608795742713306e-05, + "loss": 1.6407, + "step": 7282 + }, + { + "epoch": 0.40594169778719136, + "grad_norm": 0.5432456731796265, + "learning_rate": 6.607952644923534e-05, + "loss": 1.6906, + "step": 7283 + }, + { + "epoch": 0.4059974360403545, + "grad_norm": 0.5381740927696228, + "learning_rate": 6.607109496138877e-05, + "loss": 1.5545, + "step": 7284 + }, + { + "epoch": 0.4060531742935176, + "grad_norm": 0.5759360194206238, + "learning_rate": 6.606266296386078e-05, + "loss": 1.3279, + "step": 7285 + }, + { + "epoch": 0.4061089125466808, + "grad_norm": 0.5859653949737549, + "learning_rate": 6.605423045691875e-05, + "loss": 1.6515, + "step": 7286 + }, + { + "epoch": 0.40616465079984393, + "grad_norm": 0.5650625228881836, + "learning_rate": 6.604579744083015e-05, + "loss": 1.7375, + "step": 7287 + }, + { + "epoch": 0.40622038905300706, + "grad_norm": 0.5053606629371643, + "learning_rate": 6.60373639158624e-05, + "loss": 1.3345, + "step": 7288 + }, + { + "epoch": 0.40627612730617024, + "grad_norm": 0.559548020362854, + "learning_rate": 6.602892988228299e-05, + "loss": 1.5881, + "step": 7289 + }, + { + "epoch": 0.40633186555933337, + "grad_norm": 0.5711749196052551, + "learning_rate": 6.602049534035937e-05, + "loss": 1.6593, + "step": 7290 + }, + { + "epoch": 0.4063876038124965, + "grad_norm": 0.5415685176849365, + "learning_rate": 6.601206029035904e-05, + "loss": 1.7801, + "step": 7291 + }, + { + "epoch": 0.4064433420656597, + "grad_norm": 0.5906074643135071, + "learning_rate": 6.60036247325495e-05, + "loss": 1.8566, + "step": 7292 + }, + { + "epoch": 0.4064990803188228, + "grad_norm": 0.5831937789916992, + "learning_rate": 6.599518866719831e-05, + "loss": 1.6081, + "step": 7293 + }, + { + "epoch": 0.40655481857198594, + "grad_norm": 0.5068337917327881, + "learning_rate": 6.5986752094573e-05, + "loss": 1.5883, + "step": 7294 + }, + { + "epoch": 0.4066105568251491, + "grad_norm": 0.5402857065200806, + "learning_rate": 6.59783150149411e-05, + "loss": 1.7286, + "step": 7295 + }, + { + "epoch": 0.40666629507831226, + "grad_norm": 0.5793524980545044, + "learning_rate": 6.596987742857024e-05, + "loss": 1.782, + "step": 7296 + }, + { + "epoch": 0.4067220333314754, + "grad_norm": 0.5685024261474609, + "learning_rate": 6.596143933572795e-05, + "loss": 1.6989, + "step": 7297 + }, + { + "epoch": 0.4067777715846385, + "grad_norm": 0.5885668396949768, + "learning_rate": 6.595300073668188e-05, + "loss": 1.7724, + "step": 7298 + }, + { + "epoch": 0.4068335098378017, + "grad_norm": 0.5693629384040833, + "learning_rate": 6.594456163169963e-05, + "loss": 1.7927, + "step": 7299 + }, + { + "epoch": 0.40688924809096483, + "grad_norm": 0.6024751663208008, + "learning_rate": 6.593612202104885e-05, + "loss": 1.9269, + "step": 7300 + }, + { + "epoch": 0.40694498634412796, + "grad_norm": 0.5218265652656555, + "learning_rate": 6.59276819049972e-05, + "loss": 1.6254, + "step": 7301 + }, + { + "epoch": 0.40700072459729114, + "grad_norm": 0.6775539517402649, + "learning_rate": 6.591924128381234e-05, + "loss": 2.2446, + "step": 7302 + }, + { + "epoch": 0.40705646285045427, + "grad_norm": 0.5457693338394165, + "learning_rate": 6.591080015776196e-05, + "loss": 1.7268, + "step": 7303 + }, + { + "epoch": 0.4071122011036174, + "grad_norm": 0.5545173287391663, + "learning_rate": 6.590235852711377e-05, + "loss": 1.5403, + "step": 7304 + }, + { + "epoch": 0.4071679393567806, + "grad_norm": 0.5415998697280884, + "learning_rate": 6.589391639213549e-05, + "loss": 1.7487, + "step": 7305 + }, + { + "epoch": 0.4072236776099437, + "grad_norm": 0.535123884677887, + "learning_rate": 6.588547375309484e-05, + "loss": 1.8118, + "step": 7306 + }, + { + "epoch": 0.40727941586310684, + "grad_norm": 0.5559954643249512, + "learning_rate": 6.587703061025959e-05, + "loss": 1.7792, + "step": 7307 + }, + { + "epoch": 0.40733515411626997, + "grad_norm": 0.5952346920967102, + "learning_rate": 6.586858696389748e-05, + "loss": 1.8367, + "step": 7308 + }, + { + "epoch": 0.40739089236943316, + "grad_norm": 0.5658838152885437, + "learning_rate": 6.586014281427632e-05, + "loss": 1.8874, + "step": 7309 + }, + { + "epoch": 0.4074466306225963, + "grad_norm": 0.5443295240402222, + "learning_rate": 6.585169816166392e-05, + "loss": 1.6405, + "step": 7310 + }, + { + "epoch": 0.4075023688757594, + "grad_norm": 0.5414347648620605, + "learning_rate": 6.584325300632806e-05, + "loss": 1.7544, + "step": 7311 + }, + { + "epoch": 0.4075581071289226, + "grad_norm": 0.5387737154960632, + "learning_rate": 6.583480734853658e-05, + "loss": 1.6416, + "step": 7312 + }, + { + "epoch": 0.4076138453820857, + "grad_norm": 0.5518178343772888, + "learning_rate": 6.582636118855735e-05, + "loss": 1.7322, + "step": 7313 + }, + { + "epoch": 0.40766958363524886, + "grad_norm": 0.5452878475189209, + "learning_rate": 6.58179145266582e-05, + "loss": 1.7432, + "step": 7314 + }, + { + "epoch": 0.40772532188841204, + "grad_norm": 0.5074037313461304, + "learning_rate": 6.580946736310704e-05, + "loss": 1.6643, + "step": 7315 + }, + { + "epoch": 0.40778106014157517, + "grad_norm": 0.5745427012443542, + "learning_rate": 6.580101969817175e-05, + "loss": 1.8664, + "step": 7316 + }, + { + "epoch": 0.4078367983947383, + "grad_norm": 0.5891657471656799, + "learning_rate": 6.579257153212024e-05, + "loss": 1.8217, + "step": 7317 + }, + { + "epoch": 0.4078925366479015, + "grad_norm": 0.5395662188529968, + "learning_rate": 6.578412286522044e-05, + "loss": 1.5422, + "step": 7318 + }, + { + "epoch": 0.4079482749010646, + "grad_norm": 0.5738537907600403, + "learning_rate": 6.57756736977403e-05, + "loss": 1.753, + "step": 7319 + }, + { + "epoch": 0.40800401315422774, + "grad_norm": 0.5593982338905334, + "learning_rate": 6.576722402994775e-05, + "loss": 1.5805, + "step": 7320 + }, + { + "epoch": 0.40805975140739087, + "grad_norm": 0.6101201772689819, + "learning_rate": 6.575877386211077e-05, + "loss": 1.742, + "step": 7321 + }, + { + "epoch": 0.40811548966055405, + "grad_norm": 0.5429602265357971, + "learning_rate": 6.57503231944974e-05, + "loss": 1.7166, + "step": 7322 + }, + { + "epoch": 0.4081712279137172, + "grad_norm": 0.5799590349197388, + "learning_rate": 6.574187202737558e-05, + "loss": 1.8698, + "step": 7323 + }, + { + "epoch": 0.4082269661668803, + "grad_norm": 0.5671953558921814, + "learning_rate": 6.573342036101339e-05, + "loss": 1.5871, + "step": 7324 + }, + { + "epoch": 0.4082827044200435, + "grad_norm": 0.5521631836891174, + "learning_rate": 6.572496819567882e-05, + "loss": 1.6091, + "step": 7325 + }, + { + "epoch": 0.4083384426732066, + "grad_norm": 0.6058674454689026, + "learning_rate": 6.571651553163994e-05, + "loss": 1.9233, + "step": 7326 + }, + { + "epoch": 0.40839418092636975, + "grad_norm": 0.5595351457595825, + "learning_rate": 6.570806236916481e-05, + "loss": 1.681, + "step": 7327 + }, + { + "epoch": 0.40844991917953294, + "grad_norm": 0.5565963983535767, + "learning_rate": 6.569960870852156e-05, + "loss": 1.8081, + "step": 7328 + }, + { + "epoch": 0.40850565743269607, + "grad_norm": 0.5626837015151978, + "learning_rate": 6.569115454997823e-05, + "loss": 1.7268, + "step": 7329 + }, + { + "epoch": 0.4085613956858592, + "grad_norm": 0.5642188787460327, + "learning_rate": 6.568269989380296e-05, + "loss": 1.9007, + "step": 7330 + }, + { + "epoch": 0.4086171339390223, + "grad_norm": 0.5992141962051392, + "learning_rate": 6.56742447402639e-05, + "loss": 1.8163, + "step": 7331 + }, + { + "epoch": 0.4086728721921855, + "grad_norm": 0.5469499826431274, + "learning_rate": 6.566578908962918e-05, + "loss": 1.6564, + "step": 7332 + }, + { + "epoch": 0.40872861044534864, + "grad_norm": 0.5719706416130066, + "learning_rate": 6.565733294216697e-05, + "loss": 1.3752, + "step": 7333 + }, + { + "epoch": 0.40878434869851177, + "grad_norm": 0.5726919174194336, + "learning_rate": 6.564887629814543e-05, + "loss": 1.629, + "step": 7334 + }, + { + "epoch": 0.40884008695167495, + "grad_norm": 0.6024767160415649, + "learning_rate": 6.56404191578328e-05, + "loss": 1.6818, + "step": 7335 + }, + { + "epoch": 0.4088958252048381, + "grad_norm": 0.5598945021629333, + "learning_rate": 6.563196152149725e-05, + "loss": 1.6562, + "step": 7336 + }, + { + "epoch": 0.4089515634580012, + "grad_norm": 0.6022909283638, + "learning_rate": 6.562350338940704e-05, + "loss": 1.6497, + "step": 7337 + }, + { + "epoch": 0.4090073017111644, + "grad_norm": 0.5557130575180054, + "learning_rate": 6.561504476183037e-05, + "loss": 1.5777, + "step": 7338 + }, + { + "epoch": 0.4090630399643275, + "grad_norm": 0.556742787361145, + "learning_rate": 6.560658563903553e-05, + "loss": 1.6048, + "step": 7339 + }, + { + "epoch": 0.40911877821749065, + "grad_norm": 0.6215361952781677, + "learning_rate": 6.559812602129078e-05, + "loss": 1.85, + "step": 7340 + }, + { + "epoch": 0.40917451647065384, + "grad_norm": 0.5431729555130005, + "learning_rate": 6.558966590886443e-05, + "loss": 1.7366, + "step": 7341 + }, + { + "epoch": 0.40923025472381697, + "grad_norm": 0.5173145532608032, + "learning_rate": 6.558120530202476e-05, + "loss": 1.5962, + "step": 7342 + }, + { + "epoch": 0.4092859929769801, + "grad_norm": 0.558746874332428, + "learning_rate": 6.55727442010401e-05, + "loss": 1.6842, + "step": 7343 + }, + { + "epoch": 0.4093417312301432, + "grad_norm": 0.5484337210655212, + "learning_rate": 6.55642826061788e-05, + "loss": 1.8824, + "step": 7344 + }, + { + "epoch": 0.4093974694833064, + "grad_norm": 0.5415590405464172, + "learning_rate": 6.55558205177092e-05, + "loss": 1.7393, + "step": 7345 + }, + { + "epoch": 0.40945320773646954, + "grad_norm": 0.5736859440803528, + "learning_rate": 6.554735793589967e-05, + "loss": 1.6012, + "step": 7346 + }, + { + "epoch": 0.40950894598963267, + "grad_norm": 0.5511910319328308, + "learning_rate": 6.553889486101857e-05, + "loss": 1.6051, + "step": 7347 + }, + { + "epoch": 0.40956468424279585, + "grad_norm": 0.5481744408607483, + "learning_rate": 6.553043129333436e-05, + "loss": 1.6571, + "step": 7348 + }, + { + "epoch": 0.409620422495959, + "grad_norm": 0.7418869733810425, + "learning_rate": 6.55219672331154e-05, + "loss": 1.6247, + "step": 7349 + }, + { + "epoch": 0.4096761607491221, + "grad_norm": 0.5882282257080078, + "learning_rate": 6.551350268063015e-05, + "loss": 1.7125, + "step": 7350 + }, + { + "epoch": 0.4097318990022853, + "grad_norm": 0.6087817549705505, + "learning_rate": 6.550503763614702e-05, + "loss": 1.9143, + "step": 7351 + }, + { + "epoch": 0.4097876372554484, + "grad_norm": 0.5106980800628662, + "learning_rate": 6.549657209993452e-05, + "loss": 1.4884, + "step": 7352 + }, + { + "epoch": 0.40984337550861155, + "grad_norm": 0.5542812347412109, + "learning_rate": 6.548810607226109e-05, + "loss": 1.6739, + "step": 7353 + }, + { + "epoch": 0.4098991137617747, + "grad_norm": 0.6260994672775269, + "learning_rate": 6.547963955339526e-05, + "loss": 1.8902, + "step": 7354 + }, + { + "epoch": 0.40995485201493786, + "grad_norm": 0.5681547522544861, + "learning_rate": 6.547117254360549e-05, + "loss": 1.8688, + "step": 7355 + }, + { + "epoch": 0.410010590268101, + "grad_norm": 0.5453806519508362, + "learning_rate": 6.546270504316033e-05, + "loss": 1.7046, + "step": 7356 + }, + { + "epoch": 0.4100663285212641, + "grad_norm": 0.5230925679206848, + "learning_rate": 6.545423705232834e-05, + "loss": 1.6008, + "step": 7357 + }, + { + "epoch": 0.4101220667744273, + "grad_norm": 0.5534452795982361, + "learning_rate": 6.544576857137804e-05, + "loss": 1.806, + "step": 7358 + }, + { + "epoch": 0.41017780502759044, + "grad_norm": 0.586466908454895, + "learning_rate": 6.543729960057803e-05, + "loss": 1.8252, + "step": 7359 + }, + { + "epoch": 0.41023354328075357, + "grad_norm": 0.5712817311286926, + "learning_rate": 6.542883014019686e-05, + "loss": 1.6653, + "step": 7360 + }, + { + "epoch": 0.41028928153391675, + "grad_norm": 0.5666759014129639, + "learning_rate": 6.542036019050318e-05, + "loss": 1.7503, + "step": 7361 + }, + { + "epoch": 0.4103450197870799, + "grad_norm": 0.6092966198921204, + "learning_rate": 6.541188975176557e-05, + "loss": 2.0138, + "step": 7362 + }, + { + "epoch": 0.410400758040243, + "grad_norm": 0.5910922884941101, + "learning_rate": 6.540341882425267e-05, + "loss": 1.8193, + "step": 7363 + }, + { + "epoch": 0.4104564962934062, + "grad_norm": 0.5653868317604065, + "learning_rate": 6.539494740823313e-05, + "loss": 1.6905, + "step": 7364 + }, + { + "epoch": 0.4105122345465693, + "grad_norm": 0.5556957721710205, + "learning_rate": 6.538647550397563e-05, + "loss": 1.5966, + "step": 7365 + }, + { + "epoch": 0.41056797279973245, + "grad_norm": 0.6585522294044495, + "learning_rate": 6.537800311174882e-05, + "loss": 1.9665, + "step": 7366 + }, + { + "epoch": 0.4106237110528956, + "grad_norm": 0.5647701621055603, + "learning_rate": 6.536953023182143e-05, + "loss": 1.7119, + "step": 7367 + }, + { + "epoch": 0.41067944930605876, + "grad_norm": 0.5993644595146179, + "learning_rate": 6.536105686446214e-05, + "loss": 1.8307, + "step": 7368 + }, + { + "epoch": 0.4107351875592219, + "grad_norm": 0.5878274440765381, + "learning_rate": 6.535258300993969e-05, + "loss": 1.6834, + "step": 7369 + }, + { + "epoch": 0.410790925812385, + "grad_norm": 0.5731014609336853, + "learning_rate": 6.534410866852283e-05, + "loss": 1.7639, + "step": 7370 + }, + { + "epoch": 0.4108466640655482, + "grad_norm": 0.558718204498291, + "learning_rate": 6.533563384048029e-05, + "loss": 1.68, + "step": 7371 + }, + { + "epoch": 0.41090240231871134, + "grad_norm": 0.5906892418861389, + "learning_rate": 6.532715852608087e-05, + "loss": 1.6856, + "step": 7372 + }, + { + "epoch": 0.41095814057187446, + "grad_norm": 0.5575792193412781, + "learning_rate": 6.531868272559333e-05, + "loss": 1.6829, + "step": 7373 + }, + { + "epoch": 0.41101387882503765, + "grad_norm": 0.5349531769752502, + "learning_rate": 6.531020643928649e-05, + "loss": 1.666, + "step": 7374 + }, + { + "epoch": 0.4110696170782008, + "grad_norm": 0.5200047492980957, + "learning_rate": 6.530172966742918e-05, + "loss": 1.5504, + "step": 7375 + }, + { + "epoch": 0.4111253553313639, + "grad_norm": 0.599875271320343, + "learning_rate": 6.529325241029022e-05, + "loss": 1.8604, + "step": 7376 + }, + { + "epoch": 0.41118109358452704, + "grad_norm": 0.5267208814620972, + "learning_rate": 6.528477466813845e-05, + "loss": 1.5969, + "step": 7377 + }, + { + "epoch": 0.4112368318376902, + "grad_norm": 0.5209345817565918, + "learning_rate": 6.527629644124273e-05, + "loss": 1.5824, + "step": 7378 + }, + { + "epoch": 0.41129257009085335, + "grad_norm": 0.5929481983184814, + "learning_rate": 6.526781772987197e-05, + "loss": 1.9316, + "step": 7379 + }, + { + "epoch": 0.4113483083440165, + "grad_norm": 0.5629690885543823, + "learning_rate": 6.525933853429505e-05, + "loss": 1.6927, + "step": 7380 + }, + { + "epoch": 0.41140404659717966, + "grad_norm": 0.5802732110023499, + "learning_rate": 6.525085885478089e-05, + "loss": 1.7149, + "step": 7381 + }, + { + "epoch": 0.4114597848503428, + "grad_norm": 0.5767194032669067, + "learning_rate": 6.524237869159838e-05, + "loss": 1.6511, + "step": 7382 + }, + { + "epoch": 0.4115155231035059, + "grad_norm": 0.5414605140686035, + "learning_rate": 6.523389804501651e-05, + "loss": 1.5401, + "step": 7383 + }, + { + "epoch": 0.4115712613566691, + "grad_norm": 0.5376063585281372, + "learning_rate": 6.52254169153042e-05, + "loss": 1.6796, + "step": 7384 + }, + { + "epoch": 0.41162699960983223, + "grad_norm": 0.5899385809898376, + "learning_rate": 6.521693530273045e-05, + "loss": 1.7729, + "step": 7385 + }, + { + "epoch": 0.41168273786299536, + "grad_norm": 0.5602531433105469, + "learning_rate": 6.520845320756421e-05, + "loss": 1.6136, + "step": 7386 + }, + { + "epoch": 0.41173847611615855, + "grad_norm": 0.5425115823745728, + "learning_rate": 6.519997063007452e-05, + "loss": 1.5817, + "step": 7387 + }, + { + "epoch": 0.4117942143693217, + "grad_norm": 0.5449849963188171, + "learning_rate": 6.51914875705304e-05, + "loss": 1.6962, + "step": 7388 + }, + { + "epoch": 0.4118499526224848, + "grad_norm": 0.5851723551750183, + "learning_rate": 6.518300402920084e-05, + "loss": 2.035, + "step": 7389 + }, + { + "epoch": 0.41190569087564793, + "grad_norm": 0.5257713794708252, + "learning_rate": 6.517452000635493e-05, + "loss": 1.1806, + "step": 7390 + }, + { + "epoch": 0.4119614291288111, + "grad_norm": 0.5605010390281677, + "learning_rate": 6.516603550226171e-05, + "loss": 1.7513, + "step": 7391 + }, + { + "epoch": 0.41201716738197425, + "grad_norm": 0.6154865026473999, + "learning_rate": 6.515755051719026e-05, + "loss": 1.8616, + "step": 7392 + }, + { + "epoch": 0.4120729056351374, + "grad_norm": 0.5920423269271851, + "learning_rate": 6.51490650514097e-05, + "loss": 1.7594, + "step": 7393 + }, + { + "epoch": 0.41212864388830056, + "grad_norm": 0.545600414276123, + "learning_rate": 6.514057910518913e-05, + "loss": 1.5641, + "step": 7394 + }, + { + "epoch": 0.4121843821414637, + "grad_norm": 0.5568488836288452, + "learning_rate": 6.513209267879765e-05, + "loss": 1.6398, + "step": 7395 + }, + { + "epoch": 0.4122401203946268, + "grad_norm": 0.5209145545959473, + "learning_rate": 6.512360577250443e-05, + "loss": 1.4485, + "step": 7396 + }, + { + "epoch": 0.41229585864779, + "grad_norm": 0.5175876021385193, + "learning_rate": 6.511511838657859e-05, + "loss": 1.6851, + "step": 7397 + }, + { + "epoch": 0.41235159690095313, + "grad_norm": 0.5393850803375244, + "learning_rate": 6.510663052128934e-05, + "loss": 1.6724, + "step": 7398 + }, + { + "epoch": 0.41240733515411626, + "grad_norm": 0.5579698085784912, + "learning_rate": 6.509814217690582e-05, + "loss": 1.7999, + "step": 7399 + }, + { + "epoch": 0.4124630734072794, + "grad_norm": 0.5217966437339783, + "learning_rate": 6.508965335369729e-05, + "loss": 1.5216, + "step": 7400 + }, + { + "epoch": 0.4125188116604426, + "grad_norm": 0.5507352352142334, + "learning_rate": 6.508116405193292e-05, + "loss": 1.5396, + "step": 7401 + }, + { + "epoch": 0.4125745499136057, + "grad_norm": 0.5592759847640991, + "learning_rate": 6.507267427188197e-05, + "loss": 1.7238, + "step": 7402 + }, + { + "epoch": 0.41263028816676883, + "grad_norm": 0.5734774470329285, + "learning_rate": 6.506418401381365e-05, + "loss": 1.7004, + "step": 7403 + }, + { + "epoch": 0.412686026419932, + "grad_norm": 0.5572485327720642, + "learning_rate": 6.505569327799726e-05, + "loss": 1.5875, + "step": 7404 + }, + { + "epoch": 0.41274176467309515, + "grad_norm": 0.5783054232597351, + "learning_rate": 6.504720206470205e-05, + "loss": 1.806, + "step": 7405 + }, + { + "epoch": 0.4127975029262583, + "grad_norm": 0.5762080550193787, + "learning_rate": 6.503871037419731e-05, + "loss": 1.6241, + "step": 7406 + }, + { + "epoch": 0.41285324117942146, + "grad_norm": 0.5752031207084656, + "learning_rate": 6.50302182067524e-05, + "loss": 1.5105, + "step": 7407 + }, + { + "epoch": 0.4129089794325846, + "grad_norm": 0.5618080496788025, + "learning_rate": 6.502172556263656e-05, + "loss": 1.6661, + "step": 7408 + }, + { + "epoch": 0.4129647176857477, + "grad_norm": 0.5460039377212524, + "learning_rate": 6.501323244211919e-05, + "loss": 1.5414, + "step": 7409 + }, + { + "epoch": 0.4130204559389109, + "grad_norm": 0.5536362528800964, + "learning_rate": 6.500473884546962e-05, + "loss": 1.7312, + "step": 7410 + }, + { + "epoch": 0.41307619419207403, + "grad_norm": 0.5220944285392761, + "learning_rate": 6.499624477295722e-05, + "loss": 1.4628, + "step": 7411 + }, + { + "epoch": 0.41313193244523716, + "grad_norm": 0.5702623128890991, + "learning_rate": 6.498775022485134e-05, + "loss": 1.7568, + "step": 7412 + }, + { + "epoch": 0.4131876706984003, + "grad_norm": 0.5831007361412048, + "learning_rate": 6.497925520142143e-05, + "loss": 1.8805, + "step": 7413 + }, + { + "epoch": 0.4132434089515635, + "grad_norm": 0.5719270706176758, + "learning_rate": 6.497075970293688e-05, + "loss": 1.8406, + "step": 7414 + }, + { + "epoch": 0.4132991472047266, + "grad_norm": 0.5721832513809204, + "learning_rate": 6.496226372966711e-05, + "loss": 1.8577, + "step": 7415 + }, + { + "epoch": 0.41335488545788973, + "grad_norm": 0.5381945967674255, + "learning_rate": 6.495376728188159e-05, + "loss": 1.5441, + "step": 7416 + }, + { + "epoch": 0.4134106237110529, + "grad_norm": 0.5105479955673218, + "learning_rate": 6.494527035984974e-05, + "loss": 1.7383, + "step": 7417 + }, + { + "epoch": 0.41346636196421604, + "grad_norm": 0.5516504049301147, + "learning_rate": 6.493677296384106e-05, + "loss": 1.7542, + "step": 7418 + }, + { + "epoch": 0.4135221002173792, + "grad_norm": 0.5726693868637085, + "learning_rate": 6.492827509412501e-05, + "loss": 1.887, + "step": 7419 + }, + { + "epoch": 0.41357783847054236, + "grad_norm": 0.5425702333450317, + "learning_rate": 6.491977675097114e-05, + "loss": 1.6247, + "step": 7420 + }, + { + "epoch": 0.4136335767237055, + "grad_norm": 0.7511564493179321, + "learning_rate": 6.491127793464893e-05, + "loss": 1.7428, + "step": 7421 + }, + { + "epoch": 0.4136893149768686, + "grad_norm": 0.5151875019073486, + "learning_rate": 6.490277864542792e-05, + "loss": 1.6937, + "step": 7422 + }, + { + "epoch": 0.41374505323003175, + "grad_norm": 0.5558873414993286, + "learning_rate": 6.489427888357765e-05, + "loss": 1.7254, + "step": 7423 + }, + { + "epoch": 0.41380079148319493, + "grad_norm": 0.5704571008682251, + "learning_rate": 6.488577864936771e-05, + "loss": 1.6893, + "step": 7424 + }, + { + "epoch": 0.41385652973635806, + "grad_norm": 0.5515883564949036, + "learning_rate": 6.487727794306765e-05, + "loss": 1.5928, + "step": 7425 + }, + { + "epoch": 0.4139122679895212, + "grad_norm": 0.5346539616584778, + "learning_rate": 6.48687767649471e-05, + "loss": 1.5923, + "step": 7426 + }, + { + "epoch": 0.41396800624268437, + "grad_norm": 0.48073434829711914, + "learning_rate": 6.48602751152756e-05, + "loss": 1.5783, + "step": 7427 + }, + { + "epoch": 0.4140237444958475, + "grad_norm": 0.5613585114479065, + "learning_rate": 6.485177299432284e-05, + "loss": 1.7081, + "step": 7428 + }, + { + "epoch": 0.41407948274901063, + "grad_norm": 0.5521184206008911, + "learning_rate": 6.484327040235844e-05, + "loss": 1.8141, + "step": 7429 + }, + { + "epoch": 0.4141352210021738, + "grad_norm": 0.5570716857910156, + "learning_rate": 6.483476733965202e-05, + "loss": 1.8114, + "step": 7430 + }, + { + "epoch": 0.41419095925533694, + "grad_norm": 0.5927569270133972, + "learning_rate": 6.48262638064733e-05, + "loss": 1.8538, + "step": 7431 + }, + { + "epoch": 0.4142466975085001, + "grad_norm": 0.6198796629905701, + "learning_rate": 6.48177598030919e-05, + "loss": 1.8671, + "step": 7432 + }, + { + "epoch": 0.41430243576166326, + "grad_norm": 0.562487781047821, + "learning_rate": 6.480925532977758e-05, + "loss": 1.6247, + "step": 7433 + }, + { + "epoch": 0.4143581740148264, + "grad_norm": 0.5455536246299744, + "learning_rate": 6.480075038680002e-05, + "loss": 1.6946, + "step": 7434 + }, + { + "epoch": 0.4144139122679895, + "grad_norm": 0.6041662096977234, + "learning_rate": 6.479224497442897e-05, + "loss": 1.9345, + "step": 7435 + }, + { + "epoch": 0.41446965052115264, + "grad_norm": 0.5616452693939209, + "learning_rate": 6.478373909293412e-05, + "loss": 1.8108, + "step": 7436 + }, + { + "epoch": 0.41452538877431583, + "grad_norm": 0.5593286752700806, + "learning_rate": 6.477523274258528e-05, + "loss": 1.6404, + "step": 7437 + }, + { + "epoch": 0.41458112702747896, + "grad_norm": 0.5919610261917114, + "learning_rate": 6.47667259236522e-05, + "loss": 1.8287, + "step": 7438 + }, + { + "epoch": 0.4146368652806421, + "grad_norm": 0.6362894177436829, + "learning_rate": 6.475821863640467e-05, + "loss": 1.8535, + "step": 7439 + }, + { + "epoch": 0.41469260353380527, + "grad_norm": 0.4930521547794342, + "learning_rate": 6.474971088111248e-05, + "loss": 1.3973, + "step": 7440 + }, + { + "epoch": 0.4147483417869684, + "grad_norm": 0.5308540463447571, + "learning_rate": 6.474120265804549e-05, + "loss": 1.5271, + "step": 7441 + }, + { + "epoch": 0.41480408004013153, + "grad_norm": 0.5587360262870789, + "learning_rate": 6.473269396747346e-05, + "loss": 1.6953, + "step": 7442 + }, + { + "epoch": 0.4148598182932947, + "grad_norm": 0.5565241575241089, + "learning_rate": 6.47241848096663e-05, + "loss": 1.7807, + "step": 7443 + }, + { + "epoch": 0.41491555654645784, + "grad_norm": 0.6130486130714417, + "learning_rate": 6.471567518489383e-05, + "loss": 2.0551, + "step": 7444 + }, + { + "epoch": 0.41497129479962097, + "grad_norm": 0.5374565720558167, + "learning_rate": 6.470716509342594e-05, + "loss": 1.6525, + "step": 7445 + }, + { + "epoch": 0.4150270330527841, + "grad_norm": 0.5470364093780518, + "learning_rate": 6.469865453553254e-05, + "loss": 1.7753, + "step": 7446 + }, + { + "epoch": 0.4150827713059473, + "grad_norm": 0.5423111319541931, + "learning_rate": 6.46901435114835e-05, + "loss": 1.6718, + "step": 7447 + }, + { + "epoch": 0.4151385095591104, + "grad_norm": 0.630453884601593, + "learning_rate": 6.468163202154877e-05, + "loss": 1.7607, + "step": 7448 + }, + { + "epoch": 0.41519424781227354, + "grad_norm": 0.5870693325996399, + "learning_rate": 6.467312006599828e-05, + "loss": 1.8854, + "step": 7449 + }, + { + "epoch": 0.4152499860654367, + "grad_norm": 0.6026604771614075, + "learning_rate": 6.466460764510196e-05, + "loss": 1.6298, + "step": 7450 + }, + { + "epoch": 0.41530572431859986, + "grad_norm": 0.5341464281082153, + "learning_rate": 6.465609475912977e-05, + "loss": 1.5961, + "step": 7451 + }, + { + "epoch": 0.415361462571763, + "grad_norm": 0.5364176630973816, + "learning_rate": 6.464758140835173e-05, + "loss": 1.6091, + "step": 7452 + }, + { + "epoch": 0.41541720082492617, + "grad_norm": 0.5682061910629272, + "learning_rate": 6.463906759303779e-05, + "loss": 1.6807, + "step": 7453 + }, + { + "epoch": 0.4154729390780893, + "grad_norm": 0.5520201325416565, + "learning_rate": 6.463055331345798e-05, + "loss": 1.8693, + "step": 7454 + }, + { + "epoch": 0.4155286773312524, + "grad_norm": 0.5386977195739746, + "learning_rate": 6.462203856988233e-05, + "loss": 1.5473, + "step": 7455 + }, + { + "epoch": 0.4155844155844156, + "grad_norm": 0.5517452955245972, + "learning_rate": 6.461352336258088e-05, + "loss": 1.5523, + "step": 7456 + }, + { + "epoch": 0.41564015383757874, + "grad_norm": 0.6362208127975464, + "learning_rate": 6.460500769182365e-05, + "loss": 1.6515, + "step": 7457 + }, + { + "epoch": 0.41569589209074187, + "grad_norm": 0.5483435392379761, + "learning_rate": 6.459649155788075e-05, + "loss": 1.6962, + "step": 7458 + }, + { + "epoch": 0.415751630343905, + "grad_norm": 0.5627394914627075, + "learning_rate": 6.458797496102222e-05, + "loss": 1.5808, + "step": 7459 + }, + { + "epoch": 0.4158073685970682, + "grad_norm": 0.5749256610870361, + "learning_rate": 6.45794579015182e-05, + "loss": 1.6652, + "step": 7460 + }, + { + "epoch": 0.4158631068502313, + "grad_norm": 0.561033308506012, + "learning_rate": 6.457094037963877e-05, + "loss": 1.5447, + "step": 7461 + }, + { + "epoch": 0.41591884510339444, + "grad_norm": 0.6188123822212219, + "learning_rate": 6.456242239565405e-05, + "loss": 1.8373, + "step": 7462 + }, + { + "epoch": 0.4159745833565576, + "grad_norm": 0.5495220422744751, + "learning_rate": 6.455390394983422e-05, + "loss": 1.7338, + "step": 7463 + }, + { + "epoch": 0.41603032160972075, + "grad_norm": 0.5390871167182922, + "learning_rate": 6.454538504244938e-05, + "loss": 1.5552, + "step": 7464 + }, + { + "epoch": 0.4160860598628839, + "grad_norm": 0.5653820633888245, + "learning_rate": 6.453686567376976e-05, + "loss": 1.692, + "step": 7465 + }, + { + "epoch": 0.41614179811604707, + "grad_norm": 0.5153915286064148, + "learning_rate": 6.45283458440655e-05, + "loss": 1.6676, + "step": 7466 + }, + { + "epoch": 0.4161975363692102, + "grad_norm": 0.5695963501930237, + "learning_rate": 6.451982555360682e-05, + "loss": 1.6982, + "step": 7467 + }, + { + "epoch": 0.4162532746223733, + "grad_norm": 0.6078826785087585, + "learning_rate": 6.451130480266395e-05, + "loss": 1.762, + "step": 7468 + }, + { + "epoch": 0.41630901287553645, + "grad_norm": 0.5621688961982727, + "learning_rate": 6.450278359150708e-05, + "loss": 1.5914, + "step": 7469 + }, + { + "epoch": 0.41636475112869964, + "grad_norm": 0.5914077162742615, + "learning_rate": 6.449426192040649e-05, + "loss": 1.909, + "step": 7470 + }, + { + "epoch": 0.41642048938186277, + "grad_norm": 0.5638688802719116, + "learning_rate": 6.448573978963239e-05, + "loss": 1.8037, + "step": 7471 + }, + { + "epoch": 0.4164762276350259, + "grad_norm": 0.569990336894989, + "learning_rate": 6.44772171994551e-05, + "loss": 1.5707, + "step": 7472 + }, + { + "epoch": 0.4165319658881891, + "grad_norm": 0.5680502653121948, + "learning_rate": 6.446869415014488e-05, + "loss": 1.6062, + "step": 7473 + }, + { + "epoch": 0.4165877041413522, + "grad_norm": 0.5565951466560364, + "learning_rate": 6.446017064197205e-05, + "loss": 1.7973, + "step": 7474 + }, + { + "epoch": 0.41664344239451534, + "grad_norm": 0.5711973905563354, + "learning_rate": 6.445164667520691e-05, + "loss": 1.751, + "step": 7475 + }, + { + "epoch": 0.4166991806476785, + "grad_norm": 0.5332829356193542, + "learning_rate": 6.44431222501198e-05, + "loss": 1.5348, + "step": 7476 + }, + { + "epoch": 0.41675491890084165, + "grad_norm": 0.5311811566352844, + "learning_rate": 6.443459736698105e-05, + "loss": 1.7648, + "step": 7477 + }, + { + "epoch": 0.4168106571540048, + "grad_norm": 0.5389667749404907, + "learning_rate": 6.442607202606104e-05, + "loss": 1.5702, + "step": 7478 + }, + { + "epoch": 0.41686639540716797, + "grad_norm": 0.5450131297111511, + "learning_rate": 6.441754622763015e-05, + "loss": 1.5624, + "step": 7479 + }, + { + "epoch": 0.4169221336603311, + "grad_norm": 0.6195186376571655, + "learning_rate": 6.440901997195871e-05, + "loss": 1.8265, + "step": 7480 + }, + { + "epoch": 0.4169778719134942, + "grad_norm": 0.5652611255645752, + "learning_rate": 6.440049325931721e-05, + "loss": 1.6908, + "step": 7481 + }, + { + "epoch": 0.41703361016665735, + "grad_norm": 0.5675498843193054, + "learning_rate": 6.4391966089976e-05, + "loss": 1.8279, + "step": 7482 + }, + { + "epoch": 0.41708934841982054, + "grad_norm": 0.5133779048919678, + "learning_rate": 6.438343846420556e-05, + "loss": 1.4909, + "step": 7483 + }, + { + "epoch": 0.41714508667298367, + "grad_norm": 0.5815598964691162, + "learning_rate": 6.437491038227628e-05, + "loss": 1.6886, + "step": 7484 + }, + { + "epoch": 0.4172008249261468, + "grad_norm": 0.5756742358207703, + "learning_rate": 6.43663818444587e-05, + "loss": 1.5501, + "step": 7485 + }, + { + "epoch": 0.41725656317931, + "grad_norm": 0.5238984227180481, + "learning_rate": 6.435785285102321e-05, + "loss": 1.5227, + "step": 7486 + }, + { + "epoch": 0.4173123014324731, + "grad_norm": 0.6538522839546204, + "learning_rate": 6.434932340224036e-05, + "loss": 1.8644, + "step": 7487 + }, + { + "epoch": 0.41736803968563624, + "grad_norm": 0.5802149772644043, + "learning_rate": 6.434079349838062e-05, + "loss": 1.823, + "step": 7488 + }, + { + "epoch": 0.4174237779387994, + "grad_norm": 0.5617754459381104, + "learning_rate": 6.433226313971455e-05, + "loss": 1.6917, + "step": 7489 + }, + { + "epoch": 0.41747951619196255, + "grad_norm": 0.5967627763748169, + "learning_rate": 6.432373232651261e-05, + "loss": 1.8103, + "step": 7490 + }, + { + "epoch": 0.4175352544451257, + "grad_norm": 0.5762447714805603, + "learning_rate": 6.431520105904543e-05, + "loss": 1.6457, + "step": 7491 + }, + { + "epoch": 0.4175909926982888, + "grad_norm": 0.5717265009880066, + "learning_rate": 6.430666933758353e-05, + "loss": 1.7308, + "step": 7492 + }, + { + "epoch": 0.417646730951452, + "grad_norm": 0.5314132571220398, + "learning_rate": 6.429813716239747e-05, + "loss": 1.5346, + "step": 7493 + }, + { + "epoch": 0.4177024692046151, + "grad_norm": 0.5187550187110901, + "learning_rate": 6.42896045337579e-05, + "loss": 1.541, + "step": 7494 + }, + { + "epoch": 0.41775820745777825, + "grad_norm": 0.524467945098877, + "learning_rate": 6.428107145193535e-05, + "loss": 1.6209, + "step": 7495 + }, + { + "epoch": 0.41781394571094144, + "grad_norm": 0.5283476710319519, + "learning_rate": 6.427253791720051e-05, + "loss": 1.6333, + "step": 7496 + }, + { + "epoch": 0.41786968396410457, + "grad_norm": 0.5059264302253723, + "learning_rate": 6.426400392982396e-05, + "loss": 1.4312, + "step": 7497 + }, + { + "epoch": 0.4179254222172677, + "grad_norm": 0.5070070028305054, + "learning_rate": 6.425546949007639e-05, + "loss": 1.4918, + "step": 7498 + }, + { + "epoch": 0.4179811604704309, + "grad_norm": 0.5226110219955444, + "learning_rate": 6.424693459822842e-05, + "loss": 1.6224, + "step": 7499 + }, + { + "epoch": 0.418036898723594, + "grad_norm": 0.5620803833007812, + "learning_rate": 6.423839925455077e-05, + "loss": 1.815, + "step": 7500 + }, + { + "epoch": 0.41809263697675714, + "grad_norm": 0.5102522969245911, + "learning_rate": 6.422986345931411e-05, + "loss": 1.6608, + "step": 7501 + }, + { + "epoch": 0.4181483752299203, + "grad_norm": 0.5353087782859802, + "learning_rate": 6.422132721278915e-05, + "loss": 1.5651, + "step": 7502 + }, + { + "epoch": 0.41820411348308345, + "grad_norm": 0.6161815524101257, + "learning_rate": 6.421279051524658e-05, + "loss": 1.6941, + "step": 7503 + }, + { + "epoch": 0.4182598517362466, + "grad_norm": 0.6280367970466614, + "learning_rate": 6.420425336695719e-05, + "loss": 1.8122, + "step": 7504 + }, + { + "epoch": 0.4183155899894097, + "grad_norm": 0.5285361409187317, + "learning_rate": 6.419571576819168e-05, + "loss": 1.59, + "step": 7505 + }, + { + "epoch": 0.4183713282425729, + "grad_norm": 0.5601312518119812, + "learning_rate": 6.418717771922084e-05, + "loss": 1.6675, + "step": 7506 + }, + { + "epoch": 0.418427066495736, + "grad_norm": 0.6108425855636597, + "learning_rate": 6.417863922031544e-05, + "loss": 1.9184, + "step": 7507 + }, + { + "epoch": 0.41848280474889915, + "grad_norm": 0.5752027034759521, + "learning_rate": 6.417010027174627e-05, + "loss": 1.7789, + "step": 7508 + }, + { + "epoch": 0.41853854300206234, + "grad_norm": 0.5731359720230103, + "learning_rate": 6.416156087378415e-05, + "loss": 1.6246, + "step": 7509 + }, + { + "epoch": 0.41859428125522546, + "grad_norm": 0.5547140836715698, + "learning_rate": 6.415302102669987e-05, + "loss": 1.5967, + "step": 7510 + }, + { + "epoch": 0.4186500195083886, + "grad_norm": 0.5709370970726013, + "learning_rate": 6.414448073076429e-05, + "loss": 1.6613, + "step": 7511 + }, + { + "epoch": 0.4187057577615518, + "grad_norm": 0.5591392517089844, + "learning_rate": 6.413593998624824e-05, + "loss": 1.709, + "step": 7512 + }, + { + "epoch": 0.4187614960147149, + "grad_norm": 0.5560973286628723, + "learning_rate": 6.41273987934226e-05, + "loss": 1.6281, + "step": 7513 + }, + { + "epoch": 0.41881723426787804, + "grad_norm": 0.5822799205780029, + "learning_rate": 6.411885715255823e-05, + "loss": 1.7274, + "step": 7514 + }, + { + "epoch": 0.41887297252104116, + "grad_norm": 0.5955770611763, + "learning_rate": 6.411031506392605e-05, + "loss": 1.6704, + "step": 7515 + }, + { + "epoch": 0.41892871077420435, + "grad_norm": 0.5852923393249512, + "learning_rate": 6.410177252779692e-05, + "loss": 1.7526, + "step": 7516 + }, + { + "epoch": 0.4189844490273675, + "grad_norm": 0.5543795228004456, + "learning_rate": 6.409322954444179e-05, + "loss": 1.5793, + "step": 7517 + }, + { + "epoch": 0.4190401872805306, + "grad_norm": 0.5983227491378784, + "learning_rate": 6.408468611413159e-05, + "loss": 1.8319, + "step": 7518 + }, + { + "epoch": 0.4190959255336938, + "grad_norm": 0.5510286688804626, + "learning_rate": 6.407614223713727e-05, + "loss": 1.6506, + "step": 7519 + }, + { + "epoch": 0.4191516637868569, + "grad_norm": 0.5010602474212646, + "learning_rate": 6.40675979137298e-05, + "loss": 1.5807, + "step": 7520 + }, + { + "epoch": 0.41920740204002005, + "grad_norm": 0.5825363397598267, + "learning_rate": 6.405905314418013e-05, + "loss": 1.6839, + "step": 7521 + }, + { + "epoch": 0.41926314029318323, + "grad_norm": 0.5282953977584839, + "learning_rate": 6.405050792875926e-05, + "loss": 1.5602, + "step": 7522 + }, + { + "epoch": 0.41931887854634636, + "grad_norm": 0.5378554463386536, + "learning_rate": 6.40419622677382e-05, + "loss": 1.5204, + "step": 7523 + }, + { + "epoch": 0.4193746167995095, + "grad_norm": 0.548743486404419, + "learning_rate": 6.403341616138797e-05, + "loss": 1.7654, + "step": 7524 + }, + { + "epoch": 0.4194303550526727, + "grad_norm": 0.5437180399894714, + "learning_rate": 6.40248696099796e-05, + "loss": 1.7341, + "step": 7525 + }, + { + "epoch": 0.4194860933058358, + "grad_norm": 0.7081752419471741, + "learning_rate": 6.401632261378414e-05, + "loss": 1.3932, + "step": 7526 + }, + { + "epoch": 0.41954183155899893, + "grad_norm": 0.6215348243713379, + "learning_rate": 6.400777517307265e-05, + "loss": 1.9211, + "step": 7527 + }, + { + "epoch": 0.41959756981216206, + "grad_norm": 0.5972661375999451, + "learning_rate": 6.39992272881162e-05, + "loss": 1.848, + "step": 7528 + }, + { + "epoch": 0.41965330806532525, + "grad_norm": 0.5357066988945007, + "learning_rate": 6.399067895918587e-05, + "loss": 1.6233, + "step": 7529 + }, + { + "epoch": 0.4197090463184884, + "grad_norm": 0.5154542922973633, + "learning_rate": 6.39821301865528e-05, + "loss": 1.578, + "step": 7530 + }, + { + "epoch": 0.4197647845716515, + "grad_norm": 0.524694561958313, + "learning_rate": 6.397358097048806e-05, + "loss": 1.6923, + "step": 7531 + }, + { + "epoch": 0.4198205228248147, + "grad_norm": 0.5902459025382996, + "learning_rate": 6.39650313112628e-05, + "loss": 1.7314, + "step": 7532 + }, + { + "epoch": 0.4198762610779778, + "grad_norm": 0.5320487022399902, + "learning_rate": 6.39564812091482e-05, + "loss": 1.6396, + "step": 7533 + }, + { + "epoch": 0.41993199933114095, + "grad_norm": 0.5881032943725586, + "learning_rate": 6.394793066441534e-05, + "loss": 1.8865, + "step": 7534 + }, + { + "epoch": 0.41998773758430413, + "grad_norm": 0.5616896748542786, + "learning_rate": 6.393937967733548e-05, + "loss": 1.8735, + "step": 7535 + }, + { + "epoch": 0.42004347583746726, + "grad_norm": 0.5341779589653015, + "learning_rate": 6.393082824817974e-05, + "loss": 1.635, + "step": 7536 + }, + { + "epoch": 0.4200992140906304, + "grad_norm": 0.5636286735534668, + "learning_rate": 6.392227637721937e-05, + "loss": 1.797, + "step": 7537 + }, + { + "epoch": 0.4201549523437935, + "grad_norm": 0.5334611535072327, + "learning_rate": 6.391372406472557e-05, + "loss": 1.6705, + "step": 7538 + }, + { + "epoch": 0.4202106905969567, + "grad_norm": 0.588848888874054, + "learning_rate": 6.390517131096955e-05, + "loss": 1.7877, + "step": 7539 + }, + { + "epoch": 0.42026642885011983, + "grad_norm": 0.5427910685539246, + "learning_rate": 6.389661811622258e-05, + "loss": 1.5672, + "step": 7540 + }, + { + "epoch": 0.42032216710328296, + "grad_norm": 0.6046989560127258, + "learning_rate": 6.388806448075591e-05, + "loss": 1.8186, + "step": 7541 + }, + { + "epoch": 0.42037790535644615, + "grad_norm": 0.5373850464820862, + "learning_rate": 6.38795104048408e-05, + "loss": 1.5539, + "step": 7542 + }, + { + "epoch": 0.4204336436096093, + "grad_norm": 0.5726231336593628, + "learning_rate": 6.387095588874854e-05, + "loss": 1.6383, + "step": 7543 + }, + { + "epoch": 0.4204893818627724, + "grad_norm": 0.5964796543121338, + "learning_rate": 6.386240093275044e-05, + "loss": 1.9338, + "step": 7544 + }, + { + "epoch": 0.4205451201159356, + "grad_norm": 0.5379793047904968, + "learning_rate": 6.385384553711779e-05, + "loss": 1.5479, + "step": 7545 + }, + { + "epoch": 0.4206008583690987, + "grad_norm": 0.5321194529533386, + "learning_rate": 6.384528970212196e-05, + "loss": 1.6119, + "step": 7546 + }, + { + "epoch": 0.42065659662226185, + "grad_norm": 0.6583168506622314, + "learning_rate": 6.383673342803424e-05, + "loss": 1.7555, + "step": 7547 + }, + { + "epoch": 0.42071233487542503, + "grad_norm": 0.5755535364151001, + "learning_rate": 6.382817671512603e-05, + "loss": 1.629, + "step": 7548 + }, + { + "epoch": 0.42076807312858816, + "grad_norm": 0.614747941493988, + "learning_rate": 6.381961956366865e-05, + "loss": 2.0066, + "step": 7549 + }, + { + "epoch": 0.4208238113817513, + "grad_norm": 0.5643095374107361, + "learning_rate": 6.381106197393353e-05, + "loss": 1.7497, + "step": 7550 + }, + { + "epoch": 0.4208795496349144, + "grad_norm": 0.5332757234573364, + "learning_rate": 6.380250394619205e-05, + "loss": 1.4505, + "step": 7551 + }, + { + "epoch": 0.4209352878880776, + "grad_norm": 0.5462849736213684, + "learning_rate": 6.379394548071563e-05, + "loss": 1.7164, + "step": 7552 + }, + { + "epoch": 0.42099102614124073, + "grad_norm": 0.5277321338653564, + "learning_rate": 6.378538657777565e-05, + "loss": 1.4521, + "step": 7553 + }, + { + "epoch": 0.42104676439440386, + "grad_norm": 0.5687193274497986, + "learning_rate": 6.37768272376436e-05, + "loss": 1.6832, + "step": 7554 + }, + { + "epoch": 0.42110250264756705, + "grad_norm": 0.5538173913955688, + "learning_rate": 6.376826746059092e-05, + "loss": 1.5916, + "step": 7555 + }, + { + "epoch": 0.4211582409007302, + "grad_norm": 0.5794023871421814, + "learning_rate": 6.375970724688906e-05, + "loss": 1.5985, + "step": 7556 + }, + { + "epoch": 0.4212139791538933, + "grad_norm": 0.534807026386261, + "learning_rate": 6.375114659680951e-05, + "loss": 1.5822, + "step": 7557 + }, + { + "epoch": 0.4212697174070565, + "grad_norm": 0.5474613308906555, + "learning_rate": 6.374258551062378e-05, + "loss": 1.7155, + "step": 7558 + }, + { + "epoch": 0.4213254556602196, + "grad_norm": 0.558594286441803, + "learning_rate": 6.373402398860336e-05, + "loss": 1.7239, + "step": 7559 + }, + { + "epoch": 0.42138119391338275, + "grad_norm": 0.6263135671615601, + "learning_rate": 6.372546203101977e-05, + "loss": 1.8782, + "step": 7560 + }, + { + "epoch": 0.4214369321665459, + "grad_norm": 0.5759534239768982, + "learning_rate": 6.371689963814455e-05, + "loss": 1.798, + "step": 7561 + }, + { + "epoch": 0.42149267041970906, + "grad_norm": 0.582333505153656, + "learning_rate": 6.370833681024924e-05, + "loss": 1.679, + "step": 7562 + }, + { + "epoch": 0.4215484086728722, + "grad_norm": 0.5175591707229614, + "learning_rate": 6.369977354760541e-05, + "loss": 1.6172, + "step": 7563 + }, + { + "epoch": 0.4216041469260353, + "grad_norm": 0.6253464818000793, + "learning_rate": 6.369120985048464e-05, + "loss": 1.8897, + "step": 7564 + }, + { + "epoch": 0.4216598851791985, + "grad_norm": 0.6171419024467468, + "learning_rate": 6.368264571915854e-05, + "loss": 1.9296, + "step": 7565 + }, + { + "epoch": 0.42171562343236163, + "grad_norm": 0.5854969620704651, + "learning_rate": 6.367408115389868e-05, + "loss": 1.8127, + "step": 7566 + }, + { + "epoch": 0.42177136168552476, + "grad_norm": 0.5167074203491211, + "learning_rate": 6.366551615497669e-05, + "loss": 1.4419, + "step": 7567 + }, + { + "epoch": 0.42182709993868794, + "grad_norm": 0.5605902075767517, + "learning_rate": 6.36569507226642e-05, + "loss": 1.5106, + "step": 7568 + }, + { + "epoch": 0.4218828381918511, + "grad_norm": 0.5542864799499512, + "learning_rate": 6.364838485723286e-05, + "loss": 1.6104, + "step": 7569 + }, + { + "epoch": 0.4219385764450142, + "grad_norm": 0.5589380860328674, + "learning_rate": 6.363981855895433e-05, + "loss": 1.8112, + "step": 7570 + }, + { + "epoch": 0.4219943146981774, + "grad_norm": 0.5342586040496826, + "learning_rate": 6.363125182810028e-05, + "loss": 1.668, + "step": 7571 + }, + { + "epoch": 0.4220500529513405, + "grad_norm": 0.5474408268928528, + "learning_rate": 6.36226846649424e-05, + "loss": 1.477, + "step": 7572 + }, + { + "epoch": 0.42210579120450364, + "grad_norm": 0.549768328666687, + "learning_rate": 6.361411706975237e-05, + "loss": 1.6127, + "step": 7573 + }, + { + "epoch": 0.4221615294576668, + "grad_norm": 0.5820984244346619, + "learning_rate": 6.360554904280196e-05, + "loss": 1.7687, + "step": 7574 + }, + { + "epoch": 0.42221726771082996, + "grad_norm": 0.5574761033058167, + "learning_rate": 6.359698058436282e-05, + "loss": 1.7282, + "step": 7575 + }, + { + "epoch": 0.4222730059639931, + "grad_norm": 0.5506951808929443, + "learning_rate": 6.358841169470676e-05, + "loss": 1.6214, + "step": 7576 + }, + { + "epoch": 0.4223287442171562, + "grad_norm": 0.5659124851226807, + "learning_rate": 6.35798423741055e-05, + "loss": 1.6966, + "step": 7577 + }, + { + "epoch": 0.4223844824703194, + "grad_norm": 0.5484572052955627, + "learning_rate": 6.357127262283081e-05, + "loss": 1.6683, + "step": 7578 + }, + { + "epoch": 0.42244022072348253, + "grad_norm": 0.4761580526828766, + "learning_rate": 6.356270244115448e-05, + "loss": 1.3579, + "step": 7579 + }, + { + "epoch": 0.42249595897664566, + "grad_norm": 0.5656337738037109, + "learning_rate": 6.355413182934831e-05, + "loss": 1.7506, + "step": 7580 + }, + { + "epoch": 0.42255169722980884, + "grad_norm": 0.6253755688667297, + "learning_rate": 6.35455607876841e-05, + "loss": 1.5443, + "step": 7581 + }, + { + "epoch": 0.42260743548297197, + "grad_norm": 0.5522517561912537, + "learning_rate": 6.353698931643368e-05, + "loss": 1.7318, + "step": 7582 + }, + { + "epoch": 0.4226631737361351, + "grad_norm": 0.5824682712554932, + "learning_rate": 6.352841741586888e-05, + "loss": 1.9499, + "step": 7583 + }, + { + "epoch": 0.42271891198929823, + "grad_norm": 0.6166448593139648, + "learning_rate": 6.351984508626155e-05, + "loss": 1.6598, + "step": 7584 + }, + { + "epoch": 0.4227746502424614, + "grad_norm": 0.6640730500221252, + "learning_rate": 6.351127232788357e-05, + "loss": 1.9022, + "step": 7585 + }, + { + "epoch": 0.42283038849562454, + "grad_norm": 0.5395544171333313, + "learning_rate": 6.350269914100681e-05, + "loss": 1.8523, + "step": 7586 + }, + { + "epoch": 0.42288612674878767, + "grad_norm": 0.597951352596283, + "learning_rate": 6.349412552590317e-05, + "loss": 1.7423, + "step": 7587 + }, + { + "epoch": 0.42294186500195086, + "grad_norm": 0.5310340523719788, + "learning_rate": 6.348555148284452e-05, + "loss": 1.6669, + "step": 7588 + }, + { + "epoch": 0.422997603255114, + "grad_norm": 0.563275933265686, + "learning_rate": 6.347697701210281e-05, + "loss": 1.8138, + "step": 7589 + }, + { + "epoch": 0.4230533415082771, + "grad_norm": 0.5225051641464233, + "learning_rate": 6.346840211394998e-05, + "loss": 1.5228, + "step": 7590 + }, + { + "epoch": 0.4231090797614403, + "grad_norm": 0.5949013233184814, + "learning_rate": 6.345982678865795e-05, + "loss": 1.8378, + "step": 7591 + }, + { + "epoch": 0.4231648180146034, + "grad_norm": 0.6444050073623657, + "learning_rate": 6.345125103649869e-05, + "loss": 1.9561, + "step": 7592 + }, + { + "epoch": 0.42322055626776656, + "grad_norm": 0.538077712059021, + "learning_rate": 6.344267485774417e-05, + "loss": 1.6172, + "step": 7593 + }, + { + "epoch": 0.42327629452092974, + "grad_norm": 0.5770418047904968, + "learning_rate": 6.34340982526664e-05, + "loss": 1.7064, + "step": 7594 + }, + { + "epoch": 0.42333203277409287, + "grad_norm": 0.5491243600845337, + "learning_rate": 6.342552122153734e-05, + "loss": 1.5869, + "step": 7595 + }, + { + "epoch": 0.423387771027256, + "grad_norm": 0.5911741852760315, + "learning_rate": 6.3416943764629e-05, + "loss": 1.4539, + "step": 7596 + }, + { + "epoch": 0.42344350928041913, + "grad_norm": 0.5493375062942505, + "learning_rate": 6.340836588221347e-05, + "loss": 1.2324, + "step": 7597 + }, + { + "epoch": 0.4234992475335823, + "grad_norm": 0.5272154808044434, + "learning_rate": 6.339978757456274e-05, + "loss": 1.7336, + "step": 7598 + }, + { + "epoch": 0.42355498578674544, + "grad_norm": 0.6132648587226868, + "learning_rate": 6.339120884194886e-05, + "loss": 1.8399, + "step": 7599 + }, + { + "epoch": 0.42361072403990857, + "grad_norm": 0.6002299189567566, + "learning_rate": 6.338262968464394e-05, + "loss": 1.7355, + "step": 7600 + }, + { + "epoch": 0.42366646229307175, + "grad_norm": 0.5747309327125549, + "learning_rate": 6.337405010292e-05, + "loss": 1.5466, + "step": 7601 + }, + { + "epoch": 0.4237222005462349, + "grad_norm": 0.6044133901596069, + "learning_rate": 6.336547009704919e-05, + "loss": 1.894, + "step": 7602 + }, + { + "epoch": 0.423777938799398, + "grad_norm": 0.6029581427574158, + "learning_rate": 6.335688966730358e-05, + "loss": 1.7874, + "step": 7603 + }, + { + "epoch": 0.4238336770525612, + "grad_norm": 0.5374162197113037, + "learning_rate": 6.334830881395533e-05, + "loss": 1.4537, + "step": 7604 + }, + { + "epoch": 0.4238894153057243, + "grad_norm": 0.5794885158538818, + "learning_rate": 6.333972753727653e-05, + "loss": 1.6731, + "step": 7605 + }, + { + "epoch": 0.42394515355888746, + "grad_norm": 0.6136147379875183, + "learning_rate": 6.333114583753936e-05, + "loss": 2.0005, + "step": 7606 + }, + { + "epoch": 0.4240008918120506, + "grad_norm": 0.6465775370597839, + "learning_rate": 6.332256371501597e-05, + "loss": 1.7024, + "step": 7607 + }, + { + "epoch": 0.42405663006521377, + "grad_norm": 0.4953748285770416, + "learning_rate": 6.331398116997851e-05, + "loss": 1.4046, + "step": 7608 + }, + { + "epoch": 0.4241123683183769, + "grad_norm": 0.5147947669029236, + "learning_rate": 6.330539820269921e-05, + "loss": 1.7066, + "step": 7609 + }, + { + "epoch": 0.42416810657154, + "grad_norm": 0.5854727029800415, + "learning_rate": 6.329681481345026e-05, + "loss": 1.7871, + "step": 7610 + }, + { + "epoch": 0.4242238448247032, + "grad_norm": 0.5421152710914612, + "learning_rate": 6.328823100250386e-05, + "loss": 1.6782, + "step": 7611 + }, + { + "epoch": 0.42427958307786634, + "grad_norm": 0.5201201438903809, + "learning_rate": 6.327964677013224e-05, + "loss": 1.6405, + "step": 7612 + }, + { + "epoch": 0.42433532133102947, + "grad_norm": 0.5656992197036743, + "learning_rate": 6.327106211660769e-05, + "loss": 1.798, + "step": 7613 + }, + { + "epoch": 0.42439105958419265, + "grad_norm": 0.5751951336860657, + "learning_rate": 6.326247704220239e-05, + "loss": 1.6055, + "step": 7614 + }, + { + "epoch": 0.4244467978373558, + "grad_norm": 0.546371579170227, + "learning_rate": 6.325389154718865e-05, + "loss": 1.7596, + "step": 7615 + }, + { + "epoch": 0.4245025360905189, + "grad_norm": 0.5406731367111206, + "learning_rate": 6.324530563183875e-05, + "loss": 1.6401, + "step": 7616 + }, + { + "epoch": 0.4245582743436821, + "grad_norm": 0.5809882879257202, + "learning_rate": 6.323671929642498e-05, + "loss": 1.868, + "step": 7617 + }, + { + "epoch": 0.4246140125968452, + "grad_norm": 0.540643572807312, + "learning_rate": 6.322813254121964e-05, + "loss": 1.715, + "step": 7618 + }, + { + "epoch": 0.42466975085000835, + "grad_norm": 0.5267550945281982, + "learning_rate": 6.321954536649508e-05, + "loss": 1.5837, + "step": 7619 + }, + { + "epoch": 0.4247254891031715, + "grad_norm": 0.5602602958679199, + "learning_rate": 6.32109577725236e-05, + "loss": 1.7406, + "step": 7620 + }, + { + "epoch": 0.42478122735633467, + "grad_norm": 0.5607280731201172, + "learning_rate": 6.320236975957757e-05, + "loss": 1.6099, + "step": 7621 + }, + { + "epoch": 0.4248369656094978, + "grad_norm": 0.5364249348640442, + "learning_rate": 6.319378132792935e-05, + "loss": 1.5277, + "step": 7622 + }, + { + "epoch": 0.4248927038626609, + "grad_norm": 0.5527327656745911, + "learning_rate": 6.318519247785131e-05, + "loss": 1.7702, + "step": 7623 + }, + { + "epoch": 0.4249484421158241, + "grad_norm": 0.5770801901817322, + "learning_rate": 6.317660320961585e-05, + "loss": 1.6098, + "step": 7624 + }, + { + "epoch": 0.42500418036898724, + "grad_norm": 0.5606113076210022, + "learning_rate": 6.316801352349534e-05, + "loss": 1.6451, + "step": 7625 + }, + { + "epoch": 0.42505991862215037, + "grad_norm": 0.6124593615531921, + "learning_rate": 6.315942341976223e-05, + "loss": 1.9987, + "step": 7626 + }, + { + "epoch": 0.42511565687531355, + "grad_norm": 0.5524605512619019, + "learning_rate": 6.315083289868892e-05, + "loss": 1.6352, + "step": 7627 + }, + { + "epoch": 0.4251713951284767, + "grad_norm": 0.5734837651252747, + "learning_rate": 6.314224196054787e-05, + "loss": 1.8757, + "step": 7628 + }, + { + "epoch": 0.4252271333816398, + "grad_norm": 0.64513099193573, + "learning_rate": 6.313365060561153e-05, + "loss": 2.0665, + "step": 7629 + }, + { + "epoch": 0.42528287163480294, + "grad_norm": 0.5457690954208374, + "learning_rate": 6.312505883415238e-05, + "loss": 1.6602, + "step": 7630 + }, + { + "epoch": 0.4253386098879661, + "grad_norm": 0.6007886528968811, + "learning_rate": 6.311646664644288e-05, + "loss": 1.7241, + "step": 7631 + }, + { + "epoch": 0.42539434814112925, + "grad_norm": 0.5715931057929993, + "learning_rate": 6.310787404275553e-05, + "loss": 1.7581, + "step": 7632 + }, + { + "epoch": 0.4254500863942924, + "grad_norm": 0.5710930228233337, + "learning_rate": 6.309928102336284e-05, + "loss": 1.7147, + "step": 7633 + }, + { + "epoch": 0.42550582464745557, + "grad_norm": 0.5583118796348572, + "learning_rate": 6.309068758853732e-05, + "loss": 1.6103, + "step": 7634 + }, + { + "epoch": 0.4255615629006187, + "grad_norm": 0.5537952184677124, + "learning_rate": 6.308209373855154e-05, + "loss": 1.6947, + "step": 7635 + }, + { + "epoch": 0.4256173011537818, + "grad_norm": 0.5451967716217041, + "learning_rate": 6.3073499473678e-05, + "loss": 1.6384, + "step": 7636 + }, + { + "epoch": 0.425673039406945, + "grad_norm": 0.5317254066467285, + "learning_rate": 6.30649047941893e-05, + "loss": 1.5643, + "step": 7637 + }, + { + "epoch": 0.42572877766010814, + "grad_norm": 0.5423393845558167, + "learning_rate": 6.305630970035796e-05, + "loss": 1.5257, + "step": 7638 + }, + { + "epoch": 0.42578451591327127, + "grad_norm": 0.5897427797317505, + "learning_rate": 6.304771419245663e-05, + "loss": 1.8738, + "step": 7639 + }, + { + "epoch": 0.42584025416643445, + "grad_norm": 0.5559675097465515, + "learning_rate": 6.303911827075786e-05, + "loss": 1.8562, + "step": 7640 + }, + { + "epoch": 0.4258959924195976, + "grad_norm": 0.5857858061790466, + "learning_rate": 6.303052193553429e-05, + "loss": 1.7146, + "step": 7641 + }, + { + "epoch": 0.4259517306727607, + "grad_norm": 0.6495271325111389, + "learning_rate": 6.302192518705853e-05, + "loss": 1.7639, + "step": 7642 + }, + { + "epoch": 0.42600746892592384, + "grad_norm": 0.5638108253479004, + "learning_rate": 6.301332802560325e-05, + "loss": 1.5804, + "step": 7643 + }, + { + "epoch": 0.426063207179087, + "grad_norm": 0.5066633224487305, + "learning_rate": 6.300473045144107e-05, + "loss": 1.4344, + "step": 7644 + }, + { + "epoch": 0.42611894543225015, + "grad_norm": 0.5637665390968323, + "learning_rate": 6.299613246484464e-05, + "loss": 1.6573, + "step": 7645 + }, + { + "epoch": 0.4261746836854133, + "grad_norm": 0.5206940174102783, + "learning_rate": 6.298753406608668e-05, + "loss": 1.5995, + "step": 7646 + }, + { + "epoch": 0.42623042193857646, + "grad_norm": 0.5374553799629211, + "learning_rate": 6.297893525543986e-05, + "loss": 1.7107, + "step": 7647 + }, + { + "epoch": 0.4262861601917396, + "grad_norm": 0.5552041530609131, + "learning_rate": 6.297033603317689e-05, + "loss": 1.6734, + "step": 7648 + }, + { + "epoch": 0.4263418984449027, + "grad_norm": 0.5269225239753723, + "learning_rate": 6.296173639957045e-05, + "loss": 1.64, + "step": 7649 + }, + { + "epoch": 0.4263976366980659, + "grad_norm": 0.5553382635116577, + "learning_rate": 6.295313635489335e-05, + "loss": 1.3837, + "step": 7650 + }, + { + "epoch": 0.42645337495122904, + "grad_norm": 0.5205674171447754, + "learning_rate": 6.294453589941826e-05, + "loss": 1.6142, + "step": 7651 + }, + { + "epoch": 0.42650911320439217, + "grad_norm": 0.6198689937591553, + "learning_rate": 6.2935935033418e-05, + "loss": 1.7297, + "step": 7652 + }, + { + "epoch": 0.4265648514575553, + "grad_norm": 0.556909441947937, + "learning_rate": 6.292733375716526e-05, + "loss": 1.7119, + "step": 7653 + }, + { + "epoch": 0.4266205897107185, + "grad_norm": 0.5496246218681335, + "learning_rate": 6.291873207093287e-05, + "loss": 1.6478, + "step": 7654 + }, + { + "epoch": 0.4266763279638816, + "grad_norm": 0.5758047103881836, + "learning_rate": 6.291012997499362e-05, + "loss": 1.8439, + "step": 7655 + }, + { + "epoch": 0.42673206621704474, + "grad_norm": 0.5833730697631836, + "learning_rate": 6.290152746962034e-05, + "loss": 1.6251, + "step": 7656 + }, + { + "epoch": 0.4267878044702079, + "grad_norm": 0.509559690952301, + "learning_rate": 6.289292455508582e-05, + "loss": 1.6364, + "step": 7657 + }, + { + "epoch": 0.42684354272337105, + "grad_norm": 0.5244433879852295, + "learning_rate": 6.28843212316629e-05, + "loss": 1.4855, + "step": 7658 + }, + { + "epoch": 0.4268992809765342, + "grad_norm": 0.5262942314147949, + "learning_rate": 6.287571749962444e-05, + "loss": 1.6034, + "step": 7659 + }, + { + "epoch": 0.42695501922969736, + "grad_norm": 0.592850923538208, + "learning_rate": 6.286711335924326e-05, + "loss": 2.0333, + "step": 7660 + }, + { + "epoch": 0.4270107574828605, + "grad_norm": 0.5585233569145203, + "learning_rate": 6.28585088107923e-05, + "loss": 1.7037, + "step": 7661 + }, + { + "epoch": 0.4270664957360236, + "grad_norm": 0.5201496481895447, + "learning_rate": 6.284990385454439e-05, + "loss": 1.5226, + "step": 7662 + }, + { + "epoch": 0.4271222339891868, + "grad_norm": 0.5410779714584351, + "learning_rate": 6.284129849077247e-05, + "loss": 1.6186, + "step": 7663 + }, + { + "epoch": 0.42717797224234993, + "grad_norm": 0.5643417835235596, + "learning_rate": 6.283269271974941e-05, + "loss": 1.7211, + "step": 7664 + }, + { + "epoch": 0.42723371049551306, + "grad_norm": 0.5603637099266052, + "learning_rate": 6.282408654174818e-05, + "loss": 1.6978, + "step": 7665 + }, + { + "epoch": 0.4272894487486762, + "grad_norm": 0.5303884744644165, + "learning_rate": 6.281547995704168e-05, + "loss": 1.5544, + "step": 7666 + }, + { + "epoch": 0.4273451870018394, + "grad_norm": 0.5895907282829285, + "learning_rate": 6.280687296590287e-05, + "loss": 1.697, + "step": 7667 + }, + { + "epoch": 0.4274009252550025, + "grad_norm": 0.566055953502655, + "learning_rate": 6.279826556860472e-05, + "loss": 1.6596, + "step": 7668 + }, + { + "epoch": 0.42745666350816564, + "grad_norm": 0.5401179790496826, + "learning_rate": 6.278965776542021e-05, + "loss": 1.7029, + "step": 7669 + }, + { + "epoch": 0.4275124017613288, + "grad_norm": 0.6178464889526367, + "learning_rate": 6.278104955662234e-05, + "loss": 1.7344, + "step": 7670 + }, + { + "epoch": 0.42756814001449195, + "grad_norm": 0.5440572500228882, + "learning_rate": 6.277244094248407e-05, + "loss": 1.7182, + "step": 7671 + }, + { + "epoch": 0.4276238782676551, + "grad_norm": 0.5953531265258789, + "learning_rate": 6.276383192327846e-05, + "loss": 1.7045, + "step": 7672 + }, + { + "epoch": 0.42767961652081826, + "grad_norm": 0.5182901620864868, + "learning_rate": 6.27552224992785e-05, + "loss": 1.5657, + "step": 7673 + }, + { + "epoch": 0.4277353547739814, + "grad_norm": 0.5608685612678528, + "learning_rate": 6.274661267075728e-05, + "loss": 1.701, + "step": 7674 + }, + { + "epoch": 0.4277910930271445, + "grad_norm": 0.5933842658996582, + "learning_rate": 6.27380024379878e-05, + "loss": 1.804, + "step": 7675 + }, + { + "epoch": 0.42784683128030765, + "grad_norm": 0.586521327495575, + "learning_rate": 6.272939180124317e-05, + "loss": 1.7744, + "step": 7676 + }, + { + "epoch": 0.42790256953347083, + "grad_norm": 0.6096509695053101, + "learning_rate": 6.272078076079644e-05, + "loss": 1.9837, + "step": 7677 + }, + { + "epoch": 0.42795830778663396, + "grad_norm": 0.6212565302848816, + "learning_rate": 6.27121693169207e-05, + "loss": 1.8042, + "step": 7678 + }, + { + "epoch": 0.4280140460397971, + "grad_norm": 0.5542432069778442, + "learning_rate": 6.270355746988908e-05, + "loss": 1.6222, + "step": 7679 + }, + { + "epoch": 0.4280697842929603, + "grad_norm": 0.5913196802139282, + "learning_rate": 6.269494521997467e-05, + "loss": 1.6313, + "step": 7680 + }, + { + "epoch": 0.4281255225461234, + "grad_norm": 0.5573778748512268, + "learning_rate": 6.268633256745063e-05, + "loss": 1.7364, + "step": 7681 + }, + { + "epoch": 0.42818126079928653, + "grad_norm": 0.5151004195213318, + "learning_rate": 6.267771951259009e-05, + "loss": 1.8938, + "step": 7682 + }, + { + "epoch": 0.4282369990524497, + "grad_norm": 0.5424497127532959, + "learning_rate": 6.26691060556662e-05, + "loss": 1.706, + "step": 7683 + }, + { + "epoch": 0.42829273730561285, + "grad_norm": 0.5353766083717346, + "learning_rate": 6.266049219695211e-05, + "loss": 1.6015, + "step": 7684 + }, + { + "epoch": 0.428348475558776, + "grad_norm": 0.5848101377487183, + "learning_rate": 6.265187793672105e-05, + "loss": 1.9252, + "step": 7685 + }, + { + "epoch": 0.42840421381193916, + "grad_norm": 0.5816083550453186, + "learning_rate": 6.264326327524617e-05, + "loss": 1.7076, + "step": 7686 + }, + { + "epoch": 0.4284599520651023, + "grad_norm": 0.595378577709198, + "learning_rate": 6.263464821280071e-05, + "loss": 1.8343, + "step": 7687 + }, + { + "epoch": 0.4285156903182654, + "grad_norm": 0.5391969084739685, + "learning_rate": 6.262603274965786e-05, + "loss": 1.5771, + "step": 7688 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 0.5316036939620972, + "learning_rate": 6.261741688609087e-05, + "loss": 1.6646, + "step": 7689 + }, + { + "epoch": 0.42862716682459173, + "grad_norm": 0.5671446323394775, + "learning_rate": 6.260880062237299e-05, + "loss": 1.8235, + "step": 7690 + }, + { + "epoch": 0.42868290507775486, + "grad_norm": 0.5752628445625305, + "learning_rate": 6.260018395877747e-05, + "loss": 1.7776, + "step": 7691 + }, + { + "epoch": 0.428738643330918, + "grad_norm": 0.5416520833969116, + "learning_rate": 6.259156689557757e-05, + "loss": 1.5817, + "step": 7692 + }, + { + "epoch": 0.4287943815840812, + "grad_norm": 0.5795433521270752, + "learning_rate": 6.258294943304656e-05, + "loss": 1.6236, + "step": 7693 + }, + { + "epoch": 0.4288501198372443, + "grad_norm": 0.5906192064285278, + "learning_rate": 6.257433157145779e-05, + "loss": 1.8114, + "step": 7694 + }, + { + "epoch": 0.42890585809040743, + "grad_norm": 0.589847207069397, + "learning_rate": 6.256571331108454e-05, + "loss": 1.7796, + "step": 7695 + }, + { + "epoch": 0.4289615963435706, + "grad_norm": 0.5236275792121887, + "learning_rate": 6.25570946522001e-05, + "loss": 1.4089, + "step": 7696 + }, + { + "epoch": 0.42901733459673375, + "grad_norm": 0.5735291838645935, + "learning_rate": 6.254847559507783e-05, + "loss": 1.8332, + "step": 7697 + }, + { + "epoch": 0.4290730728498969, + "grad_norm": 0.5835584998130798, + "learning_rate": 6.253985613999111e-05, + "loss": 1.7905, + "step": 7698 + }, + { + "epoch": 0.42912881110306, + "grad_norm": 0.5706406831741333, + "learning_rate": 6.253123628721324e-05, + "loss": 1.7185, + "step": 7699 + }, + { + "epoch": 0.4291845493562232, + "grad_norm": 0.6053869724273682, + "learning_rate": 6.252261603701762e-05, + "loss": 1.6092, + "step": 7700 + }, + { + "epoch": 0.4292402876093863, + "grad_norm": 0.559517502784729, + "learning_rate": 6.251399538967764e-05, + "loss": 1.6353, + "step": 7701 + }, + { + "epoch": 0.42929602586254945, + "grad_norm": 0.5170453190803528, + "learning_rate": 6.250537434546668e-05, + "loss": 1.5933, + "step": 7702 + }, + { + "epoch": 0.42935176411571263, + "grad_norm": 0.5452066659927368, + "learning_rate": 6.249675290465817e-05, + "loss": 1.5875, + "step": 7703 + }, + { + "epoch": 0.42940750236887576, + "grad_norm": 0.5306586623191833, + "learning_rate": 6.248813106752551e-05, + "loss": 1.4277, + "step": 7704 + }, + { + "epoch": 0.4294632406220389, + "grad_norm": 0.601926863193512, + "learning_rate": 6.247950883434214e-05, + "loss": 1.667, + "step": 7705 + }, + { + "epoch": 0.4295189788752021, + "grad_norm": 0.6103541254997253, + "learning_rate": 6.24708862053815e-05, + "loss": 1.6387, + "step": 7706 + }, + { + "epoch": 0.4295747171283652, + "grad_norm": 0.5850464701652527, + "learning_rate": 6.246226318091708e-05, + "loss": 1.5703, + "step": 7707 + }, + { + "epoch": 0.42963045538152833, + "grad_norm": 0.564311683177948, + "learning_rate": 6.245363976122232e-05, + "loss": 1.4084, + "step": 7708 + }, + { + "epoch": 0.4296861936346915, + "grad_norm": 0.5692956447601318, + "learning_rate": 6.244501594657073e-05, + "loss": 1.6056, + "step": 7709 + }, + { + "epoch": 0.42974193188785464, + "grad_norm": 0.48438626527786255, + "learning_rate": 6.243639173723577e-05, + "loss": 1.3122, + "step": 7710 + }, + { + "epoch": 0.4297976701410178, + "grad_norm": 0.5293724536895752, + "learning_rate": 6.2427767133491e-05, + "loss": 1.5922, + "step": 7711 + }, + { + "epoch": 0.4298534083941809, + "grad_norm": 0.5632352232933044, + "learning_rate": 6.241914213560988e-05, + "loss": 1.7423, + "step": 7712 + }, + { + "epoch": 0.4299091466473441, + "grad_norm": 0.5172026753425598, + "learning_rate": 6.241051674386602e-05, + "loss": 1.4298, + "step": 7713 + }, + { + "epoch": 0.4299648849005072, + "grad_norm": 0.5803625583648682, + "learning_rate": 6.24018909585329e-05, + "loss": 1.6772, + "step": 7714 + }, + { + "epoch": 0.43002062315367034, + "grad_norm": 0.530988335609436, + "learning_rate": 6.239326477988413e-05, + "loss": 1.7007, + "step": 7715 + }, + { + "epoch": 0.43007636140683353, + "grad_norm": 0.5132483243942261, + "learning_rate": 6.238463820819325e-05, + "loss": 1.5829, + "step": 7716 + }, + { + "epoch": 0.43013209965999666, + "grad_norm": 0.6094499230384827, + "learning_rate": 6.237601124373385e-05, + "loss": 1.7885, + "step": 7717 + }, + { + "epoch": 0.4301878379131598, + "grad_norm": 0.5744908452033997, + "learning_rate": 6.236738388677952e-05, + "loss": 1.7993, + "step": 7718 + }, + { + "epoch": 0.43024357616632297, + "grad_norm": 0.6198621392250061, + "learning_rate": 6.23587561376039e-05, + "loss": 1.8437, + "step": 7719 + }, + { + "epoch": 0.4302993144194861, + "grad_norm": 0.5478682518005371, + "learning_rate": 6.235012799648057e-05, + "loss": 1.7246, + "step": 7720 + }, + { + "epoch": 0.43035505267264923, + "grad_norm": 0.5738255381584167, + "learning_rate": 6.23414994636832e-05, + "loss": 1.7322, + "step": 7721 + }, + { + "epoch": 0.43041079092581236, + "grad_norm": 0.6019119024276733, + "learning_rate": 6.233287053948543e-05, + "loss": 1.6743, + "step": 7722 + }, + { + "epoch": 0.43046652917897554, + "grad_norm": 0.5403818488121033, + "learning_rate": 6.23242412241609e-05, + "loss": 1.5439, + "step": 7723 + }, + { + "epoch": 0.43052226743213867, + "grad_norm": 0.5892661213874817, + "learning_rate": 6.23156115179833e-05, + "loss": 2.0254, + "step": 7724 + }, + { + "epoch": 0.4305780056853018, + "grad_norm": 0.6273830533027649, + "learning_rate": 6.230698142122629e-05, + "loss": 1.6787, + "step": 7725 + }, + { + "epoch": 0.430633743938465, + "grad_norm": 0.5560447573661804, + "learning_rate": 6.229835093416361e-05, + "loss": 1.711, + "step": 7726 + }, + { + "epoch": 0.4306894821916281, + "grad_norm": 0.5284225344657898, + "learning_rate": 6.228972005706893e-05, + "loss": 1.5921, + "step": 7727 + }, + { + "epoch": 0.43074522044479124, + "grad_norm": 0.5550575852394104, + "learning_rate": 6.228108879021599e-05, + "loss": 1.5798, + "step": 7728 + }, + { + "epoch": 0.43080095869795443, + "grad_norm": 0.5931698083877563, + "learning_rate": 6.22724571338785e-05, + "loss": 2.0899, + "step": 7729 + }, + { + "epoch": 0.43085669695111756, + "grad_norm": 0.5341006517410278, + "learning_rate": 6.226382508833026e-05, + "loss": 1.6937, + "step": 7730 + }, + { + "epoch": 0.4309124352042807, + "grad_norm": 0.5837813019752502, + "learning_rate": 6.225519265384495e-05, + "loss": 1.7363, + "step": 7731 + }, + { + "epoch": 0.43096817345744387, + "grad_norm": 0.5665456056594849, + "learning_rate": 6.22465598306964e-05, + "loss": 1.6438, + "step": 7732 + }, + { + "epoch": 0.431023911710607, + "grad_norm": 0.7508494257926941, + "learning_rate": 6.223792661915838e-05, + "loss": 1.6701, + "step": 7733 + }, + { + "epoch": 0.43107964996377013, + "grad_norm": 0.5742450952529907, + "learning_rate": 6.222929301950466e-05, + "loss": 1.6195, + "step": 7734 + }, + { + "epoch": 0.43113538821693326, + "grad_norm": 0.5885428190231323, + "learning_rate": 6.222065903200908e-05, + "loss": 1.852, + "step": 7735 + }, + { + "epoch": 0.43119112647009644, + "grad_norm": 0.6054401993751526, + "learning_rate": 6.221202465694545e-05, + "loss": 1.9739, + "step": 7736 + }, + { + "epoch": 0.43124686472325957, + "grad_norm": 0.5252482891082764, + "learning_rate": 6.22033898945876e-05, + "loss": 1.5755, + "step": 7737 + }, + { + "epoch": 0.4313026029764227, + "grad_norm": 0.5708329677581787, + "learning_rate": 6.219475474520936e-05, + "loss": 1.7666, + "step": 7738 + }, + { + "epoch": 0.4313583412295859, + "grad_norm": 0.5406473278999329, + "learning_rate": 6.218611920908461e-05, + "loss": 1.6721, + "step": 7739 + }, + { + "epoch": 0.431414079482749, + "grad_norm": 0.5870915055274963, + "learning_rate": 6.21774832864872e-05, + "loss": 1.635, + "step": 7740 + }, + { + "epoch": 0.43146981773591214, + "grad_norm": 0.5580663681030273, + "learning_rate": 6.216884697769104e-05, + "loss": 1.7878, + "step": 7741 + }, + { + "epoch": 0.4315255559890753, + "grad_norm": 0.6071598529815674, + "learning_rate": 6.216021028296999e-05, + "loss": 1.817, + "step": 7742 + }, + { + "epoch": 0.43158129424223846, + "grad_norm": 0.5742529630661011, + "learning_rate": 6.215157320259798e-05, + "loss": 1.6086, + "step": 7743 + }, + { + "epoch": 0.4316370324954016, + "grad_norm": 0.5802901387214661, + "learning_rate": 6.214293573684889e-05, + "loss": 1.7647, + "step": 7744 + }, + { + "epoch": 0.4316927707485647, + "grad_norm": 0.6176155209541321, + "learning_rate": 6.21342978859967e-05, + "loss": 2.0043, + "step": 7745 + }, + { + "epoch": 0.4317485090017279, + "grad_norm": 0.6097760200500488, + "learning_rate": 6.212565965031532e-05, + "loss": 1.7955, + "step": 7746 + }, + { + "epoch": 0.431804247254891, + "grad_norm": 0.5612444877624512, + "learning_rate": 6.211702103007871e-05, + "loss": 1.6242, + "step": 7747 + }, + { + "epoch": 0.43185998550805416, + "grad_norm": 0.6074878573417664, + "learning_rate": 6.210838202556085e-05, + "loss": 1.5951, + "step": 7748 + }, + { + "epoch": 0.43191572376121734, + "grad_norm": 0.5827562808990479, + "learning_rate": 6.209974263703569e-05, + "loss": 1.849, + "step": 7749 + }, + { + "epoch": 0.43197146201438047, + "grad_norm": 0.5888208746910095, + "learning_rate": 6.209110286477727e-05, + "loss": 1.7899, + "step": 7750 + }, + { + "epoch": 0.4320272002675436, + "grad_norm": 0.5709846019744873, + "learning_rate": 6.208246270905952e-05, + "loss": 1.8588, + "step": 7751 + }, + { + "epoch": 0.4320829385207068, + "grad_norm": 0.5687053203582764, + "learning_rate": 6.207382217015655e-05, + "loss": 1.7115, + "step": 7752 + }, + { + "epoch": 0.4321386767738699, + "grad_norm": 0.5730668306350708, + "learning_rate": 6.206518124834231e-05, + "loss": 1.7556, + "step": 7753 + }, + { + "epoch": 0.43219441502703304, + "grad_norm": 0.48593658208847046, + "learning_rate": 6.205653994389087e-05, + "loss": 1.4447, + "step": 7754 + }, + { + "epoch": 0.4322501532801962, + "grad_norm": 0.5364407896995544, + "learning_rate": 6.204789825707626e-05, + "loss": 1.7097, + "step": 7755 + }, + { + "epoch": 0.43230589153335935, + "grad_norm": 0.5474497079849243, + "learning_rate": 6.203925618817258e-05, + "loss": 1.6242, + "step": 7756 + }, + { + "epoch": 0.4323616297865225, + "grad_norm": 0.5366718173027039, + "learning_rate": 6.203061373745388e-05, + "loss": 1.6055, + "step": 7757 + }, + { + "epoch": 0.4324173680396856, + "grad_norm": 0.6138222813606262, + "learning_rate": 6.202197090519428e-05, + "loss": 1.6537, + "step": 7758 + }, + { + "epoch": 0.4324731062928488, + "grad_norm": 0.5678575038909912, + "learning_rate": 6.201332769166782e-05, + "loss": 1.5895, + "step": 7759 + }, + { + "epoch": 0.4325288445460119, + "grad_norm": 0.5866283178329468, + "learning_rate": 6.200468409714866e-05, + "loss": 1.6663, + "step": 7760 + }, + { + "epoch": 0.43258458279917505, + "grad_norm": 0.5652245879173279, + "learning_rate": 6.199604012191093e-05, + "loss": 1.6446, + "step": 7761 + }, + { + "epoch": 0.43264032105233824, + "grad_norm": 0.5838261842727661, + "learning_rate": 6.198739576622872e-05, + "loss": 1.8155, + "step": 7762 + }, + { + "epoch": 0.43269605930550137, + "grad_norm": 0.537699818611145, + "learning_rate": 6.197875103037623e-05, + "loss": 1.6124, + "step": 7763 + }, + { + "epoch": 0.4327517975586645, + "grad_norm": 0.6197475790977478, + "learning_rate": 6.197010591462758e-05, + "loss": 1.72, + "step": 7764 + }, + { + "epoch": 0.4328075358118277, + "grad_norm": 0.5581753253936768, + "learning_rate": 6.196146041925697e-05, + "loss": 1.6948, + "step": 7765 + }, + { + "epoch": 0.4328632740649908, + "grad_norm": 0.5555060505867004, + "learning_rate": 6.195281454453858e-05, + "loss": 1.5966, + "step": 7766 + }, + { + "epoch": 0.43291901231815394, + "grad_norm": 0.5592203140258789, + "learning_rate": 6.19441682907466e-05, + "loss": 1.8594, + "step": 7767 + }, + { + "epoch": 0.43297475057131707, + "grad_norm": 0.5492338538169861, + "learning_rate": 6.193552165815525e-05, + "loss": 1.707, + "step": 7768 + }, + { + "epoch": 0.43303048882448025, + "grad_norm": 0.5119403600692749, + "learning_rate": 6.192687464703873e-05, + "loss": 1.3713, + "step": 7769 + }, + { + "epoch": 0.4330862270776434, + "grad_norm": 0.6076398491859436, + "learning_rate": 6.191822725767129e-05, + "loss": 1.7667, + "step": 7770 + }, + { + "epoch": 0.4331419653308065, + "grad_norm": 0.5796701312065125, + "learning_rate": 6.190957949032716e-05, + "loss": 1.688, + "step": 7771 + }, + { + "epoch": 0.4331977035839697, + "grad_norm": 0.5363877415657043, + "learning_rate": 6.190093134528061e-05, + "loss": 1.6081, + "step": 7772 + }, + { + "epoch": 0.4332534418371328, + "grad_norm": 0.5938536524772644, + "learning_rate": 6.189228282280592e-05, + "loss": 1.7503, + "step": 7773 + }, + { + "epoch": 0.43330918009029595, + "grad_norm": 0.5643225312232971, + "learning_rate": 6.188363392317734e-05, + "loss": 1.7848, + "step": 7774 + }, + { + "epoch": 0.43336491834345914, + "grad_norm": 0.5852196216583252, + "learning_rate": 6.187498464666917e-05, + "loss": 1.8112, + "step": 7775 + }, + { + "epoch": 0.43342065659662227, + "grad_norm": 0.5774117112159729, + "learning_rate": 6.186633499355576e-05, + "loss": 1.5268, + "step": 7776 + }, + { + "epoch": 0.4334763948497854, + "grad_norm": 0.5480836033821106, + "learning_rate": 6.185768496411135e-05, + "loss": 1.6839, + "step": 7777 + }, + { + "epoch": 0.4335321331029486, + "grad_norm": 0.5210850834846497, + "learning_rate": 6.184903455861032e-05, + "loss": 1.592, + "step": 7778 + }, + { + "epoch": 0.4335878713561117, + "grad_norm": 0.532539427280426, + "learning_rate": 6.1840383777327e-05, + "loss": 1.7992, + "step": 7779 + }, + { + "epoch": 0.43364360960927484, + "grad_norm": 0.5546075105667114, + "learning_rate": 6.183173262053575e-05, + "loss": 1.76, + "step": 7780 + }, + { + "epoch": 0.43369934786243797, + "grad_norm": 0.5634498000144958, + "learning_rate": 6.182308108851091e-05, + "loss": 1.5548, + "step": 7781 + }, + { + "epoch": 0.43375508611560115, + "grad_norm": 0.5091983079910278, + "learning_rate": 6.18144291815269e-05, + "loss": 1.4981, + "step": 7782 + }, + { + "epoch": 0.4338108243687643, + "grad_norm": 0.550807535648346, + "learning_rate": 6.180577689985805e-05, + "loss": 1.6661, + "step": 7783 + }, + { + "epoch": 0.4338665626219274, + "grad_norm": 0.5441664457321167, + "learning_rate": 6.179712424377879e-05, + "loss": 1.6262, + "step": 7784 + }, + { + "epoch": 0.4339223008750906, + "grad_norm": 0.620506227016449, + "learning_rate": 6.178847121356353e-05, + "loss": 1.9091, + "step": 7785 + }, + { + "epoch": 0.4339780391282537, + "grad_norm": 0.6028100252151489, + "learning_rate": 6.17798178094867e-05, + "loss": 1.7357, + "step": 7786 + }, + { + "epoch": 0.43403377738141685, + "grad_norm": 0.549159049987793, + "learning_rate": 6.177116403182274e-05, + "loss": 1.6313, + "step": 7787 + }, + { + "epoch": 0.43408951563458004, + "grad_norm": 0.5400141477584839, + "learning_rate": 6.176250988084608e-05, + "loss": 1.605, + "step": 7788 + }, + { + "epoch": 0.43414525388774317, + "grad_norm": 0.5363699793815613, + "learning_rate": 6.17538553568312e-05, + "loss": 1.5072, + "step": 7789 + }, + { + "epoch": 0.4342009921409063, + "grad_norm": 0.5816105604171753, + "learning_rate": 6.174520046005253e-05, + "loss": 1.769, + "step": 7790 + }, + { + "epoch": 0.4342567303940694, + "grad_norm": 0.5653383731842041, + "learning_rate": 6.17365451907846e-05, + "loss": 1.6427, + "step": 7791 + }, + { + "epoch": 0.4343124686472326, + "grad_norm": 0.5933492183685303, + "learning_rate": 6.172788954930188e-05, + "loss": 1.7614, + "step": 7792 + }, + { + "epoch": 0.43436820690039574, + "grad_norm": 0.5355760455131531, + "learning_rate": 6.171923353587888e-05, + "loss": 1.7932, + "step": 7793 + }, + { + "epoch": 0.43442394515355887, + "grad_norm": 0.5630636811256409, + "learning_rate": 6.171057715079012e-05, + "loss": 1.5032, + "step": 7794 + }, + { + "epoch": 0.43447968340672205, + "grad_norm": 0.5832585692405701, + "learning_rate": 6.170192039431013e-05, + "loss": 1.7822, + "step": 7795 + }, + { + "epoch": 0.4345354216598852, + "grad_norm": 0.4809796214103699, + "learning_rate": 6.169326326671346e-05, + "loss": 1.4389, + "step": 7796 + }, + { + "epoch": 0.4345911599130483, + "grad_norm": 0.5459611415863037, + "learning_rate": 6.168460576827465e-05, + "loss": 1.6287, + "step": 7797 + }, + { + "epoch": 0.4346468981662115, + "grad_norm": 0.5732072591781616, + "learning_rate": 6.167594789926827e-05, + "loss": 1.9769, + "step": 7798 + }, + { + "epoch": 0.4347026364193746, + "grad_norm": 0.5578893423080444, + "learning_rate": 6.16672896599689e-05, + "loss": 1.8077, + "step": 7799 + }, + { + "epoch": 0.43475837467253775, + "grad_norm": 0.5882522463798523, + "learning_rate": 6.165863105065113e-05, + "loss": 1.7451, + "step": 7800 + }, + { + "epoch": 0.43481411292570094, + "grad_norm": 0.6155940890312195, + "learning_rate": 6.164997207158954e-05, + "loss": 1.809, + "step": 7801 + }, + { + "epoch": 0.43486985117886406, + "grad_norm": 0.5675914883613586, + "learning_rate": 6.164131272305878e-05, + "loss": 1.7839, + "step": 7802 + }, + { + "epoch": 0.4349255894320272, + "grad_norm": 0.5673891305923462, + "learning_rate": 6.163265300533345e-05, + "loss": 1.6121, + "step": 7803 + }, + { + "epoch": 0.4349813276851903, + "grad_norm": 0.5579030513763428, + "learning_rate": 6.162399291868819e-05, + "loss": 1.7024, + "step": 7804 + }, + { + "epoch": 0.4350370659383535, + "grad_norm": 0.5674803256988525, + "learning_rate": 6.161533246339764e-05, + "loss": 1.702, + "step": 7805 + }, + { + "epoch": 0.43509280419151664, + "grad_norm": 0.5546411275863647, + "learning_rate": 6.160667163973648e-05, + "loss": 1.7928, + "step": 7806 + }, + { + "epoch": 0.43514854244467976, + "grad_norm": 0.6025899648666382, + "learning_rate": 6.159801044797936e-05, + "loss": 1.7094, + "step": 7807 + }, + { + "epoch": 0.43520428069784295, + "grad_norm": 0.5264720916748047, + "learning_rate": 6.158934888840095e-05, + "loss": 1.4788, + "step": 7808 + }, + { + "epoch": 0.4352600189510061, + "grad_norm": 0.6098587512969971, + "learning_rate": 6.158068696127601e-05, + "loss": 1.789, + "step": 7809 + }, + { + "epoch": 0.4353157572041692, + "grad_norm": 0.5427471995353699, + "learning_rate": 6.157202466687916e-05, + "loss": 1.7309, + "step": 7810 + }, + { + "epoch": 0.4353714954573324, + "grad_norm": 0.5572206974029541, + "learning_rate": 6.156336200548517e-05, + "loss": 1.7018, + "step": 7811 + }, + { + "epoch": 0.4354272337104955, + "grad_norm": 0.5554936528205872, + "learning_rate": 6.155469897736874e-05, + "loss": 1.6621, + "step": 7812 + }, + { + "epoch": 0.43548297196365865, + "grad_norm": 0.5617427229881287, + "learning_rate": 6.154603558280466e-05, + "loss": 1.7123, + "step": 7813 + }, + { + "epoch": 0.4355387102168218, + "grad_norm": 0.572582483291626, + "learning_rate": 6.153737182206762e-05, + "loss": 1.7392, + "step": 7814 + }, + { + "epoch": 0.43559444846998496, + "grad_norm": 0.5278533101081848, + "learning_rate": 6.152870769543245e-05, + "loss": 1.5766, + "step": 7815 + }, + { + "epoch": 0.4356501867231481, + "grad_norm": 0.5663198232650757, + "learning_rate": 6.152004320317385e-05, + "loss": 1.6999, + "step": 7816 + }, + { + "epoch": 0.4357059249763112, + "grad_norm": 0.5262326598167419, + "learning_rate": 6.151137834556666e-05, + "loss": 1.569, + "step": 7817 + }, + { + "epoch": 0.4357616632294744, + "grad_norm": 0.6140465140342712, + "learning_rate": 6.150271312288566e-05, + "loss": 1.9939, + "step": 7818 + }, + { + "epoch": 0.43581740148263753, + "grad_norm": 0.5997401475906372, + "learning_rate": 6.149404753540567e-05, + "loss": 1.7254, + "step": 7819 + }, + { + "epoch": 0.43587313973580066, + "grad_norm": 0.6437683701515198, + "learning_rate": 6.14853815834015e-05, + "loss": 2.0098, + "step": 7820 + }, + { + "epoch": 0.43592887798896385, + "grad_norm": 0.6912010312080383, + "learning_rate": 6.1476715267148e-05, + "loss": 2.1957, + "step": 7821 + }, + { + "epoch": 0.435984616242127, + "grad_norm": 0.5197498202323914, + "learning_rate": 6.146804858692001e-05, + "loss": 1.614, + "step": 7822 + }, + { + "epoch": 0.4360403544952901, + "grad_norm": 0.5308524370193481, + "learning_rate": 6.145938154299237e-05, + "loss": 1.5681, + "step": 7823 + }, + { + "epoch": 0.4360960927484533, + "grad_norm": 0.5914180278778076, + "learning_rate": 6.145071413563996e-05, + "loss": 1.8961, + "step": 7824 + }, + { + "epoch": 0.4361518310016164, + "grad_norm": 0.583292543888092, + "learning_rate": 6.144204636513767e-05, + "loss": 1.7469, + "step": 7825 + }, + { + "epoch": 0.43620756925477955, + "grad_norm": 0.6572228074073792, + "learning_rate": 6.143337823176038e-05, + "loss": 1.8796, + "step": 7826 + }, + { + "epoch": 0.4362633075079427, + "grad_norm": 0.5719166994094849, + "learning_rate": 6.142470973578299e-05, + "loss": 1.8995, + "step": 7827 + }, + { + "epoch": 0.43631904576110586, + "grad_norm": 0.561431348323822, + "learning_rate": 6.141604087748043e-05, + "loss": 1.544, + "step": 7828 + }, + { + "epoch": 0.436374784014269, + "grad_norm": 0.5519416928291321, + "learning_rate": 6.14073716571276e-05, + "loss": 1.7948, + "step": 7829 + }, + { + "epoch": 0.4364305222674321, + "grad_norm": 0.5517488718032837, + "learning_rate": 6.139870207499945e-05, + "loss": 1.6391, + "step": 7830 + }, + { + "epoch": 0.4364862605205953, + "grad_norm": 0.5172828435897827, + "learning_rate": 6.139003213137092e-05, + "loss": 1.7099, + "step": 7831 + }, + { + "epoch": 0.43654199877375843, + "grad_norm": 0.5379384756088257, + "learning_rate": 6.1381361826517e-05, + "loss": 1.5748, + "step": 7832 + }, + { + "epoch": 0.43659773702692156, + "grad_norm": 0.5668090581893921, + "learning_rate": 6.137269116071263e-05, + "loss": 1.6389, + "step": 7833 + }, + { + "epoch": 0.43665347528008475, + "grad_norm": 0.5936790704727173, + "learning_rate": 6.13640201342328e-05, + "loss": 1.7916, + "step": 7834 + }, + { + "epoch": 0.4367092135332479, + "grad_norm": 0.5564102530479431, + "learning_rate": 6.135534874735253e-05, + "loss": 1.6772, + "step": 7835 + }, + { + "epoch": 0.436764951786411, + "grad_norm": 0.6297538876533508, + "learning_rate": 6.134667700034678e-05, + "loss": 1.6905, + "step": 7836 + }, + { + "epoch": 0.43682069003957413, + "grad_norm": 0.5488330125808716, + "learning_rate": 6.13380048934906e-05, + "loss": 1.4808, + "step": 7837 + }, + { + "epoch": 0.4368764282927373, + "grad_norm": 0.5490309000015259, + "learning_rate": 6.132933242705899e-05, + "loss": 1.4744, + "step": 7838 + }, + { + "epoch": 0.43693216654590045, + "grad_norm": 0.5560508370399475, + "learning_rate": 6.132065960132705e-05, + "loss": 1.5957, + "step": 7839 + }, + { + "epoch": 0.4369879047990636, + "grad_norm": 0.6161486506462097, + "learning_rate": 6.131198641656976e-05, + "loss": 1.7756, + "step": 7840 + }, + { + "epoch": 0.43704364305222676, + "grad_norm": 0.5948550701141357, + "learning_rate": 6.130331287306224e-05, + "loss": 1.8239, + "step": 7841 + }, + { + "epoch": 0.4370993813053899, + "grad_norm": 0.5820697546005249, + "learning_rate": 6.129463897107951e-05, + "loss": 1.5446, + "step": 7842 + }, + { + "epoch": 0.437155119558553, + "grad_norm": 0.5708462595939636, + "learning_rate": 6.128596471089669e-05, + "loss": 1.7479, + "step": 7843 + }, + { + "epoch": 0.4372108578117162, + "grad_norm": 0.543056309223175, + "learning_rate": 6.127729009278889e-05, + "loss": 1.5951, + "step": 7844 + }, + { + "epoch": 0.43726659606487933, + "grad_norm": 0.5421169400215149, + "learning_rate": 6.126861511703119e-05, + "loss": 1.7609, + "step": 7845 + }, + { + "epoch": 0.43732233431804246, + "grad_norm": 0.5461887121200562, + "learning_rate": 6.125993978389871e-05, + "loss": 1.559, + "step": 7846 + }, + { + "epoch": 0.43737807257120564, + "grad_norm": 0.5687921643257141, + "learning_rate": 6.12512640936666e-05, + "loss": 1.8498, + "step": 7847 + }, + { + "epoch": 0.4374338108243688, + "grad_norm": 0.571535050868988, + "learning_rate": 6.124258804660999e-05, + "loss": 1.6316, + "step": 7848 + }, + { + "epoch": 0.4374895490775319, + "grad_norm": 0.5363306999206543, + "learning_rate": 6.123391164300404e-05, + "loss": 1.5648, + "step": 7849 + }, + { + "epoch": 0.43754528733069503, + "grad_norm": 0.5810931324958801, + "learning_rate": 6.12252348831239e-05, + "loss": 1.6624, + "step": 7850 + }, + { + "epoch": 0.4376010255838582, + "grad_norm": 0.54121994972229, + "learning_rate": 6.121655776724475e-05, + "loss": 1.617, + "step": 7851 + }, + { + "epoch": 0.43765676383702135, + "grad_norm": 0.54410719871521, + "learning_rate": 6.120788029564181e-05, + "loss": 1.6805, + "step": 7852 + }, + { + "epoch": 0.4377125020901845, + "grad_norm": 0.5891941785812378, + "learning_rate": 6.119920246859024e-05, + "loss": 1.51, + "step": 7853 + }, + { + "epoch": 0.43776824034334766, + "grad_norm": 0.625268280506134, + "learning_rate": 6.119052428636529e-05, + "loss": 1.9405, + "step": 7854 + }, + { + "epoch": 0.4378239785965108, + "grad_norm": 0.5463603138923645, + "learning_rate": 6.118184574924212e-05, + "loss": 1.6922, + "step": 7855 + }, + { + "epoch": 0.4378797168496739, + "grad_norm": 0.6116244196891785, + "learning_rate": 6.1173166857496e-05, + "loss": 1.7829, + "step": 7856 + }, + { + "epoch": 0.4379354551028371, + "grad_norm": 0.60081547498703, + "learning_rate": 6.116448761140218e-05, + "loss": 1.9078, + "step": 7857 + }, + { + "epoch": 0.43799119335600023, + "grad_norm": 0.5881320238113403, + "learning_rate": 6.11558080112359e-05, + "loss": 1.4085, + "step": 7858 + }, + { + "epoch": 0.43804693160916336, + "grad_norm": 0.5768188238143921, + "learning_rate": 6.114712805727244e-05, + "loss": 1.8526, + "step": 7859 + }, + { + "epoch": 0.4381026698623265, + "grad_norm": 0.530643105506897, + "learning_rate": 6.113844774978706e-05, + "loss": 1.6052, + "step": 7860 + }, + { + "epoch": 0.4381584081154897, + "grad_norm": 0.5398595929145813, + "learning_rate": 6.112976708905508e-05, + "loss": 1.7706, + "step": 7861 + }, + { + "epoch": 0.4382141463686528, + "grad_norm": 0.5204975008964539, + "learning_rate": 6.112108607535176e-05, + "loss": 1.6883, + "step": 7862 + }, + { + "epoch": 0.43826988462181593, + "grad_norm": 0.7956941723823547, + "learning_rate": 6.111240470895245e-05, + "loss": 1.4164, + "step": 7863 + }, + { + "epoch": 0.4383256228749791, + "grad_norm": 0.5599929094314575, + "learning_rate": 6.110372299013243e-05, + "loss": 1.7575, + "step": 7864 + }, + { + "epoch": 0.43838136112814224, + "grad_norm": 0.5534434914588928, + "learning_rate": 6.109504091916707e-05, + "loss": 1.825, + "step": 7865 + }, + { + "epoch": 0.4384370993813054, + "grad_norm": 0.5528411269187927, + "learning_rate": 6.108635849633169e-05, + "loss": 1.5657, + "step": 7866 + }, + { + "epoch": 0.43849283763446856, + "grad_norm": 0.5750871300697327, + "learning_rate": 6.107767572190168e-05, + "loss": 2.019, + "step": 7867 + }, + { + "epoch": 0.4385485758876317, + "grad_norm": 0.5783527493476868, + "learning_rate": 6.106899259615236e-05, + "loss": 1.5383, + "step": 7868 + }, + { + "epoch": 0.4386043141407948, + "grad_norm": 0.5577226877212524, + "learning_rate": 6.106030911935913e-05, + "loss": 1.8226, + "step": 7869 + }, + { + "epoch": 0.438660052393958, + "grad_norm": 0.5514130592346191, + "learning_rate": 6.105162529179738e-05, + "loss": 1.8757, + "step": 7870 + }, + { + "epoch": 0.43871579064712113, + "grad_norm": 0.5459834337234497, + "learning_rate": 6.104294111374252e-05, + "loss": 1.6836, + "step": 7871 + }, + { + "epoch": 0.43877152890028426, + "grad_norm": 0.5836615562438965, + "learning_rate": 6.103425658546995e-05, + "loss": 1.7928, + "step": 7872 + }, + { + "epoch": 0.4388272671534474, + "grad_norm": 0.552156925201416, + "learning_rate": 6.1025571707255104e-05, + "loss": 1.7313, + "step": 7873 + }, + { + "epoch": 0.43888300540661057, + "grad_norm": 0.5519532561302185, + "learning_rate": 6.10168864793734e-05, + "loss": 1.7947, + "step": 7874 + }, + { + "epoch": 0.4389387436597737, + "grad_norm": 0.5163867473602295, + "learning_rate": 6.100820090210028e-05, + "loss": 1.5192, + "step": 7875 + }, + { + "epoch": 0.43899448191293683, + "grad_norm": 0.5566312074661255, + "learning_rate": 6.099951497571123e-05, + "loss": 1.5993, + "step": 7876 + }, + { + "epoch": 0.4390502201661, + "grad_norm": 0.5464503765106201, + "learning_rate": 6.099082870048168e-05, + "loss": 1.8421, + "step": 7877 + }, + { + "epoch": 0.43910595841926314, + "grad_norm": 0.5337437987327576, + "learning_rate": 6.098214207668713e-05, + "loss": 1.5466, + "step": 7878 + }, + { + "epoch": 0.43916169667242627, + "grad_norm": 0.6034952402114868, + "learning_rate": 6.097345510460307e-05, + "loss": 1.8151, + "step": 7879 + }, + { + "epoch": 0.43921743492558946, + "grad_norm": 0.5526003241539001, + "learning_rate": 6.0964767784504995e-05, + "loss": 1.6425, + "step": 7880 + }, + { + "epoch": 0.4392731731787526, + "grad_norm": 0.575605571269989, + "learning_rate": 6.09560801166684e-05, + "loss": 1.7276, + "step": 7881 + }, + { + "epoch": 0.4393289114319157, + "grad_norm": 0.6006867289543152, + "learning_rate": 6.094739210136883e-05, + "loss": 1.7726, + "step": 7882 + }, + { + "epoch": 0.43938464968507884, + "grad_norm": 0.5347257852554321, + "learning_rate": 6.093870373888181e-05, + "loss": 1.6228, + "step": 7883 + }, + { + "epoch": 0.439440387938242, + "grad_norm": 0.5642088651657104, + "learning_rate": 6.093001502948289e-05, + "loss": 1.7197, + "step": 7884 + }, + { + "epoch": 0.43949612619140516, + "grad_norm": 0.5518479943275452, + "learning_rate": 6.0921325973447604e-05, + "loss": 1.5778, + "step": 7885 + }, + { + "epoch": 0.4395518644445683, + "grad_norm": 0.6168820261955261, + "learning_rate": 6.091263657105155e-05, + "loss": 1.7891, + "step": 7886 + }, + { + "epoch": 0.43960760269773147, + "grad_norm": 0.5440758466720581, + "learning_rate": 6.090394682257029e-05, + "loss": 1.5781, + "step": 7887 + }, + { + "epoch": 0.4396633409508946, + "grad_norm": 0.5412326455116272, + "learning_rate": 6.08952567282794e-05, + "loss": 1.683, + "step": 7888 + }, + { + "epoch": 0.43971907920405773, + "grad_norm": 0.563556969165802, + "learning_rate": 6.0886566288454496e-05, + "loss": 1.5673, + "step": 7889 + }, + { + "epoch": 0.4397748174572209, + "grad_norm": 0.5224372148513794, + "learning_rate": 6.0877875503371176e-05, + "loss": 1.7352, + "step": 7890 + }, + { + "epoch": 0.43983055571038404, + "grad_norm": 0.5953571796417236, + "learning_rate": 6.086918437330508e-05, + "loss": 1.7736, + "step": 7891 + }, + { + "epoch": 0.43988629396354717, + "grad_norm": 0.5646018385887146, + "learning_rate": 6.086049289853182e-05, + "loss": 1.7542, + "step": 7892 + }, + { + "epoch": 0.43994203221671035, + "grad_norm": 0.6011926531791687, + "learning_rate": 6.0851801079327056e-05, + "loss": 1.7245, + "step": 7893 + }, + { + "epoch": 0.4399977704698735, + "grad_norm": 0.4823513627052307, + "learning_rate": 6.0843108915966415e-05, + "loss": 1.4047, + "step": 7894 + }, + { + "epoch": 0.4400535087230366, + "grad_norm": 0.6140894889831543, + "learning_rate": 6.083441640872558e-05, + "loss": 2.0188, + "step": 7895 + }, + { + "epoch": 0.44010924697619974, + "grad_norm": 0.5411475896835327, + "learning_rate": 6.082572355788023e-05, + "loss": 1.5408, + "step": 7896 + }, + { + "epoch": 0.4401649852293629, + "grad_norm": 0.6488401293754578, + "learning_rate": 6.081703036370606e-05, + "loss": 2.0136, + "step": 7897 + }, + { + "epoch": 0.44022072348252606, + "grad_norm": 0.7427087426185608, + "learning_rate": 6.080833682647874e-05, + "loss": 1.6615, + "step": 7898 + }, + { + "epoch": 0.4402764617356892, + "grad_norm": 0.6195456385612488, + "learning_rate": 6.0799642946473986e-05, + "loss": 1.5859, + "step": 7899 + }, + { + "epoch": 0.44033219998885237, + "grad_norm": 0.5988082885742188, + "learning_rate": 6.079094872396754e-05, + "loss": 1.7462, + "step": 7900 + }, + { + "epoch": 0.4403879382420155, + "grad_norm": 0.6001728177070618, + "learning_rate": 6.0782254159235116e-05, + "loss": 1.736, + "step": 7901 + }, + { + "epoch": 0.4404436764951786, + "grad_norm": 0.5472791790962219, + "learning_rate": 6.0773559252552446e-05, + "loss": 1.372, + "step": 7902 + }, + { + "epoch": 0.4404994147483418, + "grad_norm": 0.5791669487953186, + "learning_rate": 6.0764864004195286e-05, + "loss": 1.7732, + "step": 7903 + }, + { + "epoch": 0.44055515300150494, + "grad_norm": 0.5353814363479614, + "learning_rate": 6.075616841443943e-05, + "loss": 1.8002, + "step": 7904 + }, + { + "epoch": 0.44061089125466807, + "grad_norm": 0.5734871029853821, + "learning_rate": 6.07474724835606e-05, + "loss": 1.7832, + "step": 7905 + }, + { + "epoch": 0.4406666295078312, + "grad_norm": 0.6158138513565063, + "learning_rate": 6.0738776211834615e-05, + "loss": 1.9006, + "step": 7906 + }, + { + "epoch": 0.4407223677609944, + "grad_norm": 0.5585591793060303, + "learning_rate": 6.073007959953726e-05, + "loss": 1.8046, + "step": 7907 + }, + { + "epoch": 0.4407781060141575, + "grad_norm": 0.5921459794044495, + "learning_rate": 6.0721382646944326e-05, + "loss": 1.8318, + "step": 7908 + }, + { + "epoch": 0.44083384426732064, + "grad_norm": 0.5314304828643799, + "learning_rate": 6.0712685354331654e-05, + "loss": 1.4663, + "step": 7909 + }, + { + "epoch": 0.4408895825204838, + "grad_norm": 0.5642038583755493, + "learning_rate": 6.0703987721975076e-05, + "loss": 1.6231, + "step": 7910 + }, + { + "epoch": 0.44094532077364695, + "grad_norm": 0.598506510257721, + "learning_rate": 6.0695289750150394e-05, + "loss": 1.6668, + "step": 7911 + }, + { + "epoch": 0.4410010590268101, + "grad_norm": 0.5824127197265625, + "learning_rate": 6.068659143913349e-05, + "loss": 1.7711, + "step": 7912 + }, + { + "epoch": 0.44105679727997327, + "grad_norm": 0.5553746223449707, + "learning_rate": 6.0677892789200216e-05, + "loss": 1.7025, + "step": 7913 + }, + { + "epoch": 0.4411125355331364, + "grad_norm": 0.5868836641311646, + "learning_rate": 6.066919380062643e-05, + "loss": 1.7495, + "step": 7914 + }, + { + "epoch": 0.4411682737862995, + "grad_norm": 0.5977121591567993, + "learning_rate": 6.066049447368802e-05, + "loss": 1.5988, + "step": 7915 + }, + { + "epoch": 0.4412240120394627, + "grad_norm": 0.6062576770782471, + "learning_rate": 6.065179480866089e-05, + "loss": 1.7006, + "step": 7916 + }, + { + "epoch": 0.44127975029262584, + "grad_norm": 0.5636418461799622, + "learning_rate": 6.064309480582093e-05, + "loss": 1.6275, + "step": 7917 + }, + { + "epoch": 0.44133548854578897, + "grad_norm": 0.5832415223121643, + "learning_rate": 6.0634394465444056e-05, + "loss": 1.8278, + "step": 7918 + }, + { + "epoch": 0.4413912267989521, + "grad_norm": 0.5471083521842957, + "learning_rate": 6.062569378780621e-05, + "loss": 1.724, + "step": 7919 + }, + { + "epoch": 0.4414469650521153, + "grad_norm": 0.5676271915435791, + "learning_rate": 6.061699277318328e-05, + "loss": 1.706, + "step": 7920 + }, + { + "epoch": 0.4415027033052784, + "grad_norm": 0.5920431613922119, + "learning_rate": 6.060829142185125e-05, + "loss": 1.7118, + "step": 7921 + }, + { + "epoch": 0.44155844155844154, + "grad_norm": 0.6104030609130859, + "learning_rate": 6.059958973408607e-05, + "loss": 1.908, + "step": 7922 + }, + { + "epoch": 0.4416141798116047, + "grad_norm": 0.5903329849243164, + "learning_rate": 6.05908877101637e-05, + "loss": 1.7077, + "step": 7923 + }, + { + "epoch": 0.44166991806476785, + "grad_norm": 0.5489821434020996, + "learning_rate": 6.058218535036013e-05, + "loss": 1.6519, + "step": 7924 + }, + { + "epoch": 0.441725656317931, + "grad_norm": 0.5121790170669556, + "learning_rate": 6.057348265495133e-05, + "loss": 1.4665, + "step": 7925 + }, + { + "epoch": 0.44178139457109417, + "grad_norm": 0.5221953392028809, + "learning_rate": 6.0564779624213316e-05, + "loss": 1.6157, + "step": 7926 + }, + { + "epoch": 0.4418371328242573, + "grad_norm": 0.5600380897521973, + "learning_rate": 6.055607625842208e-05, + "loss": 1.5828, + "step": 7927 + }, + { + "epoch": 0.4418928710774204, + "grad_norm": 0.5320744514465332, + "learning_rate": 6.0547372557853655e-05, + "loss": 1.6772, + "step": 7928 + }, + { + "epoch": 0.44194860933058355, + "grad_norm": 0.5403137803077698, + "learning_rate": 6.053866852278406e-05, + "loss": 1.7394, + "step": 7929 + }, + { + "epoch": 0.44200434758374674, + "grad_norm": 0.591922402381897, + "learning_rate": 6.052996415348936e-05, + "loss": 1.8231, + "step": 7930 + }, + { + "epoch": 0.44206008583690987, + "grad_norm": 0.5516440868377686, + "learning_rate": 6.052125945024558e-05, + "loss": 1.6415, + "step": 7931 + }, + { + "epoch": 0.442115824090073, + "grad_norm": 0.5129381418228149, + "learning_rate": 6.05125544133288e-05, + "loss": 1.5515, + "step": 7932 + }, + { + "epoch": 0.4421715623432362, + "grad_norm": 0.5778689980506897, + "learning_rate": 6.050384904301508e-05, + "loss": 1.7348, + "step": 7933 + }, + { + "epoch": 0.4422273005963993, + "grad_norm": 0.5508379340171814, + "learning_rate": 6.049514333958052e-05, + "loss": 1.6601, + "step": 7934 + }, + { + "epoch": 0.44228303884956244, + "grad_norm": 0.5481617450714111, + "learning_rate": 6.048643730330119e-05, + "loss": 1.5493, + "step": 7935 + }, + { + "epoch": 0.4423387771027256, + "grad_norm": 0.5237631797790527, + "learning_rate": 6.0477730934453226e-05, + "loss": 1.5092, + "step": 7936 + }, + { + "epoch": 0.44239451535588875, + "grad_norm": 0.5657276511192322, + "learning_rate": 6.046902423331271e-05, + "loss": 1.4483, + "step": 7937 + }, + { + "epoch": 0.4424502536090519, + "grad_norm": 0.5502325892448425, + "learning_rate": 6.046031720015579e-05, + "loss": 1.6987, + "step": 7938 + }, + { + "epoch": 0.44250599186221506, + "grad_norm": 0.6082862615585327, + "learning_rate": 6.045160983525859e-05, + "loss": 1.8988, + "step": 7939 + }, + { + "epoch": 0.4425617301153782, + "grad_norm": 0.5569537878036499, + "learning_rate": 6.044290213889727e-05, + "loss": 1.696, + "step": 7940 + }, + { + "epoch": 0.4426174683685413, + "grad_norm": 0.518162190914154, + "learning_rate": 6.0434194111347985e-05, + "loss": 1.5279, + "step": 7941 + }, + { + "epoch": 0.44267320662170445, + "grad_norm": 0.5695126056671143, + "learning_rate": 6.042548575288689e-05, + "loss": 1.7109, + "step": 7942 + }, + { + "epoch": 0.44272894487486764, + "grad_norm": 0.49009808897972107, + "learning_rate": 6.0416777063790184e-05, + "loss": 1.4709, + "step": 7943 + }, + { + "epoch": 0.44278468312803076, + "grad_norm": 0.5802407264709473, + "learning_rate": 6.040806804433403e-05, + "loss": 1.6943, + "step": 7944 + }, + { + "epoch": 0.4428404213811939, + "grad_norm": 0.5507357716560364, + "learning_rate": 6.0399358694794647e-05, + "loss": 1.3918, + "step": 7945 + }, + { + "epoch": 0.4428961596343571, + "grad_norm": 0.5855342745780945, + "learning_rate": 6.039064901544824e-05, + "loss": 1.8103, + "step": 7946 + }, + { + "epoch": 0.4429518978875202, + "grad_norm": 0.5658082365989685, + "learning_rate": 6.038193900657102e-05, + "loss": 1.7597, + "step": 7947 + }, + { + "epoch": 0.44300763614068334, + "grad_norm": 0.5863122344017029, + "learning_rate": 6.037322866843923e-05, + "loss": 1.7671, + "step": 7948 + }, + { + "epoch": 0.4430633743938465, + "grad_norm": 0.5610207915306091, + "learning_rate": 6.036451800132912e-05, + "loss": 1.7487, + "step": 7949 + }, + { + "epoch": 0.44311911264700965, + "grad_norm": 0.5848312377929688, + "learning_rate": 6.03558070055169e-05, + "loss": 1.7112, + "step": 7950 + }, + { + "epoch": 0.4431748509001728, + "grad_norm": 0.5728501081466675, + "learning_rate": 6.0347095681278876e-05, + "loss": 1.7736, + "step": 7951 + }, + { + "epoch": 0.4432305891533359, + "grad_norm": 0.5987431406974792, + "learning_rate": 6.033838402889131e-05, + "loss": 1.7693, + "step": 7952 + }, + { + "epoch": 0.4432863274064991, + "grad_norm": 0.5747002959251404, + "learning_rate": 6.032967204863048e-05, + "loss": 1.6216, + "step": 7953 + }, + { + "epoch": 0.4433420656596622, + "grad_norm": 0.5476230382919312, + "learning_rate": 6.0320959740772666e-05, + "loss": 1.7631, + "step": 7954 + }, + { + "epoch": 0.44339780391282535, + "grad_norm": 0.5305277109146118, + "learning_rate": 6.031224710559419e-05, + "loss": 1.6809, + "step": 7955 + }, + { + "epoch": 0.44345354216598853, + "grad_norm": 0.5442744493484497, + "learning_rate": 6.0303534143371374e-05, + "loss": 1.5357, + "step": 7956 + }, + { + "epoch": 0.44350928041915166, + "grad_norm": 0.5553621053695679, + "learning_rate": 6.029482085438051e-05, + "loss": 1.6955, + "step": 7957 + }, + { + "epoch": 0.4435650186723148, + "grad_norm": 0.5430163741111755, + "learning_rate": 6.028610723889797e-05, + "loss": 1.762, + "step": 7958 + }, + { + "epoch": 0.443620756925478, + "grad_norm": 0.5217944979667664, + "learning_rate": 6.027739329720006e-05, + "loss": 1.4594, + "step": 7959 + }, + { + "epoch": 0.4436764951786411, + "grad_norm": 0.5763014554977417, + "learning_rate": 6.026867902956317e-05, + "loss": 1.7942, + "step": 7960 + }, + { + "epoch": 0.44373223343180423, + "grad_norm": 0.533718466758728, + "learning_rate": 6.025996443626364e-05, + "loss": 1.6659, + "step": 7961 + }, + { + "epoch": 0.4437879716849674, + "grad_norm": 0.5921129584312439, + "learning_rate": 6.0251249517577854e-05, + "loss": 1.9042, + "step": 7962 + }, + { + "epoch": 0.44384370993813055, + "grad_norm": 0.5379483103752136, + "learning_rate": 6.024253427378222e-05, + "loss": 1.6772, + "step": 7963 + }, + { + "epoch": 0.4438994481912937, + "grad_norm": 0.5350393652915955, + "learning_rate": 6.0233818705153114e-05, + "loss": 1.5868, + "step": 7964 + }, + { + "epoch": 0.4439551864444568, + "grad_norm": 0.5462901592254639, + "learning_rate": 6.022510281196695e-05, + "loss": 1.6118, + "step": 7965 + }, + { + "epoch": 0.44401092469762, + "grad_norm": 0.5518479943275452, + "learning_rate": 6.021638659450013e-05, + "loss": 1.4902, + "step": 7966 + }, + { + "epoch": 0.4440666629507831, + "grad_norm": 0.5284306406974792, + "learning_rate": 6.020767005302909e-05, + "loss": 1.5573, + "step": 7967 + }, + { + "epoch": 0.44412240120394625, + "grad_norm": 0.6189160346984863, + "learning_rate": 6.0198953187830277e-05, + "loss": 1.9599, + "step": 7968 + }, + { + "epoch": 0.44417813945710943, + "grad_norm": 0.5723422765731812, + "learning_rate": 6.019023599918014e-05, + "loss": 1.7111, + "step": 7969 + }, + { + "epoch": 0.44423387771027256, + "grad_norm": 0.5545480251312256, + "learning_rate": 6.018151848735511e-05, + "loss": 1.6214, + "step": 7970 + }, + { + "epoch": 0.4442896159634357, + "grad_norm": 0.5693395733833313, + "learning_rate": 6.01728006526317e-05, + "loss": 1.8074, + "step": 7971 + }, + { + "epoch": 0.4443453542165989, + "grad_norm": 0.5313411951065063, + "learning_rate": 6.0164082495286354e-05, + "loss": 1.6405, + "step": 7972 + }, + { + "epoch": 0.444401092469762, + "grad_norm": 0.5680732727050781, + "learning_rate": 6.015536401559556e-05, + "loss": 1.4973, + "step": 7973 + }, + { + "epoch": 0.44445683072292513, + "grad_norm": 0.6219733357429504, + "learning_rate": 6.014664521383584e-05, + "loss": 1.8733, + "step": 7974 + }, + { + "epoch": 0.44451256897608826, + "grad_norm": 0.5903530716896057, + "learning_rate": 6.0137926090283694e-05, + "loss": 1.6334, + "step": 7975 + }, + { + "epoch": 0.44456830722925145, + "grad_norm": 0.6504166722297668, + "learning_rate": 6.0129206645215655e-05, + "loss": 1.7995, + "step": 7976 + }, + { + "epoch": 0.4446240454824146, + "grad_norm": 0.6121776103973389, + "learning_rate": 6.012048687890821e-05, + "loss": 1.8132, + "step": 7977 + }, + { + "epoch": 0.4446797837355777, + "grad_norm": 0.6290067434310913, + "learning_rate": 6.011176679163796e-05, + "loss": 1.9482, + "step": 7978 + }, + { + "epoch": 0.4447355219887409, + "grad_norm": 0.6563844084739685, + "learning_rate": 6.010304638368139e-05, + "loss": 1.8485, + "step": 7979 + }, + { + "epoch": 0.444791260241904, + "grad_norm": 0.5514439940452576, + "learning_rate": 6.009432565531511e-05, + "loss": 1.6343, + "step": 7980 + }, + { + "epoch": 0.44484699849506715, + "grad_norm": 0.5945736765861511, + "learning_rate": 6.008560460681567e-05, + "loss": 1.721, + "step": 7981 + }, + { + "epoch": 0.44490273674823033, + "grad_norm": 0.5428782105445862, + "learning_rate": 6.007688323845966e-05, + "loss": 1.5152, + "step": 7982 + }, + { + "epoch": 0.44495847500139346, + "grad_norm": 0.5975694060325623, + "learning_rate": 6.006816155052366e-05, + "loss": 1.7975, + "step": 7983 + }, + { + "epoch": 0.4450142132545566, + "grad_norm": 0.5683627724647522, + "learning_rate": 6.005943954328429e-05, + "loss": 1.7401, + "step": 7984 + }, + { + "epoch": 0.4450699515077198, + "grad_norm": 0.552085280418396, + "learning_rate": 6.005071721701814e-05, + "loss": 1.5525, + "step": 7985 + }, + { + "epoch": 0.4451256897608829, + "grad_norm": 0.5957344770431519, + "learning_rate": 6.004199457200184e-05, + "loss": 1.8248, + "step": 7986 + }, + { + "epoch": 0.44518142801404603, + "grad_norm": 0.5816213488578796, + "learning_rate": 6.003327160851201e-05, + "loss": 1.5985, + "step": 7987 + }, + { + "epoch": 0.44523716626720916, + "grad_norm": 0.5090708136558533, + "learning_rate": 6.002454832682532e-05, + "loss": 1.4312, + "step": 7988 + }, + { + "epoch": 0.44529290452037235, + "grad_norm": 0.5570594668388367, + "learning_rate": 6.00158247272184e-05, + "loss": 1.6288, + "step": 7989 + }, + { + "epoch": 0.4453486427735355, + "grad_norm": 0.4970921576023102, + "learning_rate": 6.00071008099679e-05, + "loss": 1.2663, + "step": 7990 + }, + { + "epoch": 0.4454043810266986, + "grad_norm": 0.5791414976119995, + "learning_rate": 5.999837657535052e-05, + "loss": 1.8037, + "step": 7991 + }, + { + "epoch": 0.4454601192798618, + "grad_norm": 0.5636151432991028, + "learning_rate": 5.998965202364294e-05, + "loss": 1.6298, + "step": 7992 + }, + { + "epoch": 0.4455158575330249, + "grad_norm": 0.5829344987869263, + "learning_rate": 5.998092715512183e-05, + "loss": 1.6349, + "step": 7993 + }, + { + "epoch": 0.44557159578618805, + "grad_norm": 0.556348979473114, + "learning_rate": 5.9972201970063904e-05, + "loss": 1.5642, + "step": 7994 + }, + { + "epoch": 0.44562733403935123, + "grad_norm": 0.5365790724754333, + "learning_rate": 5.996347646874587e-05, + "loss": 1.6421, + "step": 7995 + }, + { + "epoch": 0.44568307229251436, + "grad_norm": 0.576501190662384, + "learning_rate": 5.9954750651444455e-05, + "loss": 1.6171, + "step": 7996 + }, + { + "epoch": 0.4457388105456775, + "grad_norm": 0.5861379504203796, + "learning_rate": 5.9946024518436406e-05, + "loss": 1.6702, + "step": 7997 + }, + { + "epoch": 0.4457945487988406, + "grad_norm": 0.5348252058029175, + "learning_rate": 5.9937298069998424e-05, + "loss": 1.4339, + "step": 7998 + }, + { + "epoch": 0.4458502870520038, + "grad_norm": 0.5516197085380554, + "learning_rate": 5.99285713064073e-05, + "loss": 1.738, + "step": 7999 + }, + { + "epoch": 0.44590602530516693, + "grad_norm": 0.58391934633255, + "learning_rate": 5.991984422793977e-05, + "loss": 1.4481, + "step": 8000 + }, + { + "epoch": 0.44596176355833006, + "grad_norm": 0.5707566738128662, + "learning_rate": 5.9911116834872624e-05, + "loss": 1.7051, + "step": 8001 + }, + { + "epoch": 0.44601750181149324, + "grad_norm": 0.5384584069252014, + "learning_rate": 5.990238912748265e-05, + "loss": 1.7542, + "step": 8002 + }, + { + "epoch": 0.4460732400646564, + "grad_norm": 0.5866785645484924, + "learning_rate": 5.989366110604662e-05, + "loss": 1.8245, + "step": 8003 + }, + { + "epoch": 0.4461289783178195, + "grad_norm": 0.5644246935844421, + "learning_rate": 5.988493277084134e-05, + "loss": 1.7637, + "step": 8004 + }, + { + "epoch": 0.4461847165709827, + "grad_norm": 0.5331970453262329, + "learning_rate": 5.9876204122143634e-05, + "loss": 1.6303, + "step": 8005 + }, + { + "epoch": 0.4462404548241458, + "grad_norm": 0.5923652648925781, + "learning_rate": 5.98674751602303e-05, + "loss": 1.8505, + "step": 8006 + }, + { + "epoch": 0.44629619307730894, + "grad_norm": 0.5415480136871338, + "learning_rate": 5.985874588537819e-05, + "loss": 1.6483, + "step": 8007 + }, + { + "epoch": 0.44635193133047213, + "grad_norm": 0.5634106397628784, + "learning_rate": 5.985001629786415e-05, + "loss": 1.5566, + "step": 8008 + }, + { + "epoch": 0.44640766958363526, + "grad_norm": 0.5723522901535034, + "learning_rate": 5.9841286397965014e-05, + "loss": 1.7409, + "step": 8009 + }, + { + "epoch": 0.4464634078367984, + "grad_norm": 0.5537884831428528, + "learning_rate": 5.983255618595767e-05, + "loss": 1.712, + "step": 8010 + }, + { + "epoch": 0.4465191460899615, + "grad_norm": 0.5915796160697937, + "learning_rate": 5.982382566211895e-05, + "loss": 1.7699, + "step": 8011 + }, + { + "epoch": 0.4465748843431247, + "grad_norm": 0.6134962439537048, + "learning_rate": 5.981509482672576e-05, + "loss": 1.862, + "step": 8012 + }, + { + "epoch": 0.44663062259628783, + "grad_norm": 0.4997968077659607, + "learning_rate": 5.980636368005499e-05, + "loss": 1.5174, + "step": 8013 + }, + { + "epoch": 0.44668636084945096, + "grad_norm": 0.5801420211791992, + "learning_rate": 5.979763222238354e-05, + "loss": 1.8425, + "step": 8014 + }, + { + "epoch": 0.44674209910261414, + "grad_norm": 0.5159302949905396, + "learning_rate": 5.978890045398833e-05, + "loss": 1.7243, + "step": 8015 + }, + { + "epoch": 0.44679783735577727, + "grad_norm": 0.59089195728302, + "learning_rate": 5.978016837514625e-05, + "loss": 1.8003, + "step": 8016 + }, + { + "epoch": 0.4468535756089404, + "grad_norm": 0.5666080713272095, + "learning_rate": 5.9771435986134274e-05, + "loss": 1.648, + "step": 8017 + }, + { + "epoch": 0.4469093138621036, + "grad_norm": 0.5891024470329285, + "learning_rate": 5.9762703287229304e-05, + "loss": 1.5867, + "step": 8018 + }, + { + "epoch": 0.4469650521152667, + "grad_norm": 0.5871114730834961, + "learning_rate": 5.975397027870831e-05, + "loss": 1.656, + "step": 8019 + }, + { + "epoch": 0.44702079036842984, + "grad_norm": 0.6023023724555969, + "learning_rate": 5.974523696084825e-05, + "loss": 1.6628, + "step": 8020 + }, + { + "epoch": 0.447076528621593, + "grad_norm": 0.5608631372451782, + "learning_rate": 5.97365033339261e-05, + "loss": 1.4316, + "step": 8021 + }, + { + "epoch": 0.44713226687475616, + "grad_norm": 0.5549430251121521, + "learning_rate": 5.972776939821883e-05, + "loss": 1.4696, + "step": 8022 + }, + { + "epoch": 0.4471880051279193, + "grad_norm": 0.5799054503440857, + "learning_rate": 5.971903515400342e-05, + "loss": 1.7885, + "step": 8023 + }, + { + "epoch": 0.4472437433810824, + "grad_norm": 0.5215498208999634, + "learning_rate": 5.971030060155689e-05, + "loss": 1.6956, + "step": 8024 + }, + { + "epoch": 0.4472994816342456, + "grad_norm": 0.5385097861289978, + "learning_rate": 5.970156574115623e-05, + "loss": 1.5434, + "step": 8025 + }, + { + "epoch": 0.44735521988740873, + "grad_norm": 0.5320507287979126, + "learning_rate": 5.969283057307847e-05, + "loss": 1.5207, + "step": 8026 + }, + { + "epoch": 0.44741095814057186, + "grad_norm": 0.53661048412323, + "learning_rate": 5.9684095097600645e-05, + "loss": 1.6211, + "step": 8027 + }, + { + "epoch": 0.44746669639373504, + "grad_norm": 0.5779610872268677, + "learning_rate": 5.967535931499979e-05, + "loss": 1.7282, + "step": 8028 + }, + { + "epoch": 0.44752243464689817, + "grad_norm": 0.5973451137542725, + "learning_rate": 5.966662322555294e-05, + "loss": 1.822, + "step": 8029 + }, + { + "epoch": 0.4475781729000613, + "grad_norm": 0.6070274710655212, + "learning_rate": 5.965788682953717e-05, + "loss": 1.6235, + "step": 8030 + }, + { + "epoch": 0.4476339111532245, + "grad_norm": 0.5565271377563477, + "learning_rate": 5.9649150127229534e-05, + "loss": 1.8248, + "step": 8031 + }, + { + "epoch": 0.4476896494063876, + "grad_norm": 0.5610112547874451, + "learning_rate": 5.964041311890711e-05, + "loss": 1.5738, + "step": 8032 + }, + { + "epoch": 0.44774538765955074, + "grad_norm": 0.5636839270591736, + "learning_rate": 5.9631675804846985e-05, + "loss": 1.5644, + "step": 8033 + }, + { + "epoch": 0.44780112591271387, + "grad_norm": 0.5381824970245361, + "learning_rate": 5.962293818532628e-05, + "loss": 1.6785, + "step": 8034 + }, + { + "epoch": 0.44785686416587706, + "grad_norm": 0.5614325404167175, + "learning_rate": 5.9614200260622066e-05, + "loss": 1.7991, + "step": 8035 + }, + { + "epoch": 0.4479126024190402, + "grad_norm": 0.527214527130127, + "learning_rate": 5.960546203101148e-05, + "loss": 1.6311, + "step": 8036 + }, + { + "epoch": 0.4479683406722033, + "grad_norm": 0.5667834877967834, + "learning_rate": 5.959672349677163e-05, + "loss": 1.4416, + "step": 8037 + }, + { + "epoch": 0.4480240789253665, + "grad_norm": 0.5953390002250671, + "learning_rate": 5.9587984658179676e-05, + "loss": 1.8168, + "step": 8038 + }, + { + "epoch": 0.4480798171785296, + "grad_norm": 0.5339275598526001, + "learning_rate": 5.957924551551275e-05, + "loss": 1.6999, + "step": 8039 + }, + { + "epoch": 0.44813555543169276, + "grad_norm": 0.5568943619728088, + "learning_rate": 5.9570506069048e-05, + "loss": 1.7066, + "step": 8040 + }, + { + "epoch": 0.44819129368485594, + "grad_norm": 0.5787097215652466, + "learning_rate": 5.95617663190626e-05, + "loss": 1.6468, + "step": 8041 + }, + { + "epoch": 0.44824703193801907, + "grad_norm": 0.5685398578643799, + "learning_rate": 5.955302626583374e-05, + "loss": 1.8804, + "step": 8042 + }, + { + "epoch": 0.4483027701911822, + "grad_norm": 0.5303986668586731, + "learning_rate": 5.9544285909638566e-05, + "loss": 1.4389, + "step": 8043 + }, + { + "epoch": 0.4483585084443453, + "grad_norm": 0.5936418771743774, + "learning_rate": 5.953554525075429e-05, + "loss": 1.9128, + "step": 8044 + }, + { + "epoch": 0.4484142466975085, + "grad_norm": 0.5271584391593933, + "learning_rate": 5.952680428945812e-05, + "loss": 1.5926, + "step": 8045 + }, + { + "epoch": 0.44846998495067164, + "grad_norm": 0.5615208148956299, + "learning_rate": 5.951806302602725e-05, + "loss": 1.6805, + "step": 8046 + }, + { + "epoch": 0.44852572320383477, + "grad_norm": 0.5467960834503174, + "learning_rate": 5.950932146073893e-05, + "loss": 1.6863, + "step": 8047 + }, + { + "epoch": 0.44858146145699795, + "grad_norm": 0.5716736912727356, + "learning_rate": 5.950057959387038e-05, + "loss": 1.695, + "step": 8048 + }, + { + "epoch": 0.4486371997101611, + "grad_norm": 0.5174785852432251, + "learning_rate": 5.9491837425698816e-05, + "loss": 1.3978, + "step": 8049 + }, + { + "epoch": 0.4486929379633242, + "grad_norm": 0.5112467408180237, + "learning_rate": 5.948309495650153e-05, + "loss": 1.3862, + "step": 8050 + }, + { + "epoch": 0.4487486762164874, + "grad_norm": 0.6070237755775452, + "learning_rate": 5.947435218655576e-05, + "loss": 1.744, + "step": 8051 + }, + { + "epoch": 0.4488044144696505, + "grad_norm": 0.5886159539222717, + "learning_rate": 5.946560911613877e-05, + "loss": 1.9782, + "step": 8052 + }, + { + "epoch": 0.44886015272281365, + "grad_norm": 0.6077089309692383, + "learning_rate": 5.945686574552785e-05, + "loss": 1.6861, + "step": 8053 + }, + { + "epoch": 0.44891589097597684, + "grad_norm": 0.5767019391059875, + "learning_rate": 5.944812207500029e-05, + "loss": 1.8577, + "step": 8054 + }, + { + "epoch": 0.44897162922913997, + "grad_norm": 0.5735483765602112, + "learning_rate": 5.943937810483338e-05, + "loss": 1.8143, + "step": 8055 + }, + { + "epoch": 0.4490273674823031, + "grad_norm": 0.5384686589241028, + "learning_rate": 5.943063383530444e-05, + "loss": 1.7183, + "step": 8056 + }, + { + "epoch": 0.4490831057354662, + "grad_norm": 0.5415961146354675, + "learning_rate": 5.942188926669077e-05, + "loss": 1.5619, + "step": 8057 + }, + { + "epoch": 0.4491388439886294, + "grad_norm": 0.5548281669616699, + "learning_rate": 5.941314439926969e-05, + "loss": 1.8049, + "step": 8058 + }, + { + "epoch": 0.44919458224179254, + "grad_norm": 0.5731210112571716, + "learning_rate": 5.940439923331857e-05, + "loss": 1.9301, + "step": 8059 + }, + { + "epoch": 0.44925032049495567, + "grad_norm": 0.5715717673301697, + "learning_rate": 5.939565376911475e-05, + "loss": 1.6145, + "step": 8060 + }, + { + "epoch": 0.44930605874811885, + "grad_norm": 0.5775079131126404, + "learning_rate": 5.938690800693556e-05, + "loss": 1.7435, + "step": 8061 + }, + { + "epoch": 0.449361797001282, + "grad_norm": 0.5366044044494629, + "learning_rate": 5.937816194705838e-05, + "loss": 1.7497, + "step": 8062 + }, + { + "epoch": 0.4494175352544451, + "grad_norm": 0.5498981475830078, + "learning_rate": 5.936941558976058e-05, + "loss": 1.6565, + "step": 8063 + }, + { + "epoch": 0.4494732735076083, + "grad_norm": 0.541826605796814, + "learning_rate": 5.936066893531954e-05, + "loss": 1.6147, + "step": 8064 + }, + { + "epoch": 0.4495290117607714, + "grad_norm": 0.5456510186195374, + "learning_rate": 5.9351921984012657e-05, + "loss": 1.652, + "step": 8065 + }, + { + "epoch": 0.44958475001393455, + "grad_norm": 0.5831677317619324, + "learning_rate": 5.934317473611734e-05, + "loss": 1.7302, + "step": 8066 + }, + { + "epoch": 0.4496404882670977, + "grad_norm": 0.55061274766922, + "learning_rate": 5.9334427191911e-05, + "loss": 1.6976, + "step": 8067 + }, + { + "epoch": 0.44969622652026087, + "grad_norm": 0.5210010409355164, + "learning_rate": 5.932567935167104e-05, + "loss": 1.5901, + "step": 8068 + }, + { + "epoch": 0.449751964773424, + "grad_norm": 0.5638371706008911, + "learning_rate": 5.931693121567492e-05, + "loss": 1.7005, + "step": 8069 + }, + { + "epoch": 0.4498077030265871, + "grad_norm": 0.5460227131843567, + "learning_rate": 5.930818278420005e-05, + "loss": 1.8827, + "step": 8070 + }, + { + "epoch": 0.4498634412797503, + "grad_norm": 0.5335036516189575, + "learning_rate": 5.9299434057523894e-05, + "loss": 1.6689, + "step": 8071 + }, + { + "epoch": 0.44991917953291344, + "grad_norm": 0.45309698581695557, + "learning_rate": 5.929068503592391e-05, + "loss": 1.1558, + "step": 8072 + }, + { + "epoch": 0.44997491778607657, + "grad_norm": 0.5678838491439819, + "learning_rate": 5.9281935719677574e-05, + "loss": 1.7916, + "step": 8073 + }, + { + "epoch": 0.45003065603923975, + "grad_norm": 0.6037769913673401, + "learning_rate": 5.927318610906234e-05, + "loss": 1.6458, + "step": 8074 + }, + { + "epoch": 0.4500863942924029, + "grad_norm": 0.5376781821250916, + "learning_rate": 5.9264436204355724e-05, + "loss": 1.754, + "step": 8075 + }, + { + "epoch": 0.450142132545566, + "grad_norm": 0.5493988394737244, + "learning_rate": 5.92556860058352e-05, + "loss": 1.7992, + "step": 8076 + }, + { + "epoch": 0.4501978707987292, + "grad_norm": 0.5373069643974304, + "learning_rate": 5.9246935513778276e-05, + "loss": 1.6756, + "step": 8077 + }, + { + "epoch": 0.4502536090518923, + "grad_norm": 0.5574460625648499, + "learning_rate": 5.923818472846248e-05, + "loss": 1.6423, + "step": 8078 + }, + { + "epoch": 0.45030934730505545, + "grad_norm": 0.5568375587463379, + "learning_rate": 5.922943365016531e-05, + "loss": 1.7708, + "step": 8079 + }, + { + "epoch": 0.4503650855582186, + "grad_norm": 0.551171064376831, + "learning_rate": 5.922068227916433e-05, + "loss": 1.7107, + "step": 8080 + }, + { + "epoch": 0.45042082381138177, + "grad_norm": 0.5870986580848694, + "learning_rate": 5.9211930615737066e-05, + "loss": 1.801, + "step": 8081 + }, + { + "epoch": 0.4504765620645449, + "grad_norm": 0.5700268745422363, + "learning_rate": 5.920317866016108e-05, + "loss": 1.6317, + "step": 8082 + }, + { + "epoch": 0.450532300317708, + "grad_norm": 0.5469490885734558, + "learning_rate": 5.919442641271391e-05, + "loss": 1.6841, + "step": 8083 + }, + { + "epoch": 0.4505880385708712, + "grad_norm": 0.5380752682685852, + "learning_rate": 5.9185673873673154e-05, + "loss": 1.3761, + "step": 8084 + }, + { + "epoch": 0.45064377682403434, + "grad_norm": 0.6156383156776428, + "learning_rate": 5.917692104331637e-05, + "loss": 1.9012, + "step": 8085 + }, + { + "epoch": 0.45069951507719747, + "grad_norm": 0.6044989824295044, + "learning_rate": 5.916816792192116e-05, + "loss": 1.8825, + "step": 8086 + }, + { + "epoch": 0.45075525333036065, + "grad_norm": 0.5541858673095703, + "learning_rate": 5.915941450976512e-05, + "loss": 1.6097, + "step": 8087 + }, + { + "epoch": 0.4508109915835238, + "grad_norm": 0.5468337535858154, + "learning_rate": 5.9150660807125844e-05, + "loss": 1.7299, + "step": 8088 + }, + { + "epoch": 0.4508667298366869, + "grad_norm": 0.6255477070808411, + "learning_rate": 5.9141906814280975e-05, + "loss": 1.818, + "step": 8089 + }, + { + "epoch": 0.45092246808985004, + "grad_norm": 0.5574450492858887, + "learning_rate": 5.9133152531508106e-05, + "loss": 1.8804, + "step": 8090 + }, + { + "epoch": 0.4509782063430132, + "grad_norm": 0.5240482091903687, + "learning_rate": 5.91243979590849e-05, + "loss": 1.6162, + "step": 8091 + }, + { + "epoch": 0.45103394459617635, + "grad_norm": 0.5322662591934204, + "learning_rate": 5.911564309728899e-05, + "loss": 1.7833, + "step": 8092 + }, + { + "epoch": 0.4510896828493395, + "grad_norm": 0.5365003347396851, + "learning_rate": 5.910688794639803e-05, + "loss": 1.5982, + "step": 8093 + }, + { + "epoch": 0.45114542110250266, + "grad_norm": 0.5948169827461243, + "learning_rate": 5.909813250668967e-05, + "loss": 1.8386, + "step": 8094 + }, + { + "epoch": 0.4512011593556658, + "grad_norm": 0.5501197576522827, + "learning_rate": 5.9089376778441606e-05, + "loss": 1.748, + "step": 8095 + }, + { + "epoch": 0.4512568976088289, + "grad_norm": 0.5238162875175476, + "learning_rate": 5.908062076193149e-05, + "loss": 1.4871, + "step": 8096 + }, + { + "epoch": 0.4513126358619921, + "grad_norm": 0.515355110168457, + "learning_rate": 5.907186445743704e-05, + "loss": 1.4985, + "step": 8097 + }, + { + "epoch": 0.45136837411515524, + "grad_norm": 0.5451371073722839, + "learning_rate": 5.9063107865235936e-05, + "loss": 1.7953, + "step": 8098 + }, + { + "epoch": 0.45142411236831836, + "grad_norm": 0.5602155327796936, + "learning_rate": 5.90543509856059e-05, + "loss": 1.4848, + "step": 8099 + }, + { + "epoch": 0.45147985062148155, + "grad_norm": 0.6136230826377869, + "learning_rate": 5.904559381882463e-05, + "loss": 1.8602, + "step": 8100 + }, + { + "epoch": 0.4515355888746447, + "grad_norm": 0.5416921973228455, + "learning_rate": 5.9036836365169865e-05, + "loss": 1.7242, + "step": 8101 + }, + { + "epoch": 0.4515913271278078, + "grad_norm": 0.5299700498580933, + "learning_rate": 5.9028078624919344e-05, + "loss": 1.4976, + "step": 8102 + }, + { + "epoch": 0.45164706538097094, + "grad_norm": 0.5295999050140381, + "learning_rate": 5.901932059835081e-05, + "loss": 1.667, + "step": 8103 + }, + { + "epoch": 0.4517028036341341, + "grad_norm": 0.5291856527328491, + "learning_rate": 5.9010562285742e-05, + "loss": 1.5909, + "step": 8104 + }, + { + "epoch": 0.45175854188729725, + "grad_norm": 0.5456459522247314, + "learning_rate": 5.9001803687370696e-05, + "loss": 1.6947, + "step": 8105 + }, + { + "epoch": 0.4518142801404604, + "grad_norm": 0.534061074256897, + "learning_rate": 5.8993044803514674e-05, + "loss": 1.4796, + "step": 8106 + }, + { + "epoch": 0.45187001839362356, + "grad_norm": 0.5795206427574158, + "learning_rate": 5.8984285634451695e-05, + "loss": 1.8176, + "step": 8107 + }, + { + "epoch": 0.4519257566467867, + "grad_norm": 0.5638490915298462, + "learning_rate": 5.897552618045956e-05, + "loss": 1.6067, + "step": 8108 + }, + { + "epoch": 0.4519814948999498, + "grad_norm": 0.5725950002670288, + "learning_rate": 5.896676644181607e-05, + "loss": 1.6761, + "step": 8109 + }, + { + "epoch": 0.452037233153113, + "grad_norm": 0.6189979314804077, + "learning_rate": 5.8958006418799005e-05, + "loss": 1.8323, + "step": 8110 + }, + { + "epoch": 0.45209297140627613, + "grad_norm": 0.550565779209137, + "learning_rate": 5.894924611168622e-05, + "loss": 1.865, + "step": 8111 + }, + { + "epoch": 0.45214870965943926, + "grad_norm": 0.563420832157135, + "learning_rate": 5.894048552075554e-05, + "loss": 1.8, + "step": 8112 + }, + { + "epoch": 0.4522044479126024, + "grad_norm": 0.5111345052719116, + "learning_rate": 5.893172464628477e-05, + "loss": 1.4806, + "step": 8113 + }, + { + "epoch": 0.4522601861657656, + "grad_norm": 0.566088855266571, + "learning_rate": 5.8922963488551775e-05, + "loss": 1.7427, + "step": 8114 + }, + { + "epoch": 0.4523159244189287, + "grad_norm": 0.5696318745613098, + "learning_rate": 5.89142020478344e-05, + "loss": 1.8576, + "step": 8115 + }, + { + "epoch": 0.45237166267209183, + "grad_norm": 0.5730637907981873, + "learning_rate": 5.890544032441051e-05, + "loss": 1.6966, + "step": 8116 + }, + { + "epoch": 0.452427400925255, + "grad_norm": 0.5427675247192383, + "learning_rate": 5.889667831855797e-05, + "loss": 1.639, + "step": 8117 + }, + { + "epoch": 0.45248313917841815, + "grad_norm": 0.6031304001808167, + "learning_rate": 5.888791603055467e-05, + "loss": 1.7707, + "step": 8118 + }, + { + "epoch": 0.4525388774315813, + "grad_norm": 0.5573417544364929, + "learning_rate": 5.887915346067851e-05, + "loss": 1.8751, + "step": 8119 + }, + { + "epoch": 0.45259461568474446, + "grad_norm": 0.5398233532905579, + "learning_rate": 5.8870390609207337e-05, + "loss": 1.5854, + "step": 8120 + }, + { + "epoch": 0.4526503539379076, + "grad_norm": 0.554905354976654, + "learning_rate": 5.886162747641912e-05, + "loss": 1.6138, + "step": 8121 + }, + { + "epoch": 0.4527060921910707, + "grad_norm": 0.5116898417472839, + "learning_rate": 5.885286406259174e-05, + "loss": 1.4997, + "step": 8122 + }, + { + "epoch": 0.4527618304442339, + "grad_norm": 0.5095398426055908, + "learning_rate": 5.884410036800312e-05, + "loss": 1.372, + "step": 8123 + }, + { + "epoch": 0.45281756869739703, + "grad_norm": 0.5345844626426697, + "learning_rate": 5.883533639293119e-05, + "loss": 1.7398, + "step": 8124 + }, + { + "epoch": 0.45287330695056016, + "grad_norm": 0.5889625549316406, + "learning_rate": 5.882657213765393e-05, + "loss": 1.8826, + "step": 8125 + }, + { + "epoch": 0.4529290452037233, + "grad_norm": 0.5907882452011108, + "learning_rate": 5.881780760244926e-05, + "loss": 1.8187, + "step": 8126 + }, + { + "epoch": 0.4529847834568865, + "grad_norm": 0.5326589941978455, + "learning_rate": 5.8809042787595135e-05, + "loss": 1.5317, + "step": 8127 + }, + { + "epoch": 0.4530405217100496, + "grad_norm": 0.6067203283309937, + "learning_rate": 5.880027769336953e-05, + "loss": 1.9912, + "step": 8128 + }, + { + "epoch": 0.45309625996321273, + "grad_norm": 0.5273611545562744, + "learning_rate": 5.879151232005044e-05, + "loss": 1.7771, + "step": 8129 + }, + { + "epoch": 0.4531519982163759, + "grad_norm": 0.5791671872138977, + "learning_rate": 5.8782746667915824e-05, + "loss": 1.9728, + "step": 8130 + }, + { + "epoch": 0.45320773646953905, + "grad_norm": 0.5748934149742126, + "learning_rate": 5.877398073724368e-05, + "loss": 1.7932, + "step": 8131 + }, + { + "epoch": 0.4532634747227022, + "grad_norm": 0.5750080943107605, + "learning_rate": 5.876521452831205e-05, + "loss": 1.6562, + "step": 8132 + }, + { + "epoch": 0.45331921297586536, + "grad_norm": 0.5455517172813416, + "learning_rate": 5.87564480413989e-05, + "loss": 1.6491, + "step": 8133 + }, + { + "epoch": 0.4533749512290285, + "grad_norm": 0.5786875486373901, + "learning_rate": 5.8747681276782294e-05, + "loss": 1.6799, + "step": 8134 + }, + { + "epoch": 0.4534306894821916, + "grad_norm": 0.5193260908126831, + "learning_rate": 5.8738914234740225e-05, + "loss": 1.7299, + "step": 8135 + }, + { + "epoch": 0.45348642773535475, + "grad_norm": 0.5477581024169922, + "learning_rate": 5.8730146915550745e-05, + "loss": 1.529, + "step": 8136 + }, + { + "epoch": 0.45354216598851793, + "grad_norm": 0.5622334480285645, + "learning_rate": 5.872137931949191e-05, + "loss": 1.7301, + "step": 8137 + }, + { + "epoch": 0.45359790424168106, + "grad_norm": 0.5410364866256714, + "learning_rate": 5.871261144684177e-05, + "loss": 1.7159, + "step": 8138 + }, + { + "epoch": 0.4536536424948442, + "grad_norm": 0.5440908670425415, + "learning_rate": 5.870384329787839e-05, + "loss": 1.6208, + "step": 8139 + }, + { + "epoch": 0.4537093807480074, + "grad_norm": 0.5730171799659729, + "learning_rate": 5.8695074872879855e-05, + "loss": 1.7554, + "step": 8140 + }, + { + "epoch": 0.4537651190011705, + "grad_norm": 0.5274659991264343, + "learning_rate": 5.868630617212424e-05, + "loss": 1.6493, + "step": 8141 + }, + { + "epoch": 0.45382085725433363, + "grad_norm": 0.5639094114303589, + "learning_rate": 5.867753719588963e-05, + "loss": 1.8717, + "step": 8142 + }, + { + "epoch": 0.4538765955074968, + "grad_norm": 0.5402084589004517, + "learning_rate": 5.8668767944454136e-05, + "loss": 1.7959, + "step": 8143 + }, + { + "epoch": 0.45393233376065995, + "grad_norm": 0.5999549627304077, + "learning_rate": 5.865999841809586e-05, + "loss": 1.7492, + "step": 8144 + }, + { + "epoch": 0.4539880720138231, + "grad_norm": 0.5832345485687256, + "learning_rate": 5.865122861709295e-05, + "loss": 1.7432, + "step": 8145 + }, + { + "epoch": 0.45404381026698626, + "grad_norm": 0.500333309173584, + "learning_rate": 5.864245854172349e-05, + "loss": 1.5536, + "step": 8146 + }, + { + "epoch": 0.4540995485201494, + "grad_norm": 0.5283179879188538, + "learning_rate": 5.8633688192265645e-05, + "loss": 1.5528, + "step": 8147 + }, + { + "epoch": 0.4541552867733125, + "grad_norm": 0.5074849128723145, + "learning_rate": 5.862491756899753e-05, + "loss": 1.5251, + "step": 8148 + }, + { + "epoch": 0.45421102502647565, + "grad_norm": 0.5706311464309692, + "learning_rate": 5.8616146672197326e-05, + "loss": 1.5709, + "step": 8149 + }, + { + "epoch": 0.45426676327963883, + "grad_norm": 0.570326566696167, + "learning_rate": 5.8607375502143183e-05, + "loss": 1.6585, + "step": 8150 + }, + { + "epoch": 0.45432250153280196, + "grad_norm": 0.7040314674377441, + "learning_rate": 5.859860405911328e-05, + "loss": 2.0239, + "step": 8151 + }, + { + "epoch": 0.4543782397859651, + "grad_norm": 0.5602174401283264, + "learning_rate": 5.858983234338579e-05, + "loss": 1.5565, + "step": 8152 + }, + { + "epoch": 0.4544339780391283, + "grad_norm": 0.596564531326294, + "learning_rate": 5.858106035523888e-05, + "loss": 1.8482, + "step": 8153 + }, + { + "epoch": 0.4544897162922914, + "grad_norm": 0.5571820735931396, + "learning_rate": 5.85722880949508e-05, + "loss": 1.6401, + "step": 8154 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.5759769678115845, + "learning_rate": 5.8563515562799695e-05, + "loss": 1.8876, + "step": 8155 + }, + { + "epoch": 0.4546011927986177, + "grad_norm": 0.526823103427887, + "learning_rate": 5.855474275906381e-05, + "loss": 1.4215, + "step": 8156 + }, + { + "epoch": 0.45465693105178084, + "grad_norm": 0.5801699161529541, + "learning_rate": 5.854596968402136e-05, + "loss": 1.8225, + "step": 8157 + }, + { + "epoch": 0.454712669304944, + "grad_norm": 0.548812747001648, + "learning_rate": 5.8537196337950596e-05, + "loss": 1.6582, + "step": 8158 + }, + { + "epoch": 0.4547684075581071, + "grad_norm": 0.5647279024124146, + "learning_rate": 5.8528422721129726e-05, + "loss": 1.6121, + "step": 8159 + }, + { + "epoch": 0.4548241458112703, + "grad_norm": 0.5501880645751953, + "learning_rate": 5.8519648833837013e-05, + "loss": 1.5704, + "step": 8160 + }, + { + "epoch": 0.4548798840644334, + "grad_norm": 0.5714605450630188, + "learning_rate": 5.851087467635071e-05, + "loss": 1.918, + "step": 8161 + }, + { + "epoch": 0.45493562231759654, + "grad_norm": 0.5872429609298706, + "learning_rate": 5.8502100248949085e-05, + "loss": 1.7381, + "step": 8162 + }, + { + "epoch": 0.45499136057075973, + "grad_norm": 0.5113133788108826, + "learning_rate": 5.8493325551910405e-05, + "loss": 1.6602, + "step": 8163 + }, + { + "epoch": 0.45504709882392286, + "grad_norm": 0.5724974274635315, + "learning_rate": 5.848455058551298e-05, + "loss": 1.7762, + "step": 8164 + }, + { + "epoch": 0.455102837077086, + "grad_norm": 0.5925339460372925, + "learning_rate": 5.8475775350035056e-05, + "loss": 1.8456, + "step": 8165 + }, + { + "epoch": 0.45515857533024917, + "grad_norm": 0.567402720451355, + "learning_rate": 5.846699984575497e-05, + "loss": 1.6512, + "step": 8166 + }, + { + "epoch": 0.4552143135834123, + "grad_norm": 0.53789883852005, + "learning_rate": 5.8458224072951005e-05, + "loss": 1.675, + "step": 8167 + }, + { + "epoch": 0.45527005183657543, + "grad_norm": 0.563400149345398, + "learning_rate": 5.844944803190149e-05, + "loss": 1.4973, + "step": 8168 + }, + { + "epoch": 0.4553257900897386, + "grad_norm": 0.5786770582199097, + "learning_rate": 5.844067172288474e-05, + "loss": 1.6223, + "step": 8169 + }, + { + "epoch": 0.45538152834290174, + "grad_norm": 0.5910102725028992, + "learning_rate": 5.843189514617911e-05, + "loss": 1.7822, + "step": 8170 + }, + { + "epoch": 0.45543726659606487, + "grad_norm": 0.5599364638328552, + "learning_rate": 5.8423118302062915e-05, + "loss": 1.7511, + "step": 8171 + }, + { + "epoch": 0.455493004849228, + "grad_norm": 0.5284358263015747, + "learning_rate": 5.841434119081453e-05, + "loss": 1.6494, + "step": 8172 + }, + { + "epoch": 0.4555487431023912, + "grad_norm": 0.5970794558525085, + "learning_rate": 5.840556381271229e-05, + "loss": 1.6952, + "step": 8173 + }, + { + "epoch": 0.4556044813555543, + "grad_norm": 0.5448065400123596, + "learning_rate": 5.839678616803458e-05, + "loss": 1.5907, + "step": 8174 + }, + { + "epoch": 0.45566021960871744, + "grad_norm": 0.5598198771476746, + "learning_rate": 5.838800825705977e-05, + "loss": 1.6862, + "step": 8175 + }, + { + "epoch": 0.4557159578618806, + "grad_norm": 0.5819631218910217, + "learning_rate": 5.837923008006623e-05, + "loss": 1.7354, + "step": 8176 + }, + { + "epoch": 0.45577169611504376, + "grad_norm": 0.5947074890136719, + "learning_rate": 5.837045163733239e-05, + "loss": 1.7971, + "step": 8177 + }, + { + "epoch": 0.4558274343682069, + "grad_norm": 0.541515588760376, + "learning_rate": 5.8361672929136614e-05, + "loss": 1.4939, + "step": 8178 + }, + { + "epoch": 0.45588317262137007, + "grad_norm": 0.670753002166748, + "learning_rate": 5.835289395575731e-05, + "loss": 1.8816, + "step": 8179 + }, + { + "epoch": 0.4559389108745332, + "grad_norm": 0.5665016174316406, + "learning_rate": 5.8344114717472943e-05, + "loss": 1.6907, + "step": 8180 + }, + { + "epoch": 0.4559946491276963, + "grad_norm": 0.5885823369026184, + "learning_rate": 5.833533521456188e-05, + "loss": 1.6905, + "step": 8181 + }, + { + "epoch": 0.45605038738085946, + "grad_norm": 0.5672965049743652, + "learning_rate": 5.832655544730259e-05, + "loss": 1.5996, + "step": 8182 + }, + { + "epoch": 0.45610612563402264, + "grad_norm": 0.5488877296447754, + "learning_rate": 5.831777541597351e-05, + "loss": 1.6316, + "step": 8183 + }, + { + "epoch": 0.45616186388718577, + "grad_norm": 0.541111409664154, + "learning_rate": 5.8308995120853096e-05, + "loss": 1.7246, + "step": 8184 + }, + { + "epoch": 0.4562176021403489, + "grad_norm": 0.5794996619224548, + "learning_rate": 5.830021456221979e-05, + "loss": 1.8438, + "step": 8185 + }, + { + "epoch": 0.4562733403935121, + "grad_norm": 0.4965246021747589, + "learning_rate": 5.829143374035209e-05, + "loss": 1.2569, + "step": 8186 + }, + { + "epoch": 0.4563290786466752, + "grad_norm": 0.5464833974838257, + "learning_rate": 5.8282652655528426e-05, + "loss": 1.6355, + "step": 8187 + }, + { + "epoch": 0.45638481689983834, + "grad_norm": 0.617215096950531, + "learning_rate": 5.827387130802733e-05, + "loss": 1.7473, + "step": 8188 + }, + { + "epoch": 0.4564405551530015, + "grad_norm": 0.6064026355743408, + "learning_rate": 5.826508969812726e-05, + "loss": 1.817, + "step": 8189 + }, + { + "epoch": 0.45649629340616465, + "grad_norm": 0.6004077792167664, + "learning_rate": 5.825630782610676e-05, + "loss": 1.8728, + "step": 8190 + }, + { + "epoch": 0.4565520316593278, + "grad_norm": 0.6301288604736328, + "learning_rate": 5.82475256922443e-05, + "loss": 1.8616, + "step": 8191 + }, + { + "epoch": 0.45660776991249097, + "grad_norm": 0.540440022945404, + "learning_rate": 5.8238743296818396e-05, + "loss": 1.7224, + "step": 8192 + }, + { + "epoch": 0.4566635081656541, + "grad_norm": 0.5390138626098633, + "learning_rate": 5.8229960640107617e-05, + "loss": 1.556, + "step": 8193 + }, + { + "epoch": 0.4567192464188172, + "grad_norm": 0.5261824131011963, + "learning_rate": 5.822117772239045e-05, + "loss": 1.6086, + "step": 8194 + }, + { + "epoch": 0.45677498467198036, + "grad_norm": 0.543070375919342, + "learning_rate": 5.821239454394547e-05, + "loss": 1.5987, + "step": 8195 + }, + { + "epoch": 0.45683072292514354, + "grad_norm": 0.6048296689987183, + "learning_rate": 5.8203611105051204e-05, + "loss": 1.7936, + "step": 8196 + }, + { + "epoch": 0.45688646117830667, + "grad_norm": 0.5308238863945007, + "learning_rate": 5.819482740598624e-05, + "loss": 1.5304, + "step": 8197 + }, + { + "epoch": 0.4569421994314698, + "grad_norm": 0.5806917548179626, + "learning_rate": 5.8186043447029125e-05, + "loss": 1.6869, + "step": 8198 + }, + { + "epoch": 0.456997937684633, + "grad_norm": 0.5387137532234192, + "learning_rate": 5.8177259228458444e-05, + "loss": 1.7673, + "step": 8199 + }, + { + "epoch": 0.4570536759377961, + "grad_norm": 0.5830815434455872, + "learning_rate": 5.816847475055277e-05, + "loss": 1.9119, + "step": 8200 + }, + { + "epoch": 0.45710941419095924, + "grad_norm": 0.5564570426940918, + "learning_rate": 5.8159690013590695e-05, + "loss": 1.5385, + "step": 8201 + }, + { + "epoch": 0.4571651524441224, + "grad_norm": 0.5688846707344055, + "learning_rate": 5.815090501785083e-05, + "loss": 1.5954, + "step": 8202 + }, + { + "epoch": 0.45722089069728555, + "grad_norm": 0.6317092776298523, + "learning_rate": 5.814211976361179e-05, + "loss": 1.9886, + "step": 8203 + }, + { + "epoch": 0.4572766289504487, + "grad_norm": 0.5649227499961853, + "learning_rate": 5.813333425115218e-05, + "loss": 1.6259, + "step": 8204 + }, + { + "epoch": 0.4573323672036118, + "grad_norm": 0.5452385544776917, + "learning_rate": 5.812454848075063e-05, + "loss": 1.7129, + "step": 8205 + }, + { + "epoch": 0.457388105456775, + "grad_norm": 0.5575756430625916, + "learning_rate": 5.8115762452685774e-05, + "loss": 1.7543, + "step": 8206 + }, + { + "epoch": 0.4574438437099381, + "grad_norm": 0.5120208263397217, + "learning_rate": 5.810697616723624e-05, + "loss": 1.5619, + "step": 8207 + }, + { + "epoch": 0.45749958196310125, + "grad_norm": 0.5111353397369385, + "learning_rate": 5.809818962468069e-05, + "loss": 1.5594, + "step": 8208 + }, + { + "epoch": 0.45755532021626444, + "grad_norm": 0.5274066925048828, + "learning_rate": 5.8089402825297776e-05, + "loss": 1.5727, + "step": 8209 + }, + { + "epoch": 0.45761105846942757, + "grad_norm": 0.531512439250946, + "learning_rate": 5.80806157693662e-05, + "loss": 1.6845, + "step": 8210 + }, + { + "epoch": 0.4576667967225907, + "grad_norm": 0.587890088558197, + "learning_rate": 5.807182845716458e-05, + "loss": 1.8239, + "step": 8211 + }, + { + "epoch": 0.4577225349757539, + "grad_norm": 0.543900191783905, + "learning_rate": 5.8063040888971635e-05, + "loss": 1.9671, + "step": 8212 + }, + { + "epoch": 0.457778273228917, + "grad_norm": 0.5269332528114319, + "learning_rate": 5.8054253065066024e-05, + "loss": 1.5801, + "step": 8213 + }, + { + "epoch": 0.45783401148208014, + "grad_norm": 0.5568074584007263, + "learning_rate": 5.8045464985726474e-05, + "loss": 1.5843, + "step": 8214 + }, + { + "epoch": 0.4578897497352433, + "grad_norm": 0.5887969136238098, + "learning_rate": 5.803667665123168e-05, + "loss": 1.9532, + "step": 8215 + }, + { + "epoch": 0.45794548798840645, + "grad_norm": 0.6071587204933167, + "learning_rate": 5.802788806186038e-05, + "loss": 1.9501, + "step": 8216 + }, + { + "epoch": 0.4580012262415696, + "grad_norm": 0.5481032133102417, + "learning_rate": 5.801909921789126e-05, + "loss": 1.7435, + "step": 8217 + }, + { + "epoch": 0.4580569644947327, + "grad_norm": 0.6313177347183228, + "learning_rate": 5.801031011960306e-05, + "loss": 1.928, + "step": 8218 + }, + { + "epoch": 0.4581127027478959, + "grad_norm": 0.5789720416069031, + "learning_rate": 5.800152076727454e-05, + "loss": 1.8, + "step": 8219 + }, + { + "epoch": 0.458168441001059, + "grad_norm": 0.5438299775123596, + "learning_rate": 5.799273116118443e-05, + "loss": 1.6805, + "step": 8220 + }, + { + "epoch": 0.45822417925422215, + "grad_norm": 0.5296357870101929, + "learning_rate": 5.798394130161149e-05, + "loss": 1.4218, + "step": 8221 + }, + { + "epoch": 0.45827991750738534, + "grad_norm": 0.6217812895774841, + "learning_rate": 5.7975151188834475e-05, + "loss": 1.7633, + "step": 8222 + }, + { + "epoch": 0.45833565576054847, + "grad_norm": 0.6416480541229248, + "learning_rate": 5.796636082313217e-05, + "loss": 2.0147, + "step": 8223 + }, + { + "epoch": 0.4583913940137116, + "grad_norm": 0.5263529419898987, + "learning_rate": 5.795757020478334e-05, + "loss": 1.5335, + "step": 8224 + }, + { + "epoch": 0.4584471322668748, + "grad_norm": 0.565466046333313, + "learning_rate": 5.794877933406679e-05, + "loss": 1.778, + "step": 8225 + }, + { + "epoch": 0.4585028705200379, + "grad_norm": 0.5382056832313538, + "learning_rate": 5.79399882112613e-05, + "loss": 1.678, + "step": 8226 + }, + { + "epoch": 0.45855860877320104, + "grad_norm": 0.5097582340240479, + "learning_rate": 5.7931196836645675e-05, + "loss": 1.5224, + "step": 8227 + }, + { + "epoch": 0.45861434702636417, + "grad_norm": 0.5619562268257141, + "learning_rate": 5.792240521049872e-05, + "loss": 1.9743, + "step": 8228 + }, + { + "epoch": 0.45867008527952735, + "grad_norm": 0.57401442527771, + "learning_rate": 5.791361333309926e-05, + "loss": 1.6526, + "step": 8229 + }, + { + "epoch": 0.4587258235326905, + "grad_norm": 0.557773232460022, + "learning_rate": 5.790482120472615e-05, + "loss": 1.7427, + "step": 8230 + }, + { + "epoch": 0.4587815617858536, + "grad_norm": 0.5370197296142578, + "learning_rate": 5.789602882565818e-05, + "loss": 1.5028, + "step": 8231 + }, + { + "epoch": 0.4588373000390168, + "grad_norm": 0.559916079044342, + "learning_rate": 5.788723619617422e-05, + "loss": 1.6115, + "step": 8232 + }, + { + "epoch": 0.4588930382921799, + "grad_norm": 0.5461910367012024, + "learning_rate": 5.787844331655311e-05, + "loss": 1.5789, + "step": 8233 + }, + { + "epoch": 0.45894877654534305, + "grad_norm": 0.5319302082061768, + "learning_rate": 5.786965018707371e-05, + "loss": 1.66, + "step": 8234 + }, + { + "epoch": 0.45900451479850624, + "grad_norm": 0.5757958292961121, + "learning_rate": 5.786085680801488e-05, + "loss": 1.9192, + "step": 8235 + }, + { + "epoch": 0.45906025305166936, + "grad_norm": 0.523041844367981, + "learning_rate": 5.785206317965553e-05, + "loss": 1.5435, + "step": 8236 + }, + { + "epoch": 0.4591159913048325, + "grad_norm": 0.5196270942687988, + "learning_rate": 5.7843269302274506e-05, + "loss": 1.2152, + "step": 8237 + }, + { + "epoch": 0.4591717295579957, + "grad_norm": 0.5284752249717712, + "learning_rate": 5.7834475176150715e-05, + "loss": 1.6407, + "step": 8238 + }, + { + "epoch": 0.4592274678111588, + "grad_norm": 0.5639576315879822, + "learning_rate": 5.782568080156303e-05, + "loss": 1.8297, + "step": 8239 + }, + { + "epoch": 0.45928320606432194, + "grad_norm": 0.5723278522491455, + "learning_rate": 5.781688617879039e-05, + "loss": 1.7981, + "step": 8240 + }, + { + "epoch": 0.45933894431748507, + "grad_norm": 0.5638182759284973, + "learning_rate": 5.780809130811169e-05, + "loss": 1.6244, + "step": 8241 + }, + { + "epoch": 0.45939468257064825, + "grad_norm": 0.5704604983329773, + "learning_rate": 5.779929618980586e-05, + "loss": 1.6348, + "step": 8242 + }, + { + "epoch": 0.4594504208238114, + "grad_norm": 0.5768876671791077, + "learning_rate": 5.779050082415184e-05, + "loss": 1.7342, + "step": 8243 + }, + { + "epoch": 0.4595061590769745, + "grad_norm": 0.5308094620704651, + "learning_rate": 5.778170521142854e-05, + "loss": 1.6838, + "step": 8244 + }, + { + "epoch": 0.4595618973301377, + "grad_norm": 0.6009156703948975, + "learning_rate": 5.777290935191493e-05, + "loss": 1.72, + "step": 8245 + }, + { + "epoch": 0.4596176355833008, + "grad_norm": 0.5695474743843079, + "learning_rate": 5.776411324588995e-05, + "loss": 1.6783, + "step": 8246 + }, + { + "epoch": 0.45967337383646395, + "grad_norm": 0.5541953444480896, + "learning_rate": 5.775531689363256e-05, + "loss": 1.5248, + "step": 8247 + }, + { + "epoch": 0.45972911208962713, + "grad_norm": 0.5543676614761353, + "learning_rate": 5.7746520295421736e-05, + "loss": 1.5673, + "step": 8248 + }, + { + "epoch": 0.45978485034279026, + "grad_norm": 0.6300926804542542, + "learning_rate": 5.773772345153648e-05, + "loss": 1.9275, + "step": 8249 + }, + { + "epoch": 0.4598405885959534, + "grad_norm": 0.580083429813385, + "learning_rate": 5.772892636225572e-05, + "loss": 1.583, + "step": 8250 + }, + { + "epoch": 0.4598963268491165, + "grad_norm": 0.6072207689285278, + "learning_rate": 5.7720129027858496e-05, + "loss": 1.6752, + "step": 8251 + }, + { + "epoch": 0.4599520651022797, + "grad_norm": 0.575436532497406, + "learning_rate": 5.771133144862377e-05, + "loss": 1.5191, + "step": 8252 + }, + { + "epoch": 0.46000780335544283, + "grad_norm": 0.5946778655052185, + "learning_rate": 5.770253362483059e-05, + "loss": 1.7338, + "step": 8253 + }, + { + "epoch": 0.46006354160860596, + "grad_norm": 0.5782346129417419, + "learning_rate": 5.769373555675794e-05, + "loss": 1.9825, + "step": 8254 + }, + { + "epoch": 0.46011927986176915, + "grad_norm": 0.6065311431884766, + "learning_rate": 5.7684937244684856e-05, + "loss": 1.8879, + "step": 8255 + }, + { + "epoch": 0.4601750181149323, + "grad_norm": 0.5789337158203125, + "learning_rate": 5.767613868889038e-05, + "loss": 1.5408, + "step": 8256 + }, + { + "epoch": 0.4602307563680954, + "grad_norm": 0.5640459060668945, + "learning_rate": 5.766733988965354e-05, + "loss": 1.7434, + "step": 8257 + }, + { + "epoch": 0.4602864946212586, + "grad_norm": 0.5351431965827942, + "learning_rate": 5.765854084725337e-05, + "loss": 1.7586, + "step": 8258 + }, + { + "epoch": 0.4603422328744217, + "grad_norm": 0.6039308905601501, + "learning_rate": 5.764974156196895e-05, + "loss": 1.8, + "step": 8259 + }, + { + "epoch": 0.46039797112758485, + "grad_norm": 0.5545447468757629, + "learning_rate": 5.764094203407933e-05, + "loss": 1.5867, + "step": 8260 + }, + { + "epoch": 0.46045370938074803, + "grad_norm": 0.5933241248130798, + "learning_rate": 5.763214226386355e-05, + "loss": 1.8117, + "step": 8261 + }, + { + "epoch": 0.46050944763391116, + "grad_norm": 0.6593655943870544, + "learning_rate": 5.7623342251600745e-05, + "loss": 1.6466, + "step": 8262 + }, + { + "epoch": 0.4605651858870743, + "grad_norm": 0.5840887427330017, + "learning_rate": 5.761454199756996e-05, + "loss": 1.6135, + "step": 8263 + }, + { + "epoch": 0.4606209241402374, + "grad_norm": 0.5381019711494446, + "learning_rate": 5.7605741502050314e-05, + "loss": 1.6211, + "step": 8264 + }, + { + "epoch": 0.4606766623934006, + "grad_norm": 0.6085990071296692, + "learning_rate": 5.759694076532087e-05, + "loss": 1.795, + "step": 8265 + }, + { + "epoch": 0.46073240064656373, + "grad_norm": 0.5574647784233093, + "learning_rate": 5.758813978766077e-05, + "loss": 1.4925, + "step": 8266 + }, + { + "epoch": 0.46078813889972686, + "grad_norm": 0.6263840794563293, + "learning_rate": 5.75793385693491e-05, + "loss": 1.8677, + "step": 8267 + }, + { + "epoch": 0.46084387715289005, + "grad_norm": 0.543647289276123, + "learning_rate": 5.7570537110665026e-05, + "loss": 1.7692, + "step": 8268 + }, + { + "epoch": 0.4608996154060532, + "grad_norm": 0.6330240368843079, + "learning_rate": 5.7561735411887644e-05, + "loss": 1.8521, + "step": 8269 + }, + { + "epoch": 0.4609553536592163, + "grad_norm": 0.5961319208145142, + "learning_rate": 5.75529334732961e-05, + "loss": 1.8511, + "step": 8270 + }, + { + "epoch": 0.4610110919123795, + "grad_norm": 0.5653590559959412, + "learning_rate": 5.754413129516956e-05, + "loss": 1.6472, + "step": 8271 + }, + { + "epoch": 0.4610668301655426, + "grad_norm": 0.5134671330451965, + "learning_rate": 5.753532887778714e-05, + "loss": 1.5722, + "step": 8272 + }, + { + "epoch": 0.46112256841870575, + "grad_norm": 0.5468015074729919, + "learning_rate": 5.7526526221428036e-05, + "loss": 1.6829, + "step": 8273 + }, + { + "epoch": 0.4611783066718689, + "grad_norm": 0.5542712211608887, + "learning_rate": 5.751772332637137e-05, + "loss": 1.6583, + "step": 8274 + }, + { + "epoch": 0.46123404492503206, + "grad_norm": 0.554300844669342, + "learning_rate": 5.75089201928964e-05, + "loss": 1.7805, + "step": 8275 + }, + { + "epoch": 0.4612897831781952, + "grad_norm": 0.5648434162139893, + "learning_rate": 5.750011682128222e-05, + "loss": 1.8315, + "step": 8276 + }, + { + "epoch": 0.4613455214313583, + "grad_norm": 0.5622681975364685, + "learning_rate": 5.7491313211808095e-05, + "loss": 1.6431, + "step": 8277 + }, + { + "epoch": 0.4614012596845215, + "grad_norm": 0.5813915133476257, + "learning_rate": 5.748250936475318e-05, + "loss": 1.9023, + "step": 8278 + }, + { + "epoch": 0.46145699793768463, + "grad_norm": 0.5567924380302429, + "learning_rate": 5.747370528039668e-05, + "loss": 1.7468, + "step": 8279 + }, + { + "epoch": 0.46151273619084776, + "grad_norm": 0.5861298441886902, + "learning_rate": 5.7464900959017844e-05, + "loss": 1.7059, + "step": 8280 + }, + { + "epoch": 0.46156847444401095, + "grad_norm": 0.642804741859436, + "learning_rate": 5.745609640089585e-05, + "loss": 1.8385, + "step": 8281 + }, + { + "epoch": 0.4616242126971741, + "grad_norm": 0.5455397963523865, + "learning_rate": 5.744729160630998e-05, + "loss": 1.5585, + "step": 8282 + }, + { + "epoch": 0.4616799509503372, + "grad_norm": 0.5456379055976868, + "learning_rate": 5.743848657553943e-05, + "loss": 1.6787, + "step": 8283 + }, + { + "epoch": 0.4617356892035004, + "grad_norm": 0.6248784065246582, + "learning_rate": 5.742968130886346e-05, + "loss": 1.9457, + "step": 8284 + }, + { + "epoch": 0.4617914274566635, + "grad_norm": 0.5508323311805725, + "learning_rate": 5.74208758065613e-05, + "loss": 1.7643, + "step": 8285 + }, + { + "epoch": 0.46184716570982665, + "grad_norm": 0.5070561170578003, + "learning_rate": 5.741207006891224e-05, + "loss": 1.414, + "step": 8286 + }, + { + "epoch": 0.4619029039629898, + "grad_norm": 0.5954271554946899, + "learning_rate": 5.740326409619552e-05, + "loss": 1.7004, + "step": 8287 + }, + { + "epoch": 0.46195864221615296, + "grad_norm": 0.5585724115371704, + "learning_rate": 5.739445788869043e-05, + "loss": 1.7653, + "step": 8288 + }, + { + "epoch": 0.4620143804693161, + "grad_norm": 0.5526925325393677, + "learning_rate": 5.738565144667626e-05, + "loss": 1.7572, + "step": 8289 + }, + { + "epoch": 0.4620701187224792, + "grad_norm": 0.5708301663398743, + "learning_rate": 5.737684477043228e-05, + "loss": 1.8134, + "step": 8290 + }, + { + "epoch": 0.4621258569756424, + "grad_norm": 0.5142967104911804, + "learning_rate": 5.736803786023779e-05, + "loss": 1.4841, + "step": 8291 + }, + { + "epoch": 0.46218159522880553, + "grad_norm": 0.6403586864471436, + "learning_rate": 5.7359230716372105e-05, + "loss": 1.9146, + "step": 8292 + }, + { + "epoch": 0.46223733348196866, + "grad_norm": 0.5327916145324707, + "learning_rate": 5.735042333911452e-05, + "loss": 1.6559, + "step": 8293 + }, + { + "epoch": 0.46229307173513184, + "grad_norm": 0.5524441599845886, + "learning_rate": 5.734161572874437e-05, + "loss": 1.6659, + "step": 8294 + }, + { + "epoch": 0.462348809988295, + "grad_norm": 0.5722818970680237, + "learning_rate": 5.7332807885540976e-05, + "loss": 1.7702, + "step": 8295 + }, + { + "epoch": 0.4624045482414581, + "grad_norm": 0.5551111698150635, + "learning_rate": 5.7323999809783656e-05, + "loss": 1.6766, + "step": 8296 + }, + { + "epoch": 0.46246028649462123, + "grad_norm": 0.5412301421165466, + "learning_rate": 5.731519150175179e-05, + "loss": 1.6475, + "step": 8297 + }, + { + "epoch": 0.4625160247477844, + "grad_norm": 0.5476828813552856, + "learning_rate": 5.730638296172467e-05, + "loss": 1.643, + "step": 8298 + }, + { + "epoch": 0.46257176300094754, + "grad_norm": 0.5418581366539001, + "learning_rate": 5.7297574189981705e-05, + "loss": 1.5904, + "step": 8299 + }, + { + "epoch": 0.4626275012541107, + "grad_norm": 0.5094223022460938, + "learning_rate": 5.7288765186802204e-05, + "loss": 1.6782, + "step": 8300 + }, + { + "epoch": 0.46268323950727386, + "grad_norm": 0.5535764694213867, + "learning_rate": 5.72799559524656e-05, + "loss": 1.7858, + "step": 8301 + }, + { + "epoch": 0.462738977760437, + "grad_norm": 0.5554370284080505, + "learning_rate": 5.7271146487251224e-05, + "loss": 1.757, + "step": 8302 + }, + { + "epoch": 0.4627947160136001, + "grad_norm": 0.5177475810050964, + "learning_rate": 5.726233679143849e-05, + "loss": 1.7816, + "step": 8303 + }, + { + "epoch": 0.4628504542667633, + "grad_norm": 0.5340207815170288, + "learning_rate": 5.725352686530676e-05, + "loss": 1.742, + "step": 8304 + }, + { + "epoch": 0.46290619251992643, + "grad_norm": 0.5540534257888794, + "learning_rate": 5.724471670913545e-05, + "loss": 1.7751, + "step": 8305 + }, + { + "epoch": 0.46296193077308956, + "grad_norm": 0.539763331413269, + "learning_rate": 5.7235906323203956e-05, + "loss": 1.6988, + "step": 8306 + }, + { + "epoch": 0.46301766902625274, + "grad_norm": 0.5649262070655823, + "learning_rate": 5.7227095707791714e-05, + "loss": 1.6722, + "step": 8307 + }, + { + "epoch": 0.46307340727941587, + "grad_norm": 0.583903968334198, + "learning_rate": 5.721828486317814e-05, + "loss": 1.8056, + "step": 8308 + }, + { + "epoch": 0.463129145532579, + "grad_norm": 0.5246012210845947, + "learning_rate": 5.7209473789642644e-05, + "loss": 1.4819, + "step": 8309 + }, + { + "epoch": 0.46318488378574213, + "grad_norm": 0.5652540922164917, + "learning_rate": 5.720066248746468e-05, + "loss": 1.7022, + "step": 8310 + }, + { + "epoch": 0.4632406220389053, + "grad_norm": 0.5494220852851868, + "learning_rate": 5.7191850956923675e-05, + "loss": 1.5258, + "step": 8311 + }, + { + "epoch": 0.46329636029206844, + "grad_norm": 0.5923638343811035, + "learning_rate": 5.7183039198299105e-05, + "loss": 1.7439, + "step": 8312 + }, + { + "epoch": 0.46335209854523157, + "grad_norm": 0.6051487922668457, + "learning_rate": 5.717422721187039e-05, + "loss": 1.8911, + "step": 8313 + }, + { + "epoch": 0.46340783679839476, + "grad_norm": 0.5064337253570557, + "learning_rate": 5.7165414997917045e-05, + "loss": 1.6547, + "step": 8314 + }, + { + "epoch": 0.4634635750515579, + "grad_norm": 0.6165828704833984, + "learning_rate": 5.715660255671848e-05, + "loss": 1.8988, + "step": 8315 + }, + { + "epoch": 0.463519313304721, + "grad_norm": 0.5490414500236511, + "learning_rate": 5.714778988855422e-05, + "loss": 1.8075, + "step": 8316 + }, + { + "epoch": 0.4635750515578842, + "grad_norm": 0.5493695139884949, + "learning_rate": 5.713897699370376e-05, + "loss": 1.6288, + "step": 8317 + }, + { + "epoch": 0.46363078981104733, + "grad_norm": 0.5596882700920105, + "learning_rate": 5.713016387244656e-05, + "loss": 1.6575, + "step": 8318 + }, + { + "epoch": 0.46368652806421046, + "grad_norm": 0.562776505947113, + "learning_rate": 5.7121350525062126e-05, + "loss": 1.7129, + "step": 8319 + }, + { + "epoch": 0.4637422663173736, + "grad_norm": 0.6399055123329163, + "learning_rate": 5.7112536951829975e-05, + "loss": 1.7888, + "step": 8320 + }, + { + "epoch": 0.46379800457053677, + "grad_norm": 0.5227872729301453, + "learning_rate": 5.710372315302963e-05, + "loss": 1.6324, + "step": 8321 + }, + { + "epoch": 0.4638537428236999, + "grad_norm": 0.5664421319961548, + "learning_rate": 5.70949091289406e-05, + "loss": 1.5484, + "step": 8322 + }, + { + "epoch": 0.46390948107686303, + "grad_norm": 0.5465877652168274, + "learning_rate": 5.708609487984242e-05, + "loss": 1.5863, + "step": 8323 + }, + { + "epoch": 0.4639652193300262, + "grad_norm": 0.562119722366333, + "learning_rate": 5.707728040601462e-05, + "loss": 1.7411, + "step": 8324 + }, + { + "epoch": 0.46402095758318934, + "grad_norm": 0.569681704044342, + "learning_rate": 5.706846570773676e-05, + "loss": 1.6488, + "step": 8325 + }, + { + "epoch": 0.46407669583635247, + "grad_norm": 0.6219793558120728, + "learning_rate": 5.7059650785288354e-05, + "loss": 1.7995, + "step": 8326 + }, + { + "epoch": 0.46413243408951566, + "grad_norm": 0.5750408172607422, + "learning_rate": 5.705083563894902e-05, + "loss": 1.8457, + "step": 8327 + }, + { + "epoch": 0.4641881723426788, + "grad_norm": 0.5338056683540344, + "learning_rate": 5.7042020268998265e-05, + "loss": 1.665, + "step": 8328 + }, + { + "epoch": 0.4642439105958419, + "grad_norm": 0.5091413259506226, + "learning_rate": 5.703320467571569e-05, + "loss": 1.5915, + "step": 8329 + }, + { + "epoch": 0.4642996488490051, + "grad_norm": 0.567847490310669, + "learning_rate": 5.7024388859380875e-05, + "loss": 1.6417, + "step": 8330 + }, + { + "epoch": 0.4643553871021682, + "grad_norm": 0.591010332107544, + "learning_rate": 5.701557282027339e-05, + "loss": 1.8457, + "step": 8331 + }, + { + "epoch": 0.46441112535533136, + "grad_norm": 0.5327983498573303, + "learning_rate": 5.700675655867285e-05, + "loss": 1.6806, + "step": 8332 + }, + { + "epoch": 0.4644668636084945, + "grad_norm": 0.5359470844268799, + "learning_rate": 5.6997940074858835e-05, + "loss": 1.5137, + "step": 8333 + }, + { + "epoch": 0.46452260186165767, + "grad_norm": 0.5727723240852356, + "learning_rate": 5.698912336911097e-05, + "loss": 1.737, + "step": 8334 + }, + { + "epoch": 0.4645783401148208, + "grad_norm": 0.5366725325584412, + "learning_rate": 5.6980306441708854e-05, + "loss": 1.5039, + "step": 8335 + }, + { + "epoch": 0.4646340783679839, + "grad_norm": 0.5799429416656494, + "learning_rate": 5.6971489292932126e-05, + "loss": 1.7687, + "step": 8336 + }, + { + "epoch": 0.4646898166211471, + "grad_norm": 0.6180622577667236, + "learning_rate": 5.69626719230604e-05, + "loss": 1.8375, + "step": 8337 + }, + { + "epoch": 0.46474555487431024, + "grad_norm": 0.5698204636573792, + "learning_rate": 5.6953854332373314e-05, + "loss": 1.6076, + "step": 8338 + }, + { + "epoch": 0.46480129312747337, + "grad_norm": 0.5486071109771729, + "learning_rate": 5.6945036521150495e-05, + "loss": 1.75, + "step": 8339 + }, + { + "epoch": 0.46485703138063655, + "grad_norm": 0.5504134893417358, + "learning_rate": 5.693621848967163e-05, + "loss": 1.753, + "step": 8340 + }, + { + "epoch": 0.4649127696337997, + "grad_norm": 0.5678994059562683, + "learning_rate": 5.6927400238216354e-05, + "loss": 1.845, + "step": 8341 + }, + { + "epoch": 0.4649685078869628, + "grad_norm": 0.5259969234466553, + "learning_rate": 5.6918581767064325e-05, + "loss": 1.5699, + "step": 8342 + }, + { + "epoch": 0.46502424614012594, + "grad_norm": 0.5243310928344727, + "learning_rate": 5.690976307649523e-05, + "loss": 1.5899, + "step": 8343 + }, + { + "epoch": 0.4650799843932891, + "grad_norm": 0.5647771954536438, + "learning_rate": 5.6900944166788725e-05, + "loss": 1.7661, + "step": 8344 + }, + { + "epoch": 0.46513572264645225, + "grad_norm": 0.6884542107582092, + "learning_rate": 5.689212503822452e-05, + "loss": 1.5225, + "step": 8345 + }, + { + "epoch": 0.4651914608996154, + "grad_norm": 0.5403727889060974, + "learning_rate": 5.688330569108228e-05, + "loss": 1.5896, + "step": 8346 + }, + { + "epoch": 0.46524719915277857, + "grad_norm": 0.5732728838920593, + "learning_rate": 5.6874486125641726e-05, + "loss": 1.5632, + "step": 8347 + }, + { + "epoch": 0.4653029374059417, + "grad_norm": 0.5338377356529236, + "learning_rate": 5.686566634218254e-05, + "loss": 1.679, + "step": 8348 + }, + { + "epoch": 0.4653586756591048, + "grad_norm": 0.6053128242492676, + "learning_rate": 5.685684634098447e-05, + "loss": 2.0888, + "step": 8349 + }, + { + "epoch": 0.465414413912268, + "grad_norm": 0.5830248594284058, + "learning_rate": 5.684802612232719e-05, + "loss": 1.7972, + "step": 8350 + }, + { + "epoch": 0.46547015216543114, + "grad_norm": 0.6264218688011169, + "learning_rate": 5.683920568649047e-05, + "loss": 1.8225, + "step": 8351 + }, + { + "epoch": 0.46552589041859427, + "grad_norm": 0.6199706196784973, + "learning_rate": 5.6830385033753995e-05, + "loss": 1.6771, + "step": 8352 + }, + { + "epoch": 0.46558162867175745, + "grad_norm": 0.5402054190635681, + "learning_rate": 5.682156416439755e-05, + "loss": 1.3349, + "step": 8353 + }, + { + "epoch": 0.4656373669249206, + "grad_norm": 0.5562443733215332, + "learning_rate": 5.681274307870085e-05, + "loss": 1.606, + "step": 8354 + }, + { + "epoch": 0.4656931051780837, + "grad_norm": 0.6087068915367126, + "learning_rate": 5.680392177694366e-05, + "loss": 1.7091, + "step": 8355 + }, + { + "epoch": 0.46574884343124684, + "grad_norm": 0.5770891904830933, + "learning_rate": 5.679510025940575e-05, + "loss": 1.7989, + "step": 8356 + }, + { + "epoch": 0.46580458168441, + "grad_norm": 0.5513335466384888, + "learning_rate": 5.6786278526366875e-05, + "loss": 1.5115, + "step": 8357 + }, + { + "epoch": 0.46586031993757315, + "grad_norm": 0.5334859490394592, + "learning_rate": 5.677745657810681e-05, + "loss": 1.5391, + "step": 8358 + }, + { + "epoch": 0.4659160581907363, + "grad_norm": 0.51854008436203, + "learning_rate": 5.6768634414905344e-05, + "loss": 1.4878, + "step": 8359 + }, + { + "epoch": 0.46597179644389947, + "grad_norm": 0.5759007930755615, + "learning_rate": 5.675981203704226e-05, + "loss": 1.7812, + "step": 8360 + }, + { + "epoch": 0.4660275346970626, + "grad_norm": 0.5255948305130005, + "learning_rate": 5.675098944479733e-05, + "loss": 1.6782, + "step": 8361 + }, + { + "epoch": 0.4660832729502257, + "grad_norm": 0.5190218091011047, + "learning_rate": 5.67421666384504e-05, + "loss": 1.4408, + "step": 8362 + }, + { + "epoch": 0.4661390112033889, + "grad_norm": 0.5538722276687622, + "learning_rate": 5.673334361828124e-05, + "loss": 1.6993, + "step": 8363 + }, + { + "epoch": 0.46619474945655204, + "grad_norm": 0.5251713991165161, + "learning_rate": 5.672452038456969e-05, + "loss": 1.5929, + "step": 8364 + }, + { + "epoch": 0.46625048770971517, + "grad_norm": 0.5203914642333984, + "learning_rate": 5.671569693759554e-05, + "loss": 1.5579, + "step": 8365 + }, + { + "epoch": 0.4663062259628783, + "grad_norm": 0.4919300675392151, + "learning_rate": 5.670687327763866e-05, + "loss": 1.5625, + "step": 8366 + }, + { + "epoch": 0.4663619642160415, + "grad_norm": 0.5500087141990662, + "learning_rate": 5.6698049404978845e-05, + "loss": 1.6695, + "step": 8367 + }, + { + "epoch": 0.4664177024692046, + "grad_norm": 0.5846395492553711, + "learning_rate": 5.6689225319895966e-05, + "loss": 1.884, + "step": 8368 + }, + { + "epoch": 0.46647344072236774, + "grad_norm": 0.5971377491950989, + "learning_rate": 5.668040102266987e-05, + "loss": 1.9091, + "step": 8369 + }, + { + "epoch": 0.4665291789755309, + "grad_norm": 0.5873506665229797, + "learning_rate": 5.6671576513580385e-05, + "loss": 1.7085, + "step": 8370 + }, + { + "epoch": 0.46658491722869405, + "grad_norm": 0.551792323589325, + "learning_rate": 5.66627517929074e-05, + "loss": 1.5626, + "step": 8371 + }, + { + "epoch": 0.4666406554818572, + "grad_norm": 0.5586331486701965, + "learning_rate": 5.665392686093076e-05, + "loss": 1.7621, + "step": 8372 + }, + { + "epoch": 0.46669639373502037, + "grad_norm": 0.6477528810501099, + "learning_rate": 5.664510171793038e-05, + "loss": 1.9983, + "step": 8373 + }, + { + "epoch": 0.4667521319881835, + "grad_norm": 0.5568731427192688, + "learning_rate": 5.6636276364186105e-05, + "loss": 1.5046, + "step": 8374 + }, + { + "epoch": 0.4668078702413466, + "grad_norm": 0.5492534637451172, + "learning_rate": 5.6627450799977844e-05, + "loss": 1.6931, + "step": 8375 + }, + { + "epoch": 0.4668636084945098, + "grad_norm": 0.5230808854103088, + "learning_rate": 5.661862502558547e-05, + "loss": 1.5232, + "step": 8376 + }, + { + "epoch": 0.46691934674767294, + "grad_norm": 0.5762078762054443, + "learning_rate": 5.660979904128891e-05, + "loss": 1.8327, + "step": 8377 + }, + { + "epoch": 0.46697508500083607, + "grad_norm": 0.5496635437011719, + "learning_rate": 5.660097284736805e-05, + "loss": 1.5354, + "step": 8378 + }, + { + "epoch": 0.4670308232539992, + "grad_norm": 0.5177884101867676, + "learning_rate": 5.6592146444102826e-05, + "loss": 1.4303, + "step": 8379 + }, + { + "epoch": 0.4670865615071624, + "grad_norm": 0.6022128462791443, + "learning_rate": 5.658331983177315e-05, + "loss": 1.9321, + "step": 8380 + }, + { + "epoch": 0.4671422997603255, + "grad_norm": 0.5913931131362915, + "learning_rate": 5.657449301065895e-05, + "loss": 1.9125, + "step": 8381 + }, + { + "epoch": 0.46719803801348864, + "grad_norm": 0.4976262152194977, + "learning_rate": 5.656566598104017e-05, + "loss": 1.6072, + "step": 8382 + }, + { + "epoch": 0.4672537762666518, + "grad_norm": 0.5472914576530457, + "learning_rate": 5.655683874319675e-05, + "loss": 1.719, + "step": 8383 + }, + { + "epoch": 0.46730951451981495, + "grad_norm": 0.5451732277870178, + "learning_rate": 5.6548011297408634e-05, + "loss": 1.6492, + "step": 8384 + }, + { + "epoch": 0.4673652527729781, + "grad_norm": 0.5876046419143677, + "learning_rate": 5.653918364395575e-05, + "loss": 1.7208, + "step": 8385 + }, + { + "epoch": 0.46742099102614126, + "grad_norm": 0.5409192442893982, + "learning_rate": 5.653035578311812e-05, + "loss": 1.6186, + "step": 8386 + }, + { + "epoch": 0.4674767292793044, + "grad_norm": 0.5066797733306885, + "learning_rate": 5.652152771517566e-05, + "loss": 1.2929, + "step": 8387 + }, + { + "epoch": 0.4675324675324675, + "grad_norm": 0.5531768202781677, + "learning_rate": 5.651269944040838e-05, + "loss": 1.7447, + "step": 8388 + }, + { + "epoch": 0.46758820578563065, + "grad_norm": 0.5745431780815125, + "learning_rate": 5.650387095909623e-05, + "loss": 1.7896, + "step": 8389 + }, + { + "epoch": 0.46764394403879384, + "grad_norm": 0.5450076460838318, + "learning_rate": 5.649504227151922e-05, + "loss": 1.5537, + "step": 8390 + }, + { + "epoch": 0.46769968229195696, + "grad_norm": 0.5614714622497559, + "learning_rate": 5.648621337795733e-05, + "loss": 1.5894, + "step": 8391 + }, + { + "epoch": 0.4677554205451201, + "grad_norm": 0.6122470498085022, + "learning_rate": 5.647738427869058e-05, + "loss": 1.8336, + "step": 8392 + }, + { + "epoch": 0.4678111587982833, + "grad_norm": 0.598466157913208, + "learning_rate": 5.6468554973998955e-05, + "loss": 1.799, + "step": 8393 + }, + { + "epoch": 0.4678668970514464, + "grad_norm": 0.5752211213111877, + "learning_rate": 5.645972546416248e-05, + "loss": 1.7678, + "step": 8394 + }, + { + "epoch": 0.46792263530460954, + "grad_norm": 0.5438199043273926, + "learning_rate": 5.6450895749461194e-05, + "loss": 1.6982, + "step": 8395 + }, + { + "epoch": 0.4679783735577727, + "grad_norm": 0.5414747595787048, + "learning_rate": 5.64420658301751e-05, + "loss": 1.5794, + "step": 8396 + }, + { + "epoch": 0.46803411181093585, + "grad_norm": 0.5446813702583313, + "learning_rate": 5.643323570658424e-05, + "loss": 1.4545, + "step": 8397 + }, + { + "epoch": 0.468089850064099, + "grad_norm": 0.5998760461807251, + "learning_rate": 5.642440537896863e-05, + "loss": 1.6886, + "step": 8398 + }, + { + "epoch": 0.46814558831726216, + "grad_norm": 0.5757097005844116, + "learning_rate": 5.6415574847608365e-05, + "loss": 1.6932, + "step": 8399 + }, + { + "epoch": 0.4682013265704253, + "grad_norm": 0.5681119561195374, + "learning_rate": 5.640674411278345e-05, + "loss": 1.6357, + "step": 8400 + }, + { + "epoch": 0.4682570648235884, + "grad_norm": 0.5782068371772766, + "learning_rate": 5.6397913174773986e-05, + "loss": 1.4748, + "step": 8401 + }, + { + "epoch": 0.46831280307675155, + "grad_norm": 0.5838581323623657, + "learning_rate": 5.638908203386001e-05, + "loss": 1.6619, + "step": 8402 + }, + { + "epoch": 0.46836854132991473, + "grad_norm": 0.5535818934440613, + "learning_rate": 5.638025069032159e-05, + "loss": 1.7486, + "step": 8403 + }, + { + "epoch": 0.46842427958307786, + "grad_norm": 0.5350418090820312, + "learning_rate": 5.637141914443883e-05, + "loss": 1.6243, + "step": 8404 + }, + { + "epoch": 0.468480017836241, + "grad_norm": 0.5376988053321838, + "learning_rate": 5.6362587396491805e-05, + "loss": 1.6984, + "step": 8405 + }, + { + "epoch": 0.4685357560894042, + "grad_norm": 0.593912661075592, + "learning_rate": 5.63537554467606e-05, + "loss": 1.6001, + "step": 8406 + }, + { + "epoch": 0.4685914943425673, + "grad_norm": 0.5185176730155945, + "learning_rate": 5.634492329552531e-05, + "loss": 1.4702, + "step": 8407 + }, + { + "epoch": 0.46864723259573043, + "grad_norm": 0.5814734101295471, + "learning_rate": 5.6336090943066063e-05, + "loss": 1.8799, + "step": 8408 + }, + { + "epoch": 0.4687029708488936, + "grad_norm": 0.5562795400619507, + "learning_rate": 5.632725838966294e-05, + "loss": 1.7107, + "step": 8409 + }, + { + "epoch": 0.46875870910205675, + "grad_norm": 0.5342075824737549, + "learning_rate": 5.631842563559608e-05, + "loss": 1.6502, + "step": 8410 + }, + { + "epoch": 0.4688144473552199, + "grad_norm": 0.5376294255256653, + "learning_rate": 5.630959268114558e-05, + "loss": 1.6374, + "step": 8411 + }, + { + "epoch": 0.46887018560838306, + "grad_norm": 0.5461024641990662, + "learning_rate": 5.630075952659162e-05, + "loss": 1.7209, + "step": 8412 + }, + { + "epoch": 0.4689259238615462, + "grad_norm": 0.5888074040412903, + "learning_rate": 5.629192617221427e-05, + "loss": 1.7923, + "step": 8413 + }, + { + "epoch": 0.4689816621147093, + "grad_norm": 0.5504298210144043, + "learning_rate": 5.6283092618293734e-05, + "loss": 1.6201, + "step": 8414 + }, + { + "epoch": 0.46903740036787245, + "grad_norm": 0.5408875942230225, + "learning_rate": 5.627425886511012e-05, + "loss": 1.5646, + "step": 8415 + }, + { + "epoch": 0.46909313862103563, + "grad_norm": 0.5847890377044678, + "learning_rate": 5.626542491294359e-05, + "loss": 1.7076, + "step": 8416 + }, + { + "epoch": 0.46914887687419876, + "grad_norm": 0.5354915261268616, + "learning_rate": 5.6256590762074315e-05, + "loss": 1.5801, + "step": 8417 + }, + { + "epoch": 0.4692046151273619, + "grad_norm": 0.5805383324623108, + "learning_rate": 5.624775641278247e-05, + "loss": 1.8075, + "step": 8418 + }, + { + "epoch": 0.4692603533805251, + "grad_norm": 0.5791111588478088, + "learning_rate": 5.6238921865348204e-05, + "loss": 1.8437, + "step": 8419 + }, + { + "epoch": 0.4693160916336882, + "grad_norm": 0.5863295793533325, + "learning_rate": 5.623008712005172e-05, + "loss": 1.7371, + "step": 8420 + }, + { + "epoch": 0.46937182988685133, + "grad_norm": 0.5539514422416687, + "learning_rate": 5.62212521771732e-05, + "loss": 1.646, + "step": 8421 + }, + { + "epoch": 0.4694275681400145, + "grad_norm": 0.5049216151237488, + "learning_rate": 5.6212417036992826e-05, + "loss": 1.447, + "step": 8422 + }, + { + "epoch": 0.46948330639317765, + "grad_norm": 0.5240146517753601, + "learning_rate": 5.620358169979082e-05, + "loss": 1.729, + "step": 8423 + }, + { + "epoch": 0.4695390446463408, + "grad_norm": 0.5284691452980042, + "learning_rate": 5.619474616584734e-05, + "loss": 1.5096, + "step": 8424 + }, + { + "epoch": 0.4695947828995039, + "grad_norm": 0.5499683618545532, + "learning_rate": 5.618591043544266e-05, + "loss": 1.5803, + "step": 8425 + }, + { + "epoch": 0.4696505211526671, + "grad_norm": 0.588737964630127, + "learning_rate": 5.617707450885695e-05, + "loss": 1.6776, + "step": 8426 + }, + { + "epoch": 0.4697062594058302, + "grad_norm": 0.5827232599258423, + "learning_rate": 5.6168238386370466e-05, + "loss": 1.6402, + "step": 8427 + }, + { + "epoch": 0.46976199765899335, + "grad_norm": 0.5729832649230957, + "learning_rate": 5.615940206826341e-05, + "loss": 1.7642, + "step": 8428 + }, + { + "epoch": 0.46981773591215653, + "grad_norm": 0.5644805431365967, + "learning_rate": 5.6150565554816035e-05, + "loss": 1.7081, + "step": 8429 + }, + { + "epoch": 0.46987347416531966, + "grad_norm": 0.5413994193077087, + "learning_rate": 5.6141728846308586e-05, + "loss": 1.7756, + "step": 8430 + }, + { + "epoch": 0.4699292124184828, + "grad_norm": 0.5305155515670776, + "learning_rate": 5.6132891943021304e-05, + "loss": 1.5193, + "step": 8431 + }, + { + "epoch": 0.469984950671646, + "grad_norm": 0.5325213074684143, + "learning_rate": 5.612405484523444e-05, + "loss": 1.5169, + "step": 8432 + }, + { + "epoch": 0.4700406889248091, + "grad_norm": 0.5783179998397827, + "learning_rate": 5.6115217553228274e-05, + "loss": 1.6159, + "step": 8433 + }, + { + "epoch": 0.47009642717797223, + "grad_norm": 0.5537718534469604, + "learning_rate": 5.610638006728306e-05, + "loss": 1.6027, + "step": 8434 + }, + { + "epoch": 0.4701521654311354, + "grad_norm": 0.6395325660705566, + "learning_rate": 5.609754238767907e-05, + "loss": 1.3854, + "step": 8435 + }, + { + "epoch": 0.47020790368429854, + "grad_norm": 0.5301234126091003, + "learning_rate": 5.608870451469659e-05, + "loss": 1.6888, + "step": 8436 + }, + { + "epoch": 0.4702636419374617, + "grad_norm": 0.5246771574020386, + "learning_rate": 5.607986644861588e-05, + "loss": 1.5963, + "step": 8437 + }, + { + "epoch": 0.4703193801906248, + "grad_norm": 0.5331987738609314, + "learning_rate": 5.607102818971729e-05, + "loss": 1.7791, + "step": 8438 + }, + { + "epoch": 0.470375118443788, + "grad_norm": 0.5587426424026489, + "learning_rate": 5.6062189738281056e-05, + "loss": 1.744, + "step": 8439 + }, + { + "epoch": 0.4704308566969511, + "grad_norm": 0.5236651301383972, + "learning_rate": 5.6053351094587526e-05, + "loss": 1.4963, + "step": 8440 + }, + { + "epoch": 0.47048659495011425, + "grad_norm": 0.5496351718902588, + "learning_rate": 5.604451225891698e-05, + "loss": 1.491, + "step": 8441 + }, + { + "epoch": 0.47054233320327743, + "grad_norm": 0.5666020512580872, + "learning_rate": 5.603567323154975e-05, + "loss": 1.6241, + "step": 8442 + }, + { + "epoch": 0.47059807145644056, + "grad_norm": 0.5503633618354797, + "learning_rate": 5.602683401276615e-05, + "loss": 1.6522, + "step": 8443 + }, + { + "epoch": 0.4706538097096037, + "grad_norm": 0.5833953022956848, + "learning_rate": 5.601799460284654e-05, + "loss": 1.7361, + "step": 8444 + }, + { + "epoch": 0.47070954796276687, + "grad_norm": 0.5664584636688232, + "learning_rate": 5.60091550020712e-05, + "loss": 1.6558, + "step": 8445 + }, + { + "epoch": 0.47076528621593, + "grad_norm": 0.5645166635513306, + "learning_rate": 5.60003152107205e-05, + "loss": 1.7492, + "step": 8446 + }, + { + "epoch": 0.47082102446909313, + "grad_norm": 0.5689491629600525, + "learning_rate": 5.599147522907481e-05, + "loss": 1.6956, + "step": 8447 + }, + { + "epoch": 0.47087676272225626, + "grad_norm": 0.6192054152488708, + "learning_rate": 5.598263505741443e-05, + "loss": 1.5153, + "step": 8448 + }, + { + "epoch": 0.47093250097541944, + "grad_norm": 0.5669271945953369, + "learning_rate": 5.597379469601978e-05, + "loss": 1.5719, + "step": 8449 + }, + { + "epoch": 0.4709882392285826, + "grad_norm": 0.5729002952575684, + "learning_rate": 5.5964954145171145e-05, + "loss": 1.7169, + "step": 8450 + }, + { + "epoch": 0.4710439774817457, + "grad_norm": 0.532015323638916, + "learning_rate": 5.595611340514898e-05, + "loss": 1.6197, + "step": 8451 + }, + { + "epoch": 0.4710997157349089, + "grad_norm": 0.5148784518241882, + "learning_rate": 5.594727247623361e-05, + "loss": 1.611, + "step": 8452 + }, + { + "epoch": 0.471155453988072, + "grad_norm": 0.5674019455909729, + "learning_rate": 5.593843135870545e-05, + "loss": 1.6694, + "step": 8453 + }, + { + "epoch": 0.47121119224123514, + "grad_norm": 0.5392388701438904, + "learning_rate": 5.592959005284485e-05, + "loss": 1.5342, + "step": 8454 + }, + { + "epoch": 0.47126693049439833, + "grad_norm": 0.5939937829971313, + "learning_rate": 5.592074855893223e-05, + "loss": 1.7698, + "step": 8455 + }, + { + "epoch": 0.47132266874756146, + "grad_norm": 0.603952169418335, + "learning_rate": 5.591190687724799e-05, + "loss": 1.885, + "step": 8456 + }, + { + "epoch": 0.4713784070007246, + "grad_norm": 0.5169516801834106, + "learning_rate": 5.590306500807253e-05, + "loss": 1.4436, + "step": 8457 + }, + { + "epoch": 0.47143414525388777, + "grad_norm": 0.5573791265487671, + "learning_rate": 5.589422295168626e-05, + "loss": 1.6708, + "step": 8458 + }, + { + "epoch": 0.4714898835070509, + "grad_norm": 0.5594834685325623, + "learning_rate": 5.5885380708369606e-05, + "loss": 1.6496, + "step": 8459 + }, + { + "epoch": 0.47154562176021403, + "grad_norm": 0.5771753787994385, + "learning_rate": 5.5876538278403e-05, + "loss": 1.7612, + "step": 8460 + }, + { + "epoch": 0.47160136001337716, + "grad_norm": 0.5862414240837097, + "learning_rate": 5.586769566206686e-05, + "loss": 1.9365, + "step": 8461 + }, + { + "epoch": 0.47165709826654034, + "grad_norm": 0.5807836055755615, + "learning_rate": 5.585885285964163e-05, + "loss": 1.623, + "step": 8462 + }, + { + "epoch": 0.47171283651970347, + "grad_norm": 0.5933867692947388, + "learning_rate": 5.5850009871407716e-05, + "loss": 1.8284, + "step": 8463 + }, + { + "epoch": 0.4717685747728666, + "grad_norm": 0.5377753973007202, + "learning_rate": 5.584116669764563e-05, + "loss": 1.462, + "step": 8464 + }, + { + "epoch": 0.4718243130260298, + "grad_norm": 0.5384745597839355, + "learning_rate": 5.583232333863577e-05, + "loss": 1.5878, + "step": 8465 + }, + { + "epoch": 0.4718800512791929, + "grad_norm": 0.5296236872673035, + "learning_rate": 5.582347979465864e-05, + "loss": 1.6045, + "step": 8466 + }, + { + "epoch": 0.47193578953235604, + "grad_norm": 0.6247029304504395, + "learning_rate": 5.581463606599467e-05, + "loss": 1.6802, + "step": 8467 + }, + { + "epoch": 0.4719915277855192, + "grad_norm": 0.5652837157249451, + "learning_rate": 5.580579215292435e-05, + "loss": 1.6555, + "step": 8468 + }, + { + "epoch": 0.47204726603868236, + "grad_norm": 0.5700575709342957, + "learning_rate": 5.5796948055728147e-05, + "loss": 1.8245, + "step": 8469 + }, + { + "epoch": 0.4721030042918455, + "grad_norm": 0.5366250276565552, + "learning_rate": 5.578810377468656e-05, + "loss": 1.8156, + "step": 8470 + }, + { + "epoch": 0.4721587425450086, + "grad_norm": 0.5650043487548828, + "learning_rate": 5.577925931008007e-05, + "loss": 1.6757, + "step": 8471 + }, + { + "epoch": 0.4722144807981718, + "grad_norm": 0.5967742204666138, + "learning_rate": 5.577041466218915e-05, + "loss": 1.939, + "step": 8472 + }, + { + "epoch": 0.4722702190513349, + "grad_norm": 0.5320480465888977, + "learning_rate": 5.576156983129435e-05, + "loss": 1.5016, + "step": 8473 + }, + { + "epoch": 0.47232595730449806, + "grad_norm": 0.5365233421325684, + "learning_rate": 5.5752724817676125e-05, + "loss": 1.5794, + "step": 8474 + }, + { + "epoch": 0.47238169555766124, + "grad_norm": 0.5704277753829956, + "learning_rate": 5.5743879621615026e-05, + "loss": 1.5467, + "step": 8475 + }, + { + "epoch": 0.47243743381082437, + "grad_norm": 0.5679128170013428, + "learning_rate": 5.5735034243391537e-05, + "loss": 1.6893, + "step": 8476 + }, + { + "epoch": 0.4724931720639875, + "grad_norm": 0.5593464970588684, + "learning_rate": 5.572618868328621e-05, + "loss": 1.6293, + "step": 8477 + }, + { + "epoch": 0.4725489103171507, + "grad_norm": 0.527761697769165, + "learning_rate": 5.5717342941579555e-05, + "loss": 1.6616, + "step": 8478 + }, + { + "epoch": 0.4726046485703138, + "grad_norm": 0.5714175701141357, + "learning_rate": 5.570849701855213e-05, + "loss": 1.7797, + "step": 8479 + }, + { + "epoch": 0.47266038682347694, + "grad_norm": 0.5801485180854797, + "learning_rate": 5.569965091448446e-05, + "loss": 1.6934, + "step": 8480 + }, + { + "epoch": 0.4727161250766401, + "grad_norm": 0.6128066778182983, + "learning_rate": 5.5690804629657076e-05, + "loss": 1.8593, + "step": 8481 + }, + { + "epoch": 0.47277186332980325, + "grad_norm": 0.6358544230461121, + "learning_rate": 5.568195816435057e-05, + "loss": 1.8292, + "step": 8482 + }, + { + "epoch": 0.4728276015829664, + "grad_norm": 0.5209305882453918, + "learning_rate": 5.567311151884547e-05, + "loss": 1.6183, + "step": 8483 + }, + { + "epoch": 0.4728833398361295, + "grad_norm": 0.5640316605567932, + "learning_rate": 5.566426469342235e-05, + "loss": 1.7618, + "step": 8484 + }, + { + "epoch": 0.4729390780892927, + "grad_norm": 0.5284755825996399, + "learning_rate": 5.565541768836178e-05, + "loss": 1.6473, + "step": 8485 + }, + { + "epoch": 0.4729948163424558, + "grad_norm": 0.5737931728363037, + "learning_rate": 5.564657050394434e-05, + "loss": 1.9419, + "step": 8486 + }, + { + "epoch": 0.47305055459561896, + "grad_norm": 0.5647780299186707, + "learning_rate": 5.563772314045059e-05, + "loss": 1.6413, + "step": 8487 + }, + { + "epoch": 0.47310629284878214, + "grad_norm": 0.5379336476325989, + "learning_rate": 5.562887559816116e-05, + "loss": 1.5344, + "step": 8488 + }, + { + "epoch": 0.47316203110194527, + "grad_norm": 0.5728521943092346, + "learning_rate": 5.562002787735657e-05, + "loss": 1.6937, + "step": 8489 + }, + { + "epoch": 0.4732177693551084, + "grad_norm": 0.5722839832305908, + "learning_rate": 5.561117997831751e-05, + "loss": 1.6869, + "step": 8490 + }, + { + "epoch": 0.4732735076082716, + "grad_norm": 0.5436987280845642, + "learning_rate": 5.56023319013245e-05, + "loss": 1.3939, + "step": 8491 + }, + { + "epoch": 0.4733292458614347, + "grad_norm": 0.5408251285552979, + "learning_rate": 5.559348364665822e-05, + "loss": 1.5309, + "step": 8492 + }, + { + "epoch": 0.47338498411459784, + "grad_norm": 0.5417353510856628, + "learning_rate": 5.5584635214599225e-05, + "loss": 1.5592, + "step": 8493 + }, + { + "epoch": 0.47344072236776097, + "grad_norm": 0.5821628570556641, + "learning_rate": 5.557578660542816e-05, + "loss": 1.5603, + "step": 8494 + }, + { + "epoch": 0.47349646062092415, + "grad_norm": 0.5318421721458435, + "learning_rate": 5.5566937819425656e-05, + "loss": 1.5251, + "step": 8495 + }, + { + "epoch": 0.4735521988740873, + "grad_norm": 0.5154527425765991, + "learning_rate": 5.5558088856872346e-05, + "loss": 1.572, + "step": 8496 + }, + { + "epoch": 0.4736079371272504, + "grad_norm": 0.5686662197113037, + "learning_rate": 5.554923971804887e-05, + "loss": 1.5153, + "step": 8497 + }, + { + "epoch": 0.4736636753804136, + "grad_norm": 0.5712747573852539, + "learning_rate": 5.554039040323586e-05, + "loss": 1.7534, + "step": 8498 + }, + { + "epoch": 0.4737194136335767, + "grad_norm": 0.5434257388114929, + "learning_rate": 5.5531540912713974e-05, + "loss": 1.6791, + "step": 8499 + }, + { + "epoch": 0.47377515188673985, + "grad_norm": 0.5522347092628479, + "learning_rate": 5.552269124676386e-05, + "loss": 1.7779, + "step": 8500 + }, + { + "epoch": 0.47383089013990304, + "grad_norm": 0.5155788064002991, + "learning_rate": 5.551384140566618e-05, + "loss": 1.4377, + "step": 8501 + }, + { + "epoch": 0.47388662839306617, + "grad_norm": 0.5739377737045288, + "learning_rate": 5.550499138970158e-05, + "loss": 1.8262, + "step": 8502 + }, + { + "epoch": 0.4739423666462293, + "grad_norm": 0.5527716875076294, + "learning_rate": 5.5496141199150766e-05, + "loss": 1.3705, + "step": 8503 + }, + { + "epoch": 0.4739981048993925, + "grad_norm": 0.5810341238975525, + "learning_rate": 5.548729083429439e-05, + "loss": 1.7927, + "step": 8504 + }, + { + "epoch": 0.4740538431525556, + "grad_norm": 0.5541203618049622, + "learning_rate": 5.547844029541316e-05, + "loss": 1.7237, + "step": 8505 + }, + { + "epoch": 0.47410958140571874, + "grad_norm": 0.5816789865493774, + "learning_rate": 5.546958958278773e-05, + "loss": 1.6761, + "step": 8506 + }, + { + "epoch": 0.47416531965888187, + "grad_norm": 0.5344805121421814, + "learning_rate": 5.546073869669881e-05, + "loss": 1.7347, + "step": 8507 + }, + { + "epoch": 0.47422105791204505, + "grad_norm": 0.5249469876289368, + "learning_rate": 5.5451887637427104e-05, + "loss": 1.5048, + "step": 8508 + }, + { + "epoch": 0.4742767961652082, + "grad_norm": 0.5707089900970459, + "learning_rate": 5.544303640525328e-05, + "loss": 1.811, + "step": 8509 + }, + { + "epoch": 0.4743325344183713, + "grad_norm": 0.5320430397987366, + "learning_rate": 5.5434185000458114e-05, + "loss": 1.7104, + "step": 8510 + }, + { + "epoch": 0.4743882726715345, + "grad_norm": 0.5608380436897278, + "learning_rate": 5.5425333423322255e-05, + "loss": 1.7893, + "step": 8511 + }, + { + "epoch": 0.4744440109246976, + "grad_norm": 0.5271068811416626, + "learning_rate": 5.5416481674126474e-05, + "loss": 1.7735, + "step": 8512 + }, + { + "epoch": 0.47449974917786075, + "grad_norm": 0.5395051836967468, + "learning_rate": 5.540762975315147e-05, + "loss": 1.7249, + "step": 8513 + }, + { + "epoch": 0.47455548743102394, + "grad_norm": 0.5892390012741089, + "learning_rate": 5.539877766067798e-05, + "loss": 1.7148, + "step": 8514 + }, + { + "epoch": 0.47461122568418707, + "grad_norm": 0.5333415269851685, + "learning_rate": 5.538992539698672e-05, + "loss": 1.6184, + "step": 8515 + }, + { + "epoch": 0.4746669639373502, + "grad_norm": 0.6480614542961121, + "learning_rate": 5.538107296235847e-05, + "loss": 1.6898, + "step": 8516 + }, + { + "epoch": 0.4747227021905133, + "grad_norm": 0.5696564316749573, + "learning_rate": 5.5372220357073955e-05, + "loss": 1.7039, + "step": 8517 + }, + { + "epoch": 0.4747784404436765, + "grad_norm": 0.5047008991241455, + "learning_rate": 5.536336758141394e-05, + "loss": 1.5221, + "step": 8518 + }, + { + "epoch": 0.47483417869683964, + "grad_norm": 0.6112247705459595, + "learning_rate": 5.535451463565916e-05, + "loss": 1.7282, + "step": 8519 + }, + { + "epoch": 0.47488991695000277, + "grad_norm": 0.5554122924804688, + "learning_rate": 5.5345661520090394e-05, + "loss": 1.6662, + "step": 8520 + }, + { + "epoch": 0.47494565520316595, + "grad_norm": 0.5461030602455139, + "learning_rate": 5.533680823498844e-05, + "loss": 1.6679, + "step": 8521 + }, + { + "epoch": 0.4750013934563291, + "grad_norm": 0.5860038995742798, + "learning_rate": 5.5327954780634004e-05, + "loss": 1.769, + "step": 8522 + }, + { + "epoch": 0.4750571317094922, + "grad_norm": 0.6236945390701294, + "learning_rate": 5.531910115730794e-05, + "loss": 1.9089, + "step": 8523 + }, + { + "epoch": 0.4751128699626554, + "grad_norm": 0.545220673084259, + "learning_rate": 5.531024736529099e-05, + "loss": 1.7743, + "step": 8524 + }, + { + "epoch": 0.4751686082158185, + "grad_norm": 0.6534609198570251, + "learning_rate": 5.5301393404863954e-05, + "loss": 1.9673, + "step": 8525 + }, + { + "epoch": 0.47522434646898165, + "grad_norm": 0.5649281740188599, + "learning_rate": 5.529253927630762e-05, + "loss": 1.6666, + "step": 8526 + }, + { + "epoch": 0.47528008472214484, + "grad_norm": 0.5315033197402954, + "learning_rate": 5.5283684979902815e-05, + "loss": 1.678, + "step": 8527 + }, + { + "epoch": 0.47533582297530796, + "grad_norm": 0.5951296091079712, + "learning_rate": 5.5274830515930306e-05, + "loss": 1.6429, + "step": 8528 + }, + { + "epoch": 0.4753915612284711, + "grad_norm": 0.5288706421852112, + "learning_rate": 5.526597588467095e-05, + "loss": 1.65, + "step": 8529 + }, + { + "epoch": 0.4754472994816342, + "grad_norm": 0.5894261002540588, + "learning_rate": 5.525712108640553e-05, + "loss": 1.6486, + "step": 8530 + }, + { + "epoch": 0.4755030377347974, + "grad_norm": 0.5475479960441589, + "learning_rate": 5.524826612141488e-05, + "loss": 1.5981, + "step": 8531 + }, + { + "epoch": 0.47555877598796054, + "grad_norm": 0.5496692657470703, + "learning_rate": 5.523941098997983e-05, + "loss": 1.6958, + "step": 8532 + }, + { + "epoch": 0.47561451424112366, + "grad_norm": 0.6038063168525696, + "learning_rate": 5.5230555692381214e-05, + "loss": 1.7152, + "step": 8533 + }, + { + "epoch": 0.47567025249428685, + "grad_norm": 0.5410369038581848, + "learning_rate": 5.5221700228899866e-05, + "loss": 1.5163, + "step": 8534 + }, + { + "epoch": 0.47572599074745, + "grad_norm": 0.5673332214355469, + "learning_rate": 5.521284459981662e-05, + "loss": 1.6854, + "step": 8535 + }, + { + "epoch": 0.4757817290006131, + "grad_norm": 0.5714686512947083, + "learning_rate": 5.520398880541235e-05, + "loss": 1.6205, + "step": 8536 + }, + { + "epoch": 0.4758374672537763, + "grad_norm": 0.6370970606803894, + "learning_rate": 5.519513284596789e-05, + "loss": 1.8303, + "step": 8537 + }, + { + "epoch": 0.4758932055069394, + "grad_norm": 0.5482840538024902, + "learning_rate": 5.518627672176412e-05, + "loss": 1.5506, + "step": 8538 + }, + { + "epoch": 0.47594894376010255, + "grad_norm": 0.5282999277114868, + "learning_rate": 5.5177420433081874e-05, + "loss": 1.2786, + "step": 8539 + }, + { + "epoch": 0.4760046820132657, + "grad_norm": 0.5575840473175049, + "learning_rate": 5.516856398020205e-05, + "loss": 1.5573, + "step": 8540 + }, + { + "epoch": 0.47606042026642886, + "grad_norm": 0.5926665663719177, + "learning_rate": 5.5159707363405485e-05, + "loss": 1.7721, + "step": 8541 + }, + { + "epoch": 0.476116158519592, + "grad_norm": 0.5172202587127686, + "learning_rate": 5.515085058297313e-05, + "loss": 1.4076, + "step": 8542 + }, + { + "epoch": 0.4761718967727551, + "grad_norm": 0.581986665725708, + "learning_rate": 5.514199363918578e-05, + "loss": 1.7104, + "step": 8543 + }, + { + "epoch": 0.4762276350259183, + "grad_norm": 0.5978564023971558, + "learning_rate": 5.51331365323244e-05, + "loss": 1.8326, + "step": 8544 + }, + { + "epoch": 0.47628337327908143, + "grad_norm": 0.5649850368499756, + "learning_rate": 5.5124279262669856e-05, + "loss": 1.6206, + "step": 8545 + }, + { + "epoch": 0.47633911153224456, + "grad_norm": 0.6205348372459412, + "learning_rate": 5.511542183050305e-05, + "loss": 1.7466, + "step": 8546 + }, + { + "epoch": 0.47639484978540775, + "grad_norm": 0.5095716714859009, + "learning_rate": 5.5106564236104884e-05, + "loss": 1.5614, + "step": 8547 + }, + { + "epoch": 0.4764505880385709, + "grad_norm": 0.5600999593734741, + "learning_rate": 5.509770647975626e-05, + "loss": 1.825, + "step": 8548 + }, + { + "epoch": 0.476506326291734, + "grad_norm": 0.5659551620483398, + "learning_rate": 5.508884856173813e-05, + "loss": 1.8289, + "step": 8549 + }, + { + "epoch": 0.4765620645448972, + "grad_norm": 0.524356484413147, + "learning_rate": 5.507999048233138e-05, + "loss": 1.591, + "step": 8550 + }, + { + "epoch": 0.4766178027980603, + "grad_norm": 0.5709447860717773, + "learning_rate": 5.507113224181696e-05, + "loss": 1.6152, + "step": 8551 + }, + { + "epoch": 0.47667354105122345, + "grad_norm": 0.5852453112602234, + "learning_rate": 5.506227384047579e-05, + "loss": 1.7522, + "step": 8552 + }, + { + "epoch": 0.4767292793043866, + "grad_norm": 0.6322617530822754, + "learning_rate": 5.50534152785888e-05, + "loss": 1.8002, + "step": 8553 + }, + { + "epoch": 0.47678501755754976, + "grad_norm": 0.6037564277648926, + "learning_rate": 5.504455655643694e-05, + "loss": 1.7472, + "step": 8554 + }, + { + "epoch": 0.4768407558107129, + "grad_norm": 0.6172270774841309, + "learning_rate": 5.503569767430118e-05, + "loss": 1.7638, + "step": 8555 + }, + { + "epoch": 0.476896494063876, + "grad_norm": 0.5917114615440369, + "learning_rate": 5.502683863246243e-05, + "loss": 1.7726, + "step": 8556 + }, + { + "epoch": 0.4769522323170392, + "grad_norm": 0.5618294477462769, + "learning_rate": 5.5017979431201675e-05, + "loss": 1.5519, + "step": 8557 + }, + { + "epoch": 0.47700797057020233, + "grad_norm": 0.5710815191268921, + "learning_rate": 5.500912007079987e-05, + "loss": 1.6896, + "step": 8558 + }, + { + "epoch": 0.47706370882336546, + "grad_norm": 0.5609897971153259, + "learning_rate": 5.5000260551537975e-05, + "loss": 1.7455, + "step": 8559 + }, + { + "epoch": 0.47711944707652865, + "grad_norm": 0.5565608739852905, + "learning_rate": 5.499140087369697e-05, + "loss": 1.5399, + "step": 8560 + }, + { + "epoch": 0.4771751853296918, + "grad_norm": 0.5751162767410278, + "learning_rate": 5.4982541037557823e-05, + "loss": 1.5373, + "step": 8561 + }, + { + "epoch": 0.4772309235828549, + "grad_norm": 0.5089201927185059, + "learning_rate": 5.4973681043401534e-05, + "loss": 1.2027, + "step": 8562 + }, + { + "epoch": 0.47728666183601803, + "grad_norm": 0.5925856232643127, + "learning_rate": 5.496482089150908e-05, + "loss": 1.9377, + "step": 8563 + }, + { + "epoch": 0.4773424000891812, + "grad_norm": 0.5660269260406494, + "learning_rate": 5.495596058216147e-05, + "loss": 1.4814, + "step": 8564 + }, + { + "epoch": 0.47739813834234435, + "grad_norm": 0.5554754734039307, + "learning_rate": 5.494710011563966e-05, + "loss": 1.6303, + "step": 8565 + }, + { + "epoch": 0.4774538765955075, + "grad_norm": 0.6004930138587952, + "learning_rate": 5.49382394922247e-05, + "loss": 1.6204, + "step": 8566 + }, + { + "epoch": 0.47750961484867066, + "grad_norm": 0.5308135747909546, + "learning_rate": 5.4929378712197556e-05, + "loss": 1.5949, + "step": 8567 + }, + { + "epoch": 0.4775653531018338, + "grad_norm": 0.5763102769851685, + "learning_rate": 5.4920517775839276e-05, + "loss": 1.7625, + "step": 8568 + }, + { + "epoch": 0.4776210913549969, + "grad_norm": 0.572308361530304, + "learning_rate": 5.491165668343085e-05, + "loss": 1.7809, + "step": 8569 + }, + { + "epoch": 0.4776768296081601, + "grad_norm": 0.6404359340667725, + "learning_rate": 5.4902795435253306e-05, + "loss": 2.0053, + "step": 8570 + }, + { + "epoch": 0.47773256786132323, + "grad_norm": 0.5613745450973511, + "learning_rate": 5.489393403158769e-05, + "loss": 1.8136, + "step": 8571 + }, + { + "epoch": 0.47778830611448636, + "grad_norm": 0.5631322860717773, + "learning_rate": 5.488507247271502e-05, + "loss": 1.9469, + "step": 8572 + }, + { + "epoch": 0.47784404436764955, + "grad_norm": 0.5425231456756592, + "learning_rate": 5.487621075891632e-05, + "loss": 1.7089, + "step": 8573 + }, + { + "epoch": 0.4778997826208127, + "grad_norm": 0.6085340976715088, + "learning_rate": 5.4867348890472646e-05, + "loss": 1.8108, + "step": 8574 + }, + { + "epoch": 0.4779555208739758, + "grad_norm": 0.5472151637077332, + "learning_rate": 5.485848686766506e-05, + "loss": 1.5179, + "step": 8575 + }, + { + "epoch": 0.47801125912713893, + "grad_norm": 0.5451512336730957, + "learning_rate": 5.484962469077458e-05, + "loss": 1.6112, + "step": 8576 + }, + { + "epoch": 0.4780669973803021, + "grad_norm": 0.5663710236549377, + "learning_rate": 5.4840762360082286e-05, + "loss": 1.6932, + "step": 8577 + }, + { + "epoch": 0.47812273563346525, + "grad_norm": 0.5614507794380188, + "learning_rate": 5.483189987586924e-05, + "loss": 1.7001, + "step": 8578 + }, + { + "epoch": 0.4781784738866284, + "grad_norm": 0.5428431034088135, + "learning_rate": 5.4823037238416506e-05, + "loss": 1.7767, + "step": 8579 + }, + { + "epoch": 0.47823421213979156, + "grad_norm": 0.5602681636810303, + "learning_rate": 5.481417444800512e-05, + "loss": 1.6749, + "step": 8580 + }, + { + "epoch": 0.4782899503929547, + "grad_norm": 0.5648148655891418, + "learning_rate": 5.480531150491622e-05, + "loss": 1.723, + "step": 8581 + }, + { + "epoch": 0.4783456886461178, + "grad_norm": 0.5764549970626831, + "learning_rate": 5.4796448409430845e-05, + "loss": 1.8049, + "step": 8582 + }, + { + "epoch": 0.478401426899281, + "grad_norm": 0.5871893167495728, + "learning_rate": 5.478758516183009e-05, + "loss": 1.979, + "step": 8583 + }, + { + "epoch": 0.47845716515244413, + "grad_norm": 0.5481773018836975, + "learning_rate": 5.477872176239506e-05, + "loss": 1.738, + "step": 8584 + }, + { + "epoch": 0.47851290340560726, + "grad_norm": 0.5214368104934692, + "learning_rate": 5.4769858211406824e-05, + "loss": 1.5133, + "step": 8585 + }, + { + "epoch": 0.4785686416587704, + "grad_norm": 0.5468040704727173, + "learning_rate": 5.4760994509146514e-05, + "loss": 1.6054, + "step": 8586 + }, + { + "epoch": 0.4786243799119336, + "grad_norm": 0.5729833841323853, + "learning_rate": 5.475213065589518e-05, + "loss": 1.4712, + "step": 8587 + }, + { + "epoch": 0.4786801181650967, + "grad_norm": 0.558814525604248, + "learning_rate": 5.4743266651934e-05, + "loss": 1.4907, + "step": 8588 + }, + { + "epoch": 0.47873585641825983, + "grad_norm": 0.5633212924003601, + "learning_rate": 5.4734402497544044e-05, + "loss": 1.4832, + "step": 8589 + }, + { + "epoch": 0.478791594671423, + "grad_norm": 0.6136720180511475, + "learning_rate": 5.472553819300645e-05, + "loss": 1.6588, + "step": 8590 + }, + { + "epoch": 0.47884733292458614, + "grad_norm": 0.537601113319397, + "learning_rate": 5.471667373860234e-05, + "loss": 1.6905, + "step": 8591 + }, + { + "epoch": 0.4789030711777493, + "grad_norm": 0.5937305688858032, + "learning_rate": 5.4707809134612844e-05, + "loss": 1.7177, + "step": 8592 + }, + { + "epoch": 0.47895880943091246, + "grad_norm": 0.6321950554847717, + "learning_rate": 5.469894438131906e-05, + "loss": 1.8388, + "step": 8593 + }, + { + "epoch": 0.4790145476840756, + "grad_norm": 0.5728781223297119, + "learning_rate": 5.469007947900219e-05, + "loss": 1.9354, + "step": 8594 + }, + { + "epoch": 0.4790702859372387, + "grad_norm": 0.5851932764053345, + "learning_rate": 5.468121442794333e-05, + "loss": 1.6465, + "step": 8595 + }, + { + "epoch": 0.4791260241904019, + "grad_norm": 0.5869148969650269, + "learning_rate": 5.467234922842363e-05, + "loss": 1.8636, + "step": 8596 + }, + { + "epoch": 0.47918176244356503, + "grad_norm": 0.5678532719612122, + "learning_rate": 5.4663483880724275e-05, + "loss": 1.7346, + "step": 8597 + }, + { + "epoch": 0.47923750069672816, + "grad_norm": 0.5783692598342896, + "learning_rate": 5.46546183851264e-05, + "loss": 1.8068, + "step": 8598 + }, + { + "epoch": 0.4792932389498913, + "grad_norm": 0.5361393690109253, + "learning_rate": 5.464575274191116e-05, + "loss": 1.4534, + "step": 8599 + }, + { + "epoch": 0.47934897720305447, + "grad_norm": 0.5204313397407532, + "learning_rate": 5.4636886951359726e-05, + "loss": 1.5212, + "step": 8600 + }, + { + "epoch": 0.4794047154562176, + "grad_norm": 0.5215826630592346, + "learning_rate": 5.4628021013753284e-05, + "loss": 1.6756, + "step": 8601 + }, + { + "epoch": 0.47946045370938073, + "grad_norm": 0.5335747599601746, + "learning_rate": 5.461915492937299e-05, + "loss": 1.7895, + "step": 8602 + }, + { + "epoch": 0.4795161919625439, + "grad_norm": 0.5702705979347229, + "learning_rate": 5.461028869850004e-05, + "loss": 1.7024, + "step": 8603 + }, + { + "epoch": 0.47957193021570704, + "grad_norm": 0.5771311521530151, + "learning_rate": 5.4601422321415606e-05, + "loss": 1.7879, + "step": 8604 + }, + { + "epoch": 0.47962766846887017, + "grad_norm": 0.5826980471611023, + "learning_rate": 5.459255579840089e-05, + "loss": 1.6198, + "step": 8605 + }, + { + "epoch": 0.47968340672203336, + "grad_norm": 0.5219647288322449, + "learning_rate": 5.458368912973707e-05, + "loss": 1.6159, + "step": 8606 + }, + { + "epoch": 0.4797391449751965, + "grad_norm": 0.5676286220550537, + "learning_rate": 5.4574822315705366e-05, + "loss": 1.6843, + "step": 8607 + }, + { + "epoch": 0.4797948832283596, + "grad_norm": 0.5792801380157471, + "learning_rate": 5.456595535658696e-05, + "loss": 1.8092, + "step": 8608 + }, + { + "epoch": 0.47985062148152274, + "grad_norm": 0.5464149713516235, + "learning_rate": 5.455708825266308e-05, + "loss": 1.7726, + "step": 8609 + }, + { + "epoch": 0.47990635973468593, + "grad_norm": 0.597957968711853, + "learning_rate": 5.4548221004214936e-05, + "loss": 1.7107, + "step": 8610 + }, + { + "epoch": 0.47996209798784906, + "grad_norm": 0.5609841346740723, + "learning_rate": 5.453935361152374e-05, + "loss": 1.5578, + "step": 8611 + }, + { + "epoch": 0.4800178362410122, + "grad_norm": 0.5753505229949951, + "learning_rate": 5.45304860748707e-05, + "loss": 1.8959, + "step": 8612 + }, + { + "epoch": 0.48007357449417537, + "grad_norm": 0.5798444747924805, + "learning_rate": 5.4521618394537056e-05, + "loss": 1.9346, + "step": 8613 + }, + { + "epoch": 0.4801293127473385, + "grad_norm": 0.536660373210907, + "learning_rate": 5.451275057080405e-05, + "loss": 1.6191, + "step": 8614 + }, + { + "epoch": 0.48018505100050163, + "grad_norm": 0.5759127736091614, + "learning_rate": 5.4503882603952905e-05, + "loss": 1.6555, + "step": 8615 + }, + { + "epoch": 0.4802407892536648, + "grad_norm": 0.5895690321922302, + "learning_rate": 5.449501449426487e-05, + "loss": 1.7481, + "step": 8616 + }, + { + "epoch": 0.48029652750682794, + "grad_norm": 0.5727548003196716, + "learning_rate": 5.448614624202117e-05, + "loss": 1.7338, + "step": 8617 + }, + { + "epoch": 0.48035226575999107, + "grad_norm": 0.5720645189285278, + "learning_rate": 5.447727784750308e-05, + "loss": 1.7127, + "step": 8618 + }, + { + "epoch": 0.48040800401315426, + "grad_norm": 0.5797655582427979, + "learning_rate": 5.446840931099182e-05, + "loss": 1.733, + "step": 8619 + }, + { + "epoch": 0.4804637422663174, + "grad_norm": 0.5146819949150085, + "learning_rate": 5.445954063276869e-05, + "loss": 1.5931, + "step": 8620 + }, + { + "epoch": 0.4805194805194805, + "grad_norm": 0.5465497970581055, + "learning_rate": 5.445067181311492e-05, + "loss": 1.6994, + "step": 8621 + }, + { + "epoch": 0.48057521877264364, + "grad_norm": 0.5129651427268982, + "learning_rate": 5.4441802852311795e-05, + "loss": 1.5357, + "step": 8622 + }, + { + "epoch": 0.4806309570258068, + "grad_norm": 0.5457690954208374, + "learning_rate": 5.443293375064058e-05, + "loss": 1.5543, + "step": 8623 + }, + { + "epoch": 0.48068669527896996, + "grad_norm": 0.5993552207946777, + "learning_rate": 5.4424064508382556e-05, + "loss": 1.902, + "step": 8624 + }, + { + "epoch": 0.4807424335321331, + "grad_norm": 0.5725103616714478, + "learning_rate": 5.4415195125819e-05, + "loss": 1.7444, + "step": 8625 + }, + { + "epoch": 0.48079817178529627, + "grad_norm": 0.5666811466217041, + "learning_rate": 5.440632560323118e-05, + "loss": 1.6553, + "step": 8626 + }, + { + "epoch": 0.4808539100384594, + "grad_norm": 0.5566148161888123, + "learning_rate": 5.439745594090042e-05, + "loss": 1.3808, + "step": 8627 + }, + { + "epoch": 0.4809096482916225, + "grad_norm": 0.5133042335510254, + "learning_rate": 5.438858613910799e-05, + "loss": 1.5705, + "step": 8628 + }, + { + "epoch": 0.4809653865447857, + "grad_norm": 0.6130719780921936, + "learning_rate": 5.43797161981352e-05, + "loss": 1.9702, + "step": 8629 + }, + { + "epoch": 0.48102112479794884, + "grad_norm": 0.5869434475898743, + "learning_rate": 5.4370846118263354e-05, + "loss": 1.8149, + "step": 8630 + }, + { + "epoch": 0.48107686305111197, + "grad_norm": 0.5676392316818237, + "learning_rate": 5.436197589977374e-05, + "loss": 1.5798, + "step": 8631 + }, + { + "epoch": 0.4811326013042751, + "grad_norm": 0.5470464825630188, + "learning_rate": 5.435310554294769e-05, + "loss": 1.6549, + "step": 8632 + }, + { + "epoch": 0.4811883395574383, + "grad_norm": 0.5741833448410034, + "learning_rate": 5.434423504806651e-05, + "loss": 1.7124, + "step": 8633 + }, + { + "epoch": 0.4812440778106014, + "grad_norm": 0.5436912178993225, + "learning_rate": 5.433536441541152e-05, + "loss": 1.568, + "step": 8634 + }, + { + "epoch": 0.48129981606376454, + "grad_norm": 0.5380058884620667, + "learning_rate": 5.432649364526403e-05, + "loss": 1.4785, + "step": 8635 + }, + { + "epoch": 0.4813555543169277, + "grad_norm": 0.5699672102928162, + "learning_rate": 5.4317622737905413e-05, + "loss": 1.4929, + "step": 8636 + }, + { + "epoch": 0.48141129257009085, + "grad_norm": 0.565059244632721, + "learning_rate": 5.4308751693616975e-05, + "loss": 1.7861, + "step": 8637 + }, + { + "epoch": 0.481467030823254, + "grad_norm": 0.5427149534225464, + "learning_rate": 5.429988051268006e-05, + "loss": 1.6655, + "step": 8638 + }, + { + "epoch": 0.48152276907641717, + "grad_norm": 0.5943994522094727, + "learning_rate": 5.429100919537597e-05, + "loss": 1.8461, + "step": 8639 + }, + { + "epoch": 0.4815785073295803, + "grad_norm": 0.5920754671096802, + "learning_rate": 5.4282137741986125e-05, + "loss": 1.9077, + "step": 8640 + }, + { + "epoch": 0.4816342455827434, + "grad_norm": 0.5471158623695374, + "learning_rate": 5.427326615279182e-05, + "loss": 1.6468, + "step": 8641 + }, + { + "epoch": 0.4816899838359066, + "grad_norm": 0.5595037341117859, + "learning_rate": 5.426439442807444e-05, + "loss": 1.7315, + "step": 8642 + }, + { + "epoch": 0.48174572208906974, + "grad_norm": 0.5808396935462952, + "learning_rate": 5.4255522568115314e-05, + "loss": 1.8597, + "step": 8643 + }, + { + "epoch": 0.48180146034223287, + "grad_norm": 0.5106577277183533, + "learning_rate": 5.424665057319584e-05, + "loss": 1.4579, + "step": 8644 + }, + { + "epoch": 0.481857198595396, + "grad_norm": 0.5588060617446899, + "learning_rate": 5.4237778443597366e-05, + "loss": 1.7045, + "step": 8645 + }, + { + "epoch": 0.4819129368485592, + "grad_norm": 0.5763769149780273, + "learning_rate": 5.4228906179601256e-05, + "loss": 1.7194, + "step": 8646 + }, + { + "epoch": 0.4819686751017223, + "grad_norm": 0.5877617597579956, + "learning_rate": 5.42200337814889e-05, + "loss": 1.8115, + "step": 8647 + }, + { + "epoch": 0.48202441335488544, + "grad_norm": 0.588557779788971, + "learning_rate": 5.421116124954169e-05, + "loss": 1.7122, + "step": 8648 + }, + { + "epoch": 0.4820801516080486, + "grad_norm": 0.5687382221221924, + "learning_rate": 5.4202288584040996e-05, + "loss": 1.6734, + "step": 8649 + }, + { + "epoch": 0.48213588986121175, + "grad_norm": 0.5797961950302124, + "learning_rate": 5.4193415785268195e-05, + "loss": 1.9098, + "step": 8650 + }, + { + "epoch": 0.4821916281143749, + "grad_norm": 0.5459732413291931, + "learning_rate": 5.418454285350472e-05, + "loss": 1.5751, + "step": 8651 + }, + { + "epoch": 0.48224736636753807, + "grad_norm": 0.6237668991088867, + "learning_rate": 5.4175669789031904e-05, + "loss": 1.9574, + "step": 8652 + }, + { + "epoch": 0.4823031046207012, + "grad_norm": 0.5237795114517212, + "learning_rate": 5.4166796592131216e-05, + "loss": 1.6274, + "step": 8653 + }, + { + "epoch": 0.4823588428738643, + "grad_norm": 0.8351784348487854, + "learning_rate": 5.415792326308403e-05, + "loss": 1.6101, + "step": 8654 + }, + { + "epoch": 0.48241458112702745, + "grad_norm": 0.553855836391449, + "learning_rate": 5.414904980217177e-05, + "loss": 1.7006, + "step": 8655 + }, + { + "epoch": 0.48247031938019064, + "grad_norm": 0.5128687620162964, + "learning_rate": 5.414017620967582e-05, + "loss": 1.5782, + "step": 8656 + }, + { + "epoch": 0.48252605763335377, + "grad_norm": 0.5743347406387329, + "learning_rate": 5.4131302485877635e-05, + "loss": 1.8762, + "step": 8657 + }, + { + "epoch": 0.4825817958865169, + "grad_norm": 0.5579991936683655, + "learning_rate": 5.412242863105862e-05, + "loss": 1.6882, + "step": 8658 + }, + { + "epoch": 0.4826375341396801, + "grad_norm": 0.5496572256088257, + "learning_rate": 5.41135546455002e-05, + "loss": 1.6909, + "step": 8659 + }, + { + "epoch": 0.4826932723928432, + "grad_norm": 0.5845061540603638, + "learning_rate": 5.410468052948381e-05, + "loss": 1.8966, + "step": 8660 + }, + { + "epoch": 0.48274901064600634, + "grad_norm": 0.5628004670143127, + "learning_rate": 5.409580628329088e-05, + "loss": 1.6114, + "step": 8661 + }, + { + "epoch": 0.4828047488991695, + "grad_norm": 0.52235347032547, + "learning_rate": 5.408693190720288e-05, + "loss": 1.4296, + "step": 8662 + }, + { + "epoch": 0.48286048715233265, + "grad_norm": 0.5655858516693115, + "learning_rate": 5.40780574015012e-05, + "loss": 1.7761, + "step": 8663 + }, + { + "epoch": 0.4829162254054958, + "grad_norm": 0.5697308778762817, + "learning_rate": 5.406918276646733e-05, + "loss": 1.7426, + "step": 8664 + }, + { + "epoch": 0.48297196365865896, + "grad_norm": 0.5626512169837952, + "learning_rate": 5.40603080023827e-05, + "loss": 1.5949, + "step": 8665 + }, + { + "epoch": 0.4830277019118221, + "grad_norm": 0.6178479194641113, + "learning_rate": 5.405143310952878e-05, + "loss": 1.9571, + "step": 8666 + }, + { + "epoch": 0.4830834401649852, + "grad_norm": 0.6123231053352356, + "learning_rate": 5.4042558088187014e-05, + "loss": 1.9154, + "step": 8667 + }, + { + "epoch": 0.48313917841814835, + "grad_norm": 0.5526097416877747, + "learning_rate": 5.40336829386389e-05, + "loss": 1.5508, + "step": 8668 + }, + { + "epoch": 0.48319491667131154, + "grad_norm": 0.5456022620201111, + "learning_rate": 5.4024807661165855e-05, + "loss": 1.5887, + "step": 8669 + }, + { + "epoch": 0.48325065492447467, + "grad_norm": 0.49078524112701416, + "learning_rate": 5.4015932256049386e-05, + "loss": 1.5876, + "step": 8670 + }, + { + "epoch": 0.4833063931776378, + "grad_norm": 0.5714897513389587, + "learning_rate": 5.4007056723570956e-05, + "loss": 1.8633, + "step": 8671 + }, + { + "epoch": 0.483362131430801, + "grad_norm": 0.6069988012313843, + "learning_rate": 5.399818106401206e-05, + "loss": 1.7922, + "step": 8672 + }, + { + "epoch": 0.4834178696839641, + "grad_norm": 0.5466931462287903, + "learning_rate": 5.3989305277654156e-05, + "loss": 1.7496, + "step": 8673 + }, + { + "epoch": 0.48347360793712724, + "grad_norm": 0.562350869178772, + "learning_rate": 5.398042936477875e-05, + "loss": 1.6191, + "step": 8674 + }, + { + "epoch": 0.4835293461902904, + "grad_norm": 0.5562702417373657, + "learning_rate": 5.397155332566736e-05, + "loss": 1.8695, + "step": 8675 + }, + { + "epoch": 0.48358508444345355, + "grad_norm": 0.598784863948822, + "learning_rate": 5.3962677160601426e-05, + "loss": 1.5275, + "step": 8676 + }, + { + "epoch": 0.4836408226966167, + "grad_norm": 0.5225400924682617, + "learning_rate": 5.395380086986249e-05, + "loss": 1.4847, + "step": 8677 + }, + { + "epoch": 0.4836965609497798, + "grad_norm": 0.58516925573349, + "learning_rate": 5.3944924453732014e-05, + "loss": 1.652, + "step": 8678 + }, + { + "epoch": 0.483752299202943, + "grad_norm": 0.5312181115150452, + "learning_rate": 5.3936047912491574e-05, + "loss": 1.356, + "step": 8679 + }, + { + "epoch": 0.4838080374561061, + "grad_norm": 0.5645095109939575, + "learning_rate": 5.3927171246422615e-05, + "loss": 1.7965, + "step": 8680 + }, + { + "epoch": 0.48386377570926925, + "grad_norm": 0.5576086044311523, + "learning_rate": 5.39182944558067e-05, + "loss": 1.6595, + "step": 8681 + }, + { + "epoch": 0.48391951396243243, + "grad_norm": 0.5667631030082703, + "learning_rate": 5.390941754092532e-05, + "loss": 1.6973, + "step": 8682 + }, + { + "epoch": 0.48397525221559556, + "grad_norm": 0.5693982243537903, + "learning_rate": 5.3900540502060015e-05, + "loss": 1.6383, + "step": 8683 + }, + { + "epoch": 0.4840309904687587, + "grad_norm": 0.5972820520401001, + "learning_rate": 5.3891663339492306e-05, + "loss": 1.73, + "step": 8684 + }, + { + "epoch": 0.4840867287219219, + "grad_norm": 0.5453163385391235, + "learning_rate": 5.388278605350372e-05, + "loss": 1.5295, + "step": 8685 + }, + { + "epoch": 0.484142466975085, + "grad_norm": 0.5659864544868469, + "learning_rate": 5.38739086443758e-05, + "loss": 1.6765, + "step": 8686 + }, + { + "epoch": 0.48419820522824814, + "grad_norm": 0.5438006520271301, + "learning_rate": 5.386503111239008e-05, + "loss": 1.5357, + "step": 8687 + }, + { + "epoch": 0.4842539434814113, + "grad_norm": 0.5650402903556824, + "learning_rate": 5.385615345782813e-05, + "loss": 1.7396, + "step": 8688 + }, + { + "epoch": 0.48430968173457445, + "grad_norm": 0.5356137156486511, + "learning_rate": 5.3847275680971454e-05, + "loss": 1.7116, + "step": 8689 + }, + { + "epoch": 0.4843654199877376, + "grad_norm": 0.5687363743782043, + "learning_rate": 5.383839778210163e-05, + "loss": 1.6747, + "step": 8690 + }, + { + "epoch": 0.4844211582409007, + "grad_norm": 0.5704367756843567, + "learning_rate": 5.38295197615002e-05, + "loss": 1.5563, + "step": 8691 + }, + { + "epoch": 0.4844768964940639, + "grad_norm": 0.6154001355171204, + "learning_rate": 5.382064161944874e-05, + "loss": 2.1129, + "step": 8692 + }, + { + "epoch": 0.484532634747227, + "grad_norm": 0.5885458588600159, + "learning_rate": 5.3811763356228804e-05, + "loss": 1.6652, + "step": 8693 + }, + { + "epoch": 0.48458837300039015, + "grad_norm": 0.5427495837211609, + "learning_rate": 5.3802884972121955e-05, + "loss": 1.7085, + "step": 8694 + }, + { + "epoch": 0.48464411125355333, + "grad_norm": 0.5415340065956116, + "learning_rate": 5.379400646740977e-05, + "loss": 1.7126, + "step": 8695 + }, + { + "epoch": 0.48469984950671646, + "grad_norm": 0.50815749168396, + "learning_rate": 5.3785127842373814e-05, + "loss": 1.7257, + "step": 8696 + }, + { + "epoch": 0.4847555877598796, + "grad_norm": 0.5710844397544861, + "learning_rate": 5.3776249097295696e-05, + "loss": 1.6778, + "step": 8697 + }, + { + "epoch": 0.4848113260130428, + "grad_norm": 0.5827280282974243, + "learning_rate": 5.376737023245695e-05, + "loss": 1.717, + "step": 8698 + }, + { + "epoch": 0.4848670642662059, + "grad_norm": 0.6222889423370361, + "learning_rate": 5.375849124813919e-05, + "loss": 1.9998, + "step": 8699 + }, + { + "epoch": 0.48492280251936903, + "grad_norm": 0.5893861651420593, + "learning_rate": 5.3749612144623995e-05, + "loss": 1.9211, + "step": 8700 + }, + { + "epoch": 0.48497854077253216, + "grad_norm": 0.5538213849067688, + "learning_rate": 5.374073292219297e-05, + "loss": 1.7934, + "step": 8701 + }, + { + "epoch": 0.48503427902569535, + "grad_norm": 0.5892875790596008, + "learning_rate": 5.3731853581127714e-05, + "loss": 1.8932, + "step": 8702 + }, + { + "epoch": 0.4850900172788585, + "grad_norm": 0.5553523302078247, + "learning_rate": 5.3722974121709815e-05, + "loss": 1.7465, + "step": 8703 + }, + { + "epoch": 0.4851457555320216, + "grad_norm": 0.57076096534729, + "learning_rate": 5.371409454422087e-05, + "loss": 1.7025, + "step": 8704 + }, + { + "epoch": 0.4852014937851848, + "grad_norm": 0.5483660101890564, + "learning_rate": 5.370521484894252e-05, + "loss": 1.6435, + "step": 8705 + }, + { + "epoch": 0.4852572320383479, + "grad_norm": 0.5742903351783752, + "learning_rate": 5.3696335036156345e-05, + "loss": 1.7067, + "step": 8706 + }, + { + "epoch": 0.48531297029151105, + "grad_norm": 0.5819395184516907, + "learning_rate": 5.368745510614399e-05, + "loss": 1.6528, + "step": 8707 + }, + { + "epoch": 0.48536870854467423, + "grad_norm": 0.5477610230445862, + "learning_rate": 5.367857505918704e-05, + "loss": 1.8253, + "step": 8708 + }, + { + "epoch": 0.48542444679783736, + "grad_norm": 0.6026375889778137, + "learning_rate": 5.3669694895567145e-05, + "loss": 1.8483, + "step": 8709 + }, + { + "epoch": 0.4854801850510005, + "grad_norm": 0.49743878841400146, + "learning_rate": 5.366081461556593e-05, + "loss": 1.4705, + "step": 8710 + }, + { + "epoch": 0.4855359233041637, + "grad_norm": 0.5510653853416443, + "learning_rate": 5.365193421946502e-05, + "loss": 1.4843, + "step": 8711 + }, + { + "epoch": 0.4855916615573268, + "grad_norm": 0.5583814978599548, + "learning_rate": 5.3643053707546034e-05, + "loss": 1.6045, + "step": 8712 + }, + { + "epoch": 0.48564739981048993, + "grad_norm": 0.5511784553527832, + "learning_rate": 5.363417308009062e-05, + "loss": 1.7184, + "step": 8713 + }, + { + "epoch": 0.48570313806365306, + "grad_norm": 0.5590716600418091, + "learning_rate": 5.362529233738045e-05, + "loss": 1.6326, + "step": 8714 + }, + { + "epoch": 0.48575887631681625, + "grad_norm": 0.564095139503479, + "learning_rate": 5.361641147969713e-05, + "loss": 1.6036, + "step": 8715 + }, + { + "epoch": 0.4858146145699794, + "grad_norm": 0.6147303581237793, + "learning_rate": 5.3607530507322334e-05, + "loss": 1.8542, + "step": 8716 + }, + { + "epoch": 0.4858703528231425, + "grad_norm": 0.556438684463501, + "learning_rate": 5.3598649420537675e-05, + "loss": 1.6413, + "step": 8717 + }, + { + "epoch": 0.4859260910763057, + "grad_norm": 0.5851439237594604, + "learning_rate": 5.358976821962487e-05, + "loss": 1.7414, + "step": 8718 + }, + { + "epoch": 0.4859818293294688, + "grad_norm": 0.5886179804801941, + "learning_rate": 5.358088690486553e-05, + "loss": 1.623, + "step": 8719 + }, + { + "epoch": 0.48603756758263195, + "grad_norm": 0.5328960418701172, + "learning_rate": 5.357200547654134e-05, + "loss": 1.4861, + "step": 8720 + }, + { + "epoch": 0.48609330583579513, + "grad_norm": 0.5452643036842346, + "learning_rate": 5.356312393493396e-05, + "loss": 1.763, + "step": 8721 + }, + { + "epoch": 0.48614904408895826, + "grad_norm": 0.5395748019218445, + "learning_rate": 5.3554242280325064e-05, + "loss": 1.4284, + "step": 8722 + }, + { + "epoch": 0.4862047823421214, + "grad_norm": 0.6557826399803162, + "learning_rate": 5.354536051299634e-05, + "loss": 1.8725, + "step": 8723 + }, + { + "epoch": 0.4862605205952845, + "grad_norm": 0.5590106248855591, + "learning_rate": 5.353647863322943e-05, + "loss": 1.6673, + "step": 8724 + }, + { + "epoch": 0.4863162588484477, + "grad_norm": 0.560207188129425, + "learning_rate": 5.3527596641306034e-05, + "loss": 1.7026, + "step": 8725 + }, + { + "epoch": 0.48637199710161083, + "grad_norm": 0.54021817445755, + "learning_rate": 5.3518714537507855e-05, + "loss": 1.3786, + "step": 8726 + }, + { + "epoch": 0.48642773535477396, + "grad_norm": 0.5303489565849304, + "learning_rate": 5.350983232211657e-05, + "loss": 1.5461, + "step": 8727 + }, + { + "epoch": 0.48648347360793714, + "grad_norm": 0.5234289169311523, + "learning_rate": 5.350094999541385e-05, + "loss": 1.8215, + "step": 8728 + }, + { + "epoch": 0.4865392118611003, + "grad_norm": 0.6171209216117859, + "learning_rate": 5.349206755768142e-05, + "loss": 1.6419, + "step": 8729 + }, + { + "epoch": 0.4865949501142634, + "grad_norm": 0.5630922317504883, + "learning_rate": 5.3483185009200955e-05, + "loss": 1.7303, + "step": 8730 + }, + { + "epoch": 0.4866506883674266, + "grad_norm": 0.5881733298301697, + "learning_rate": 5.347430235025419e-05, + "loss": 1.8506, + "step": 8731 + }, + { + "epoch": 0.4867064266205897, + "grad_norm": 0.5110684633255005, + "learning_rate": 5.34654195811228e-05, + "loss": 1.4549, + "step": 8732 + }, + { + "epoch": 0.48676216487375285, + "grad_norm": 0.5621329545974731, + "learning_rate": 5.345653670208851e-05, + "loss": 1.6001, + "step": 8733 + }, + { + "epoch": 0.48681790312691603, + "grad_norm": 0.5230090022087097, + "learning_rate": 5.344765371343302e-05, + "loss": 1.7102, + "step": 8734 + }, + { + "epoch": 0.48687364138007916, + "grad_norm": 0.5325090289115906, + "learning_rate": 5.343877061543806e-05, + "loss": 1.5661, + "step": 8735 + }, + { + "epoch": 0.4869293796332423, + "grad_norm": 0.5863301753997803, + "learning_rate": 5.342988740838535e-05, + "loss": 1.9036, + "step": 8736 + }, + { + "epoch": 0.4869851178864054, + "grad_norm": 0.5872917175292969, + "learning_rate": 5.342100409255659e-05, + "loss": 1.8516, + "step": 8737 + }, + { + "epoch": 0.4870408561395686, + "grad_norm": 0.5677287578582764, + "learning_rate": 5.341212066823355e-05, + "loss": 1.5462, + "step": 8738 + }, + { + "epoch": 0.48709659439273173, + "grad_norm": 0.5717810392379761, + "learning_rate": 5.340323713569792e-05, + "loss": 1.7118, + "step": 8739 + }, + { + "epoch": 0.48715233264589486, + "grad_norm": 0.5940883159637451, + "learning_rate": 5.339435349523148e-05, + "loss": 1.8225, + "step": 8740 + }, + { + "epoch": 0.48720807089905804, + "grad_norm": 0.6162937879562378, + "learning_rate": 5.33854697471159e-05, + "loss": 1.9512, + "step": 8741 + }, + { + "epoch": 0.4872638091522212, + "grad_norm": 0.5418954491615295, + "learning_rate": 5.337658589163299e-05, + "loss": 1.6836, + "step": 8742 + }, + { + "epoch": 0.4873195474053843, + "grad_norm": 0.5783557295799255, + "learning_rate": 5.3367701929064426e-05, + "loss": 1.709, + "step": 8743 + }, + { + "epoch": 0.4873752856585475, + "grad_norm": 0.5385530591011047, + "learning_rate": 5.3358817859692025e-05, + "loss": 1.5885, + "step": 8744 + }, + { + "epoch": 0.4874310239117106, + "grad_norm": 0.5666008591651917, + "learning_rate": 5.334993368379748e-05, + "loss": 1.6946, + "step": 8745 + }, + { + "epoch": 0.48748676216487374, + "grad_norm": 0.549767255783081, + "learning_rate": 5.3341049401662594e-05, + "loss": 1.5776, + "step": 8746 + }, + { + "epoch": 0.4875425004180369, + "grad_norm": 0.5610424280166626, + "learning_rate": 5.333216501356909e-05, + "loss": 1.6057, + "step": 8747 + }, + { + "epoch": 0.48759823867120006, + "grad_norm": 0.5643283724784851, + "learning_rate": 5.332328051979873e-05, + "loss": 1.7629, + "step": 8748 + }, + { + "epoch": 0.4876539769243632, + "grad_norm": 0.5474547743797302, + "learning_rate": 5.3314395920633306e-05, + "loss": 1.7972, + "step": 8749 + }, + { + "epoch": 0.4877097151775263, + "grad_norm": 0.56900554895401, + "learning_rate": 5.330551121635454e-05, + "loss": 1.7521, + "step": 8750 + }, + { + "epoch": 0.4877654534306895, + "grad_norm": 0.6560434103012085, + "learning_rate": 5.329662640724426e-05, + "loss": 1.4613, + "step": 8751 + }, + { + "epoch": 0.48782119168385263, + "grad_norm": 0.5190215110778809, + "learning_rate": 5.32877414935842e-05, + "loss": 1.4367, + "step": 8752 + }, + { + "epoch": 0.48787692993701576, + "grad_norm": 0.5503537058830261, + "learning_rate": 5.3278856475656144e-05, + "loss": 1.649, + "step": 8753 + }, + { + "epoch": 0.48793266819017894, + "grad_norm": 0.5634624361991882, + "learning_rate": 5.326997135374189e-05, + "loss": 1.9406, + "step": 8754 + }, + { + "epoch": 0.48798840644334207, + "grad_norm": 0.5632345676422119, + "learning_rate": 5.3261086128123206e-05, + "loss": 1.6661, + "step": 8755 + }, + { + "epoch": 0.4880441446965052, + "grad_norm": 0.6362982392311096, + "learning_rate": 5.3252200799081875e-05, + "loss": 1.9258, + "step": 8756 + }, + { + "epoch": 0.4880998829496684, + "grad_norm": 0.5737461447715759, + "learning_rate": 5.3243315366899694e-05, + "loss": 1.6868, + "step": 8757 + }, + { + "epoch": 0.4881556212028315, + "grad_norm": 0.5335796475410461, + "learning_rate": 5.3234429831858466e-05, + "loss": 1.4586, + "step": 8758 + }, + { + "epoch": 0.48821135945599464, + "grad_norm": 0.5574231743812561, + "learning_rate": 5.3225544194239984e-05, + "loss": 1.6262, + "step": 8759 + }, + { + "epoch": 0.48826709770915777, + "grad_norm": 0.5251532196998596, + "learning_rate": 5.3216658454326043e-05, + "loss": 1.5789, + "step": 8760 + }, + { + "epoch": 0.48832283596232096, + "grad_norm": 0.5983790159225464, + "learning_rate": 5.3207772612398444e-05, + "loss": 1.8751, + "step": 8761 + }, + { + "epoch": 0.4883785742154841, + "grad_norm": 0.5940685272216797, + "learning_rate": 5.319888666873902e-05, + "loss": 1.5181, + "step": 8762 + }, + { + "epoch": 0.4884343124686472, + "grad_norm": 0.5403158664703369, + "learning_rate": 5.319000062362953e-05, + "loss": 1.6698, + "step": 8763 + }, + { + "epoch": 0.4884900507218104, + "grad_norm": 0.5441331267356873, + "learning_rate": 5.318111447735186e-05, + "loss": 1.6822, + "step": 8764 + }, + { + "epoch": 0.4885457889749735, + "grad_norm": 0.6151909232139587, + "learning_rate": 5.317222823018775e-05, + "loss": 1.8201, + "step": 8765 + }, + { + "epoch": 0.48860152722813666, + "grad_norm": 0.5616387724876404, + "learning_rate": 5.316334188241908e-05, + "loss": 1.705, + "step": 8766 + }, + { + "epoch": 0.48865726548129984, + "grad_norm": 0.570561408996582, + "learning_rate": 5.3154455434327634e-05, + "loss": 1.7352, + "step": 8767 + }, + { + "epoch": 0.48871300373446297, + "grad_norm": 0.5549841523170471, + "learning_rate": 5.314556888619527e-05, + "loss": 1.7109, + "step": 8768 + }, + { + "epoch": 0.4887687419876261, + "grad_norm": 0.6028071045875549, + "learning_rate": 5.313668223830378e-05, + "loss": 1.7114, + "step": 8769 + }, + { + "epoch": 0.4888244802407892, + "grad_norm": 0.563991129398346, + "learning_rate": 5.312779549093503e-05, + "loss": 1.5484, + "step": 8770 + }, + { + "epoch": 0.4888802184939524, + "grad_norm": 0.5773816108703613, + "learning_rate": 5.3118908644370834e-05, + "loss": 1.7072, + "step": 8771 + }, + { + "epoch": 0.48893595674711554, + "grad_norm": 0.5592569708824158, + "learning_rate": 5.3110021698893053e-05, + "loss": 1.7843, + "step": 8772 + }, + { + "epoch": 0.48899169500027867, + "grad_norm": 0.5349111557006836, + "learning_rate": 5.310113465478351e-05, + "loss": 1.5887, + "step": 8773 + }, + { + "epoch": 0.48904743325344185, + "grad_norm": 0.5708144903182983, + "learning_rate": 5.309224751232406e-05, + "loss": 1.5671, + "step": 8774 + }, + { + "epoch": 0.489103171506605, + "grad_norm": 0.5695350766181946, + "learning_rate": 5.308336027179655e-05, + "loss": 1.8061, + "step": 8775 + }, + { + "epoch": 0.4891589097597681, + "grad_norm": 0.5757440328598022, + "learning_rate": 5.307447293348281e-05, + "loss": 1.7021, + "step": 8776 + }, + { + "epoch": 0.4892146480129313, + "grad_norm": 0.5219387412071228, + "learning_rate": 5.306558549766473e-05, + "loss": 1.5089, + "step": 8777 + }, + { + "epoch": 0.4892703862660944, + "grad_norm": 0.5836179256439209, + "learning_rate": 5.305669796462415e-05, + "loss": 1.764, + "step": 8778 + }, + { + "epoch": 0.48932612451925755, + "grad_norm": 0.5617983341217041, + "learning_rate": 5.3047810334642935e-05, + "loss": 1.751, + "step": 8779 + }, + { + "epoch": 0.48938186277242074, + "grad_norm": 0.5990623831748962, + "learning_rate": 5.303892260800294e-05, + "loss": 1.7939, + "step": 8780 + }, + { + "epoch": 0.48943760102558387, + "grad_norm": 0.5625554323196411, + "learning_rate": 5.303003478498605e-05, + "loss": 1.8436, + "step": 8781 + }, + { + "epoch": 0.489493339278747, + "grad_norm": 0.6201027631759644, + "learning_rate": 5.3021146865874117e-05, + "loss": 1.7894, + "step": 8782 + }, + { + "epoch": 0.4895490775319101, + "grad_norm": 0.5482053160667419, + "learning_rate": 5.301225885094902e-05, + "loss": 1.7486, + "step": 8783 + }, + { + "epoch": 0.4896048157850733, + "grad_norm": 0.5940152406692505, + "learning_rate": 5.300337074049262e-05, + "loss": 1.7971, + "step": 8784 + }, + { + "epoch": 0.48966055403823644, + "grad_norm": 0.49621883034706116, + "learning_rate": 5.299448253478683e-05, + "loss": 1.6085, + "step": 8785 + }, + { + "epoch": 0.48971629229139957, + "grad_norm": 0.5509806275367737, + "learning_rate": 5.29855942341135e-05, + "loss": 1.8445, + "step": 8786 + }, + { + "epoch": 0.48977203054456275, + "grad_norm": 0.5669719576835632, + "learning_rate": 5.297670583875454e-05, + "loss": 1.7854, + "step": 8787 + }, + { + "epoch": 0.4898277687977259, + "grad_norm": 0.5512406826019287, + "learning_rate": 5.296781734899182e-05, + "loss": 1.4982, + "step": 8788 + }, + { + "epoch": 0.489883507050889, + "grad_norm": 0.56741863489151, + "learning_rate": 5.295892876510723e-05, + "loss": 1.7415, + "step": 8789 + }, + { + "epoch": 0.4899392453040522, + "grad_norm": 0.5425149202346802, + "learning_rate": 5.295004008738268e-05, + "loss": 1.5488, + "step": 8790 + }, + { + "epoch": 0.4899949835572153, + "grad_norm": 0.5617731213569641, + "learning_rate": 5.294115131610006e-05, + "loss": 1.7582, + "step": 8791 + }, + { + "epoch": 0.49005072181037845, + "grad_norm": 0.5693073868751526, + "learning_rate": 5.293226245154127e-05, + "loss": 1.5738, + "step": 8792 + }, + { + "epoch": 0.4901064600635416, + "grad_norm": 0.6429868340492249, + "learning_rate": 5.292337349398821e-05, + "loss": 1.7709, + "step": 8793 + }, + { + "epoch": 0.49016219831670477, + "grad_norm": 0.568608283996582, + "learning_rate": 5.291448444372279e-05, + "loss": 1.5022, + "step": 8794 + }, + { + "epoch": 0.4902179365698679, + "grad_norm": 0.5543949604034424, + "learning_rate": 5.29055953010269e-05, + "loss": 1.7136, + "step": 8795 + }, + { + "epoch": 0.490273674823031, + "grad_norm": 0.5077717900276184, + "learning_rate": 5.289670606618248e-05, + "loss": 1.5791, + "step": 8796 + }, + { + "epoch": 0.4903294130761942, + "grad_norm": 0.5588290691375732, + "learning_rate": 5.288781673947143e-05, + "loss": 1.7905, + "step": 8797 + }, + { + "epoch": 0.49038515132935734, + "grad_norm": 0.5637931823730469, + "learning_rate": 5.2878927321175676e-05, + "loss": 1.7184, + "step": 8798 + }, + { + "epoch": 0.49044088958252047, + "grad_norm": 0.5664627552032471, + "learning_rate": 5.2870037811577125e-05, + "loss": 1.5013, + "step": 8799 + }, + { + "epoch": 0.49049662783568365, + "grad_norm": 0.5796491503715515, + "learning_rate": 5.28611482109577e-05, + "loss": 1.7939, + "step": 8800 + }, + { + "epoch": 0.4905523660888468, + "grad_norm": 0.556143045425415, + "learning_rate": 5.2852258519599365e-05, + "loss": 1.5717, + "step": 8801 + }, + { + "epoch": 0.4906081043420099, + "grad_norm": 0.5120705366134644, + "learning_rate": 5.284336873778398e-05, + "loss": 1.5725, + "step": 8802 + }, + { + "epoch": 0.4906638425951731, + "grad_norm": 0.5616738200187683, + "learning_rate": 5.2834478865793545e-05, + "loss": 1.5918, + "step": 8803 + }, + { + "epoch": 0.4907195808483362, + "grad_norm": 0.5868408679962158, + "learning_rate": 5.282558890390995e-05, + "loss": 1.7262, + "step": 8804 + }, + { + "epoch": 0.49077531910149935, + "grad_norm": 0.5609720945358276, + "learning_rate": 5.281669885241517e-05, + "loss": 1.6374, + "step": 8805 + }, + { + "epoch": 0.4908310573546625, + "grad_norm": 0.5879573225975037, + "learning_rate": 5.280780871159111e-05, + "loss": 1.7363, + "step": 8806 + }, + { + "epoch": 0.49088679560782567, + "grad_norm": 0.5944104790687561, + "learning_rate": 5.279891848171974e-05, + "loss": 1.8078, + "step": 8807 + }, + { + "epoch": 0.4909425338609888, + "grad_norm": 0.5318206548690796, + "learning_rate": 5.2790028163082985e-05, + "loss": 1.5397, + "step": 8808 + }, + { + "epoch": 0.4909982721141519, + "grad_norm": 0.542536199092865, + "learning_rate": 5.2781137755962794e-05, + "loss": 1.6362, + "step": 8809 + }, + { + "epoch": 0.4910540103673151, + "grad_norm": 0.5784698128700256, + "learning_rate": 5.2772247260641136e-05, + "loss": 1.765, + "step": 8810 + }, + { + "epoch": 0.49110974862047824, + "grad_norm": 0.5454279184341431, + "learning_rate": 5.276335667739998e-05, + "loss": 1.7014, + "step": 8811 + }, + { + "epoch": 0.49116548687364137, + "grad_norm": 0.519689679145813, + "learning_rate": 5.275446600652123e-05, + "loss": 1.7533, + "step": 8812 + }, + { + "epoch": 0.49122122512680455, + "grad_norm": 0.7089325785636902, + "learning_rate": 5.2745575248286895e-05, + "loss": 2.1051, + "step": 8813 + }, + { + "epoch": 0.4912769633799677, + "grad_norm": 0.5588321089744568, + "learning_rate": 5.273668440297892e-05, + "loss": 1.6069, + "step": 8814 + }, + { + "epoch": 0.4913327016331308, + "grad_norm": 0.5273601412773132, + "learning_rate": 5.272779347087925e-05, + "loss": 1.4399, + "step": 8815 + }, + { + "epoch": 0.49138843988629394, + "grad_norm": 0.5443345904350281, + "learning_rate": 5.27189024522699e-05, + "loss": 1.5401, + "step": 8816 + }, + { + "epoch": 0.4914441781394571, + "grad_norm": 0.5727609395980835, + "learning_rate": 5.271001134743281e-05, + "loss": 1.6588, + "step": 8817 + }, + { + "epoch": 0.49149991639262025, + "grad_norm": 0.5712710618972778, + "learning_rate": 5.270112015664997e-05, + "loss": 1.7393, + "step": 8818 + }, + { + "epoch": 0.4915556546457834, + "grad_norm": 0.5474506616592407, + "learning_rate": 5.2692228880203333e-05, + "loss": 1.6144, + "step": 8819 + }, + { + "epoch": 0.49161139289894656, + "grad_norm": 0.5622429251670837, + "learning_rate": 5.2683337518374906e-05, + "loss": 1.6107, + "step": 8820 + }, + { + "epoch": 0.4916671311521097, + "grad_norm": 0.5528522729873657, + "learning_rate": 5.267444607144665e-05, + "loss": 1.5545, + "step": 8821 + }, + { + "epoch": 0.4917228694052728, + "grad_norm": 0.5275382995605469, + "learning_rate": 5.2665554539700554e-05, + "loss": 1.6128, + "step": 8822 + }, + { + "epoch": 0.491778607658436, + "grad_norm": 0.6423818469047546, + "learning_rate": 5.265666292341861e-05, + "loss": 2.064, + "step": 8823 + }, + { + "epoch": 0.49183434591159914, + "grad_norm": 0.5372768640518188, + "learning_rate": 5.26477712228828e-05, + "loss": 1.2805, + "step": 8824 + }, + { + "epoch": 0.49189008416476226, + "grad_norm": 0.600679337978363, + "learning_rate": 5.2638879438375144e-05, + "loss": 1.8211, + "step": 8825 + }, + { + "epoch": 0.49194582241792545, + "grad_norm": 0.5628047585487366, + "learning_rate": 5.2629987570177606e-05, + "loss": 1.6321, + "step": 8826 + }, + { + "epoch": 0.4920015606710886, + "grad_norm": 0.600486695766449, + "learning_rate": 5.262109561857221e-05, + "loss": 1.782, + "step": 8827 + }, + { + "epoch": 0.4920572989242517, + "grad_norm": 0.5375781655311584, + "learning_rate": 5.261220358384091e-05, + "loss": 1.5132, + "step": 8828 + }, + { + "epoch": 0.49211303717741484, + "grad_norm": 0.5441939830780029, + "learning_rate": 5.260331146626578e-05, + "loss": 1.4457, + "step": 8829 + }, + { + "epoch": 0.492168775430578, + "grad_norm": 0.5390109419822693, + "learning_rate": 5.259441926612877e-05, + "loss": 1.6268, + "step": 8830 + }, + { + "epoch": 0.49222451368374115, + "grad_norm": 0.5406618714332581, + "learning_rate": 5.2585526983711916e-05, + "loss": 1.5747, + "step": 8831 + }, + { + "epoch": 0.4922802519369043, + "grad_norm": 0.5526447296142578, + "learning_rate": 5.2576634619297216e-05, + "loss": 1.6989, + "step": 8832 + }, + { + "epoch": 0.49233599019006746, + "grad_norm": 0.5135407447814941, + "learning_rate": 5.256774217316669e-05, + "loss": 1.4546, + "step": 8833 + }, + { + "epoch": 0.4923917284432306, + "grad_norm": 0.5286427736282349, + "learning_rate": 5.255884964560235e-05, + "loss": 1.6071, + "step": 8834 + }, + { + "epoch": 0.4924474666963937, + "grad_norm": 0.5706698894500732, + "learning_rate": 5.254995703688621e-05, + "loss": 1.7096, + "step": 8835 + }, + { + "epoch": 0.4925032049495569, + "grad_norm": 0.5597012042999268, + "learning_rate": 5.2541064347300306e-05, + "loss": 1.6175, + "step": 8836 + }, + { + "epoch": 0.49255894320272003, + "grad_norm": 0.4902280271053314, + "learning_rate": 5.253217157712666e-05, + "loss": 1.2836, + "step": 8837 + }, + { + "epoch": 0.49261468145588316, + "grad_norm": 0.598961591720581, + "learning_rate": 5.2523278726647304e-05, + "loss": 1.7038, + "step": 8838 + }, + { + "epoch": 0.4926704197090463, + "grad_norm": 1.2628682851791382, + "learning_rate": 5.251438579614425e-05, + "loss": 1.8079, + "step": 8839 + }, + { + "epoch": 0.4927261579622095, + "grad_norm": 0.5793728232383728, + "learning_rate": 5.250549278589955e-05, + "loss": 1.8102, + "step": 8840 + }, + { + "epoch": 0.4927818962153726, + "grad_norm": 0.5742671489715576, + "learning_rate": 5.249659969619519e-05, + "loss": 1.6611, + "step": 8841 + }, + { + "epoch": 0.49283763446853573, + "grad_norm": 0.5438802242279053, + "learning_rate": 5.248770652731327e-05, + "loss": 1.5826, + "step": 8842 + }, + { + "epoch": 0.4928933727216989, + "grad_norm": 0.553573727607727, + "learning_rate": 5.247881327953581e-05, + "loss": 1.5787, + "step": 8843 + }, + { + "epoch": 0.49294911097486205, + "grad_norm": 0.5531934499740601, + "learning_rate": 5.246991995314484e-05, + "loss": 1.7769, + "step": 8844 + }, + { + "epoch": 0.4930048492280252, + "grad_norm": 0.5669671893119812, + "learning_rate": 5.24610265484224e-05, + "loss": 1.6973, + "step": 8845 + }, + { + "epoch": 0.49306058748118836, + "grad_norm": 0.5406858921051025, + "learning_rate": 5.2452133065650565e-05, + "loss": 1.4484, + "step": 8846 + }, + { + "epoch": 0.4931163257343515, + "grad_norm": 0.6136825084686279, + "learning_rate": 5.2443239505111354e-05, + "loss": 1.7145, + "step": 8847 + }, + { + "epoch": 0.4931720639875146, + "grad_norm": 0.5375277400016785, + "learning_rate": 5.243434586708682e-05, + "loss": 1.5229, + "step": 8848 + }, + { + "epoch": 0.4932278022406778, + "grad_norm": 0.5452854633331299, + "learning_rate": 5.2425452151859045e-05, + "loss": 1.4448, + "step": 8849 + }, + { + "epoch": 0.49328354049384093, + "grad_norm": 0.5728045701980591, + "learning_rate": 5.241655835971006e-05, + "loss": 1.8291, + "step": 8850 + }, + { + "epoch": 0.49333927874700406, + "grad_norm": 0.5290676951408386, + "learning_rate": 5.240766449092194e-05, + "loss": 1.53, + "step": 8851 + }, + { + "epoch": 0.4933950170001672, + "grad_norm": 0.6011704206466675, + "learning_rate": 5.239877054577673e-05, + "loss": 1.7215, + "step": 8852 + }, + { + "epoch": 0.4934507552533304, + "grad_norm": 0.5930907130241394, + "learning_rate": 5.2389876524556526e-05, + "loss": 1.8231, + "step": 8853 + }, + { + "epoch": 0.4935064935064935, + "grad_norm": 0.5788987874984741, + "learning_rate": 5.2380982427543346e-05, + "loss": 1.7529, + "step": 8854 + }, + { + "epoch": 0.49356223175965663, + "grad_norm": 0.5591574311256409, + "learning_rate": 5.23720882550193e-05, + "loss": 1.5894, + "step": 8855 + }, + { + "epoch": 0.4936179700128198, + "grad_norm": 0.6035146117210388, + "learning_rate": 5.2363194007266435e-05, + "loss": 1.811, + "step": 8856 + }, + { + "epoch": 0.49367370826598295, + "grad_norm": 0.5160028338432312, + "learning_rate": 5.2354299684566856e-05, + "loss": 1.6787, + "step": 8857 + }, + { + "epoch": 0.4937294465191461, + "grad_norm": 0.5431737899780273, + "learning_rate": 5.2345405287202596e-05, + "loss": 1.4917, + "step": 8858 + }, + { + "epoch": 0.49378518477230926, + "grad_norm": 0.5381173491477966, + "learning_rate": 5.233651081545577e-05, + "loss": 1.6775, + "step": 8859 + }, + { + "epoch": 0.4938409230254724, + "grad_norm": 0.6041108965873718, + "learning_rate": 5.232761626960844e-05, + "loss": 1.6414, + "step": 8860 + }, + { + "epoch": 0.4938966612786355, + "grad_norm": 0.6218950152397156, + "learning_rate": 5.231872164994268e-05, + "loss": 1.6513, + "step": 8861 + }, + { + "epoch": 0.49395239953179865, + "grad_norm": 0.5222500562667847, + "learning_rate": 5.230982695674059e-05, + "loss": 1.7083, + "step": 8862 + }, + { + "epoch": 0.49400813778496183, + "grad_norm": 0.5420836806297302, + "learning_rate": 5.230093219028427e-05, + "loss": 1.5971, + "step": 8863 + }, + { + "epoch": 0.49406387603812496, + "grad_norm": 0.5384796857833862, + "learning_rate": 5.229203735085579e-05, + "loss": 1.5896, + "step": 8864 + }, + { + "epoch": 0.4941196142912881, + "grad_norm": 0.6375717520713806, + "learning_rate": 5.2283142438737245e-05, + "loss": 1.8503, + "step": 8865 + }, + { + "epoch": 0.4941753525444513, + "grad_norm": 0.5303763151168823, + "learning_rate": 5.227424745421074e-05, + "loss": 1.6416, + "step": 8866 + }, + { + "epoch": 0.4942310907976144, + "grad_norm": 0.5153331756591797, + "learning_rate": 5.2265352397558354e-05, + "loss": 1.3659, + "step": 8867 + }, + { + "epoch": 0.49428682905077753, + "grad_norm": 0.5397130846977234, + "learning_rate": 5.225645726906222e-05, + "loss": 1.5523, + "step": 8868 + }, + { + "epoch": 0.4943425673039407, + "grad_norm": 0.5596987009048462, + "learning_rate": 5.224756206900439e-05, + "loss": 1.7921, + "step": 8869 + }, + { + "epoch": 0.49439830555710385, + "grad_norm": 0.5709193348884583, + "learning_rate": 5.2238666797667026e-05, + "loss": 1.6013, + "step": 8870 + }, + { + "epoch": 0.494454043810267, + "grad_norm": 0.5561599731445312, + "learning_rate": 5.2229771455332176e-05, + "loss": 1.4794, + "step": 8871 + }, + { + "epoch": 0.49450978206343016, + "grad_norm": 0.5445564985275269, + "learning_rate": 5.2220876042281995e-05, + "loss": 1.5029, + "step": 8872 + }, + { + "epoch": 0.4945655203165933, + "grad_norm": 0.5647691488265991, + "learning_rate": 5.2211980558798565e-05, + "loss": 1.7888, + "step": 8873 + }, + { + "epoch": 0.4946212585697564, + "grad_norm": 0.5487396717071533, + "learning_rate": 5.220308500516401e-05, + "loss": 1.6931, + "step": 8874 + }, + { + "epoch": 0.49467699682291955, + "grad_norm": 0.5969203114509583, + "learning_rate": 5.219418938166044e-05, + "loss": 1.6718, + "step": 8875 + }, + { + "epoch": 0.49473273507608273, + "grad_norm": 0.564508855342865, + "learning_rate": 5.218529368856997e-05, + "loss": 1.6968, + "step": 8876 + }, + { + "epoch": 0.49478847332924586, + "grad_norm": 0.5070094466209412, + "learning_rate": 5.217639792617475e-05, + "loss": 1.5859, + "step": 8877 + }, + { + "epoch": 0.494844211582409, + "grad_norm": 0.5474216341972351, + "learning_rate": 5.216750209475685e-05, + "loss": 1.7858, + "step": 8878 + }, + { + "epoch": 0.4948999498355722, + "grad_norm": 0.4998477101325989, + "learning_rate": 5.2158606194598436e-05, + "loss": 1.4827, + "step": 8879 + }, + { + "epoch": 0.4949556880887353, + "grad_norm": 0.5660443305969238, + "learning_rate": 5.214971022598162e-05, + "loss": 1.7799, + "step": 8880 + }, + { + "epoch": 0.49501142634189843, + "grad_norm": 0.5911859273910522, + "learning_rate": 5.2140814189188514e-05, + "loss": 1.6708, + "step": 8881 + }, + { + "epoch": 0.4950671645950616, + "grad_norm": 0.5817141532897949, + "learning_rate": 5.213191808450127e-05, + "loss": 1.6558, + "step": 8882 + }, + { + "epoch": 0.49512290284822474, + "grad_norm": 0.5510105490684509, + "learning_rate": 5.212302191220203e-05, + "loss": 1.5644, + "step": 8883 + }, + { + "epoch": 0.4951786411013879, + "grad_norm": 0.6024221181869507, + "learning_rate": 5.21141256725729e-05, + "loss": 1.7236, + "step": 8884 + }, + { + "epoch": 0.495234379354551, + "grad_norm": 0.5197804570198059, + "learning_rate": 5.210522936589604e-05, + "loss": 1.5429, + "step": 8885 + }, + { + "epoch": 0.4952901176077142, + "grad_norm": 0.5537724494934082, + "learning_rate": 5.209633299245357e-05, + "loss": 1.7254, + "step": 8886 + }, + { + "epoch": 0.4953458558608773, + "grad_norm": 0.5095260739326477, + "learning_rate": 5.208743655252763e-05, + "loss": 1.4012, + "step": 8887 + }, + { + "epoch": 0.49540159411404044, + "grad_norm": 0.5599790811538696, + "learning_rate": 5.207854004640038e-05, + "loss": 1.7249, + "step": 8888 + }, + { + "epoch": 0.49545733236720363, + "grad_norm": 0.555938184261322, + "learning_rate": 5.206964347435396e-05, + "loss": 1.6312, + "step": 8889 + }, + { + "epoch": 0.49551307062036676, + "grad_norm": 0.5438600182533264, + "learning_rate": 5.206074683667053e-05, + "loss": 1.7241, + "step": 8890 + }, + { + "epoch": 0.4955688088735299, + "grad_norm": 0.5477585792541504, + "learning_rate": 5.2051850133632206e-05, + "loss": 1.6946, + "step": 8891 + }, + { + "epoch": 0.49562454712669307, + "grad_norm": 0.5788122415542603, + "learning_rate": 5.204295336552117e-05, + "loss": 1.503, + "step": 8892 + }, + { + "epoch": 0.4956802853798562, + "grad_norm": 0.5613676309585571, + "learning_rate": 5.203405653261956e-05, + "loss": 1.5574, + "step": 8893 + }, + { + "epoch": 0.49573602363301933, + "grad_norm": 0.5826630592346191, + "learning_rate": 5.202515963520953e-05, + "loss": 1.85, + "step": 8894 + }, + { + "epoch": 0.4957917618861825, + "grad_norm": 0.5635188817977905, + "learning_rate": 5.2016262673573246e-05, + "loss": 1.3931, + "step": 8895 + }, + { + "epoch": 0.49584750013934564, + "grad_norm": 0.5745763182640076, + "learning_rate": 5.200736564799288e-05, + "loss": 1.7307, + "step": 8896 + }, + { + "epoch": 0.49590323839250877, + "grad_norm": 0.5301480889320374, + "learning_rate": 5.199846855875057e-05, + "loss": 1.4952, + "step": 8897 + }, + { + "epoch": 0.4959589766456719, + "grad_norm": 0.561489999294281, + "learning_rate": 5.19895714061285e-05, + "loss": 1.5023, + "step": 8898 + }, + { + "epoch": 0.4960147148988351, + "grad_norm": 0.5963059663772583, + "learning_rate": 5.198067419040881e-05, + "loss": 1.7862, + "step": 8899 + }, + { + "epoch": 0.4960704531519982, + "grad_norm": 0.5533133149147034, + "learning_rate": 5.197177691187368e-05, + "loss": 1.6099, + "step": 8900 + }, + { + "epoch": 0.49612619140516134, + "grad_norm": 0.5286788940429688, + "learning_rate": 5.196287957080529e-05, + "loss": 1.5929, + "step": 8901 + }, + { + "epoch": 0.4961819296583245, + "grad_norm": 0.5352204442024231, + "learning_rate": 5.195398216748579e-05, + "loss": 1.5723, + "step": 8902 + }, + { + "epoch": 0.49623766791148766, + "grad_norm": 0.5606736540794373, + "learning_rate": 5.194508470219739e-05, + "loss": 1.6633, + "step": 8903 + }, + { + "epoch": 0.4962934061646508, + "grad_norm": 0.5791866779327393, + "learning_rate": 5.193618717522224e-05, + "loss": 1.6933, + "step": 8904 + }, + { + "epoch": 0.49634914441781397, + "grad_norm": 0.5928483009338379, + "learning_rate": 5.192728958684252e-05, + "loss": 1.8085, + "step": 8905 + }, + { + "epoch": 0.4964048826709771, + "grad_norm": 0.545987606048584, + "learning_rate": 5.1918391937340405e-05, + "loss": 1.6682, + "step": 8906 + }, + { + "epoch": 0.49646062092414023, + "grad_norm": 0.5828558206558228, + "learning_rate": 5.190949422699808e-05, + "loss": 1.7887, + "step": 8907 + }, + { + "epoch": 0.49651635917730336, + "grad_norm": 0.5636189579963684, + "learning_rate": 5.1900596456097736e-05, + "loss": 1.6192, + "step": 8908 + }, + { + "epoch": 0.49657209743046654, + "grad_norm": 0.5548069477081299, + "learning_rate": 5.189169862492156e-05, + "loss": 1.482, + "step": 8909 + }, + { + "epoch": 0.49662783568362967, + "grad_norm": 0.5686978697776794, + "learning_rate": 5.188280073375173e-05, + "loss": 1.5428, + "step": 8910 + }, + { + "epoch": 0.4966835739367928, + "grad_norm": 0.5715393424034119, + "learning_rate": 5.187390278287043e-05, + "loss": 1.751, + "step": 8911 + }, + { + "epoch": 0.496739312189956, + "grad_norm": 0.5473306775093079, + "learning_rate": 5.1865004772559876e-05, + "loss": 1.6317, + "step": 8912 + }, + { + "epoch": 0.4967950504431191, + "grad_norm": 0.5280557870864868, + "learning_rate": 5.1856106703102225e-05, + "loss": 1.382, + "step": 8913 + }, + { + "epoch": 0.49685078869628224, + "grad_norm": 0.566477358341217, + "learning_rate": 5.18472085747797e-05, + "loss": 1.6059, + "step": 8914 + }, + { + "epoch": 0.4969065269494454, + "grad_norm": 0.618401288986206, + "learning_rate": 5.183831038787449e-05, + "loss": 1.7905, + "step": 8915 + }, + { + "epoch": 0.49696226520260856, + "grad_norm": 0.555980384349823, + "learning_rate": 5.18294121426688e-05, + "loss": 1.7827, + "step": 8916 + }, + { + "epoch": 0.4970180034557717, + "grad_norm": 0.5835009813308716, + "learning_rate": 5.1820513839444804e-05, + "loss": 1.5225, + "step": 8917 + }, + { + "epoch": 0.49707374170893487, + "grad_norm": 0.5366058945655823, + "learning_rate": 5.181161547848474e-05, + "loss": 1.584, + "step": 8918 + }, + { + "epoch": 0.497129479962098, + "grad_norm": 0.5382677316665649, + "learning_rate": 5.1802717060070795e-05, + "loss": 1.7048, + "step": 8919 + }, + { + "epoch": 0.4971852182152611, + "grad_norm": 0.5656511783599854, + "learning_rate": 5.1793818584485166e-05, + "loss": 1.7254, + "step": 8920 + }, + { + "epoch": 0.49724095646842426, + "grad_norm": 0.4968765377998352, + "learning_rate": 5.178492005201007e-05, + "loss": 1.4276, + "step": 8921 + }, + { + "epoch": 0.49729669472158744, + "grad_norm": 0.599624514579773, + "learning_rate": 5.177602146292773e-05, + "loss": 1.7886, + "step": 8922 + }, + { + "epoch": 0.49735243297475057, + "grad_norm": 0.5555099844932556, + "learning_rate": 5.176712281752033e-05, + "loss": 1.5135, + "step": 8923 + }, + { + "epoch": 0.4974081712279137, + "grad_norm": 0.5166276693344116, + "learning_rate": 5.17582241160701e-05, + "loss": 1.284, + "step": 8924 + }, + { + "epoch": 0.4974639094810769, + "grad_norm": 0.5706877708435059, + "learning_rate": 5.1749325358859255e-05, + "loss": 1.5666, + "step": 8925 + }, + { + "epoch": 0.49751964773424, + "grad_norm": 0.6055343747138977, + "learning_rate": 5.1740426546170003e-05, + "loss": 1.7793, + "step": 8926 + }, + { + "epoch": 0.49757538598740314, + "grad_norm": 0.551367998123169, + "learning_rate": 5.1731527678284575e-05, + "loss": 1.7579, + "step": 8927 + }, + { + "epoch": 0.4976311242405663, + "grad_norm": 0.6338830590248108, + "learning_rate": 5.172262875548518e-05, + "loss": 1.691, + "step": 8928 + }, + { + "epoch": 0.49768686249372945, + "grad_norm": 0.5556480884552002, + "learning_rate": 5.171372977805405e-05, + "loss": 1.5507, + "step": 8929 + }, + { + "epoch": 0.4977426007468926, + "grad_norm": 0.5841500163078308, + "learning_rate": 5.17048307462734e-05, + "loss": 1.8044, + "step": 8930 + }, + { + "epoch": 0.4977983390000557, + "grad_norm": 0.5762627124786377, + "learning_rate": 5.169593166042547e-05, + "loss": 1.6068, + "step": 8931 + }, + { + "epoch": 0.4978540772532189, + "grad_norm": 0.5406793355941772, + "learning_rate": 5.1687032520792464e-05, + "loss": 1.6587, + "step": 8932 + }, + { + "epoch": 0.497909815506382, + "grad_norm": 0.5948076248168945, + "learning_rate": 5.1678133327656616e-05, + "loss": 1.7269, + "step": 8933 + }, + { + "epoch": 0.49796555375954515, + "grad_norm": 0.5559920072555542, + "learning_rate": 5.166923408130016e-05, + "loss": 1.7147, + "step": 8934 + }, + { + "epoch": 0.49802129201270834, + "grad_norm": 0.5676483511924744, + "learning_rate": 5.166033478200536e-05, + "loss": 1.5815, + "step": 8935 + }, + { + "epoch": 0.49807703026587147, + "grad_norm": 0.5557644367218018, + "learning_rate": 5.1651435430054396e-05, + "loss": 1.7004, + "step": 8936 + }, + { + "epoch": 0.4981327685190346, + "grad_norm": 0.5279107093811035, + "learning_rate": 5.164253602572954e-05, + "loss": 1.5522, + "step": 8937 + }, + { + "epoch": 0.4981885067721978, + "grad_norm": 0.5402976870536804, + "learning_rate": 5.1633636569313014e-05, + "loss": 1.6626, + "step": 8938 + }, + { + "epoch": 0.4982442450253609, + "grad_norm": 0.5484632849693298, + "learning_rate": 5.1624737061087056e-05, + "loss": 1.5598, + "step": 8939 + }, + { + "epoch": 0.49829998327852404, + "grad_norm": 0.5460349321365356, + "learning_rate": 5.161583750133392e-05, + "loss": 1.6661, + "step": 8940 + }, + { + "epoch": 0.4983557215316872, + "grad_norm": 0.5012972950935364, + "learning_rate": 5.160693789033583e-05, + "loss": 1.3436, + "step": 8941 + }, + { + "epoch": 0.49841145978485035, + "grad_norm": 0.5560734272003174, + "learning_rate": 5.159803822837506e-05, + "loss": 1.5994, + "step": 8942 + }, + { + "epoch": 0.4984671980380135, + "grad_norm": 0.5721739530563354, + "learning_rate": 5.1589138515733805e-05, + "loss": 1.8826, + "step": 8943 + }, + { + "epoch": 0.4985229362911766, + "grad_norm": 0.548629105091095, + "learning_rate": 5.158023875269436e-05, + "loss": 1.465, + "step": 8944 + }, + { + "epoch": 0.4985786745443398, + "grad_norm": 0.5386154651641846, + "learning_rate": 5.157133893953895e-05, + "loss": 1.624, + "step": 8945 + }, + { + "epoch": 0.4986344127975029, + "grad_norm": 0.6287878155708313, + "learning_rate": 5.156243907654983e-05, + "loss": 1.6433, + "step": 8946 + }, + { + "epoch": 0.49869015105066605, + "grad_norm": 0.6134181022644043, + "learning_rate": 5.155353916400925e-05, + "loss": 1.7598, + "step": 8947 + }, + { + "epoch": 0.49874588930382924, + "grad_norm": 0.5654070377349854, + "learning_rate": 5.154463920219947e-05, + "loss": 1.7002, + "step": 8948 + }, + { + "epoch": 0.49880162755699237, + "grad_norm": 0.5511396527290344, + "learning_rate": 5.153573919140274e-05, + "loss": 1.5513, + "step": 8949 + }, + { + "epoch": 0.4988573658101555, + "grad_norm": 0.5892798900604248, + "learning_rate": 5.1526839131901315e-05, + "loss": 1.8855, + "step": 8950 + }, + { + "epoch": 0.4989131040633187, + "grad_norm": 0.6024952530860901, + "learning_rate": 5.151793902397747e-05, + "loss": 1.591, + "step": 8951 + }, + { + "epoch": 0.4989688423164818, + "grad_norm": 0.545107901096344, + "learning_rate": 5.150903886791343e-05, + "loss": 1.54, + "step": 8952 + }, + { + "epoch": 0.49902458056964494, + "grad_norm": 0.5680729746818542, + "learning_rate": 5.150013866399147e-05, + "loss": 1.417, + "step": 8953 + }, + { + "epoch": 0.49908031882280807, + "grad_norm": 0.5475823879241943, + "learning_rate": 5.149123841249387e-05, + "loss": 1.5283, + "step": 8954 + }, + { + "epoch": 0.49913605707597125, + "grad_norm": 0.6003718376159668, + "learning_rate": 5.148233811370289e-05, + "loss": 1.9128, + "step": 8955 + }, + { + "epoch": 0.4991917953291344, + "grad_norm": 0.5217127203941345, + "learning_rate": 5.1473437767900766e-05, + "loss": 1.5466, + "step": 8956 + }, + { + "epoch": 0.4992475335822975, + "grad_norm": 0.5930051803588867, + "learning_rate": 5.1464537375369816e-05, + "loss": 1.7227, + "step": 8957 + }, + { + "epoch": 0.4993032718354607, + "grad_norm": 0.5506693124771118, + "learning_rate": 5.145563693639226e-05, + "loss": 1.5488, + "step": 8958 + }, + { + "epoch": 0.4993590100886238, + "grad_norm": 0.5341318845748901, + "learning_rate": 5.144673645125039e-05, + "loss": 1.6493, + "step": 8959 + }, + { + "epoch": 0.49941474834178695, + "grad_norm": 0.5735641717910767, + "learning_rate": 5.143783592022646e-05, + "loss": 1.6502, + "step": 8960 + }, + { + "epoch": 0.49947048659495014, + "grad_norm": 0.5525271892547607, + "learning_rate": 5.142893534360278e-05, + "loss": 1.389, + "step": 8961 + }, + { + "epoch": 0.49952622484811326, + "grad_norm": 0.6138321161270142, + "learning_rate": 5.1420034721661594e-05, + "loss": 1.882, + "step": 8962 + }, + { + "epoch": 0.4995819631012764, + "grad_norm": 0.5286270380020142, + "learning_rate": 5.1411134054685185e-05, + "loss": 1.6304, + "step": 8963 + }, + { + "epoch": 0.4996377013544396, + "grad_norm": 0.5324103832244873, + "learning_rate": 5.140223334295584e-05, + "loss": 1.7474, + "step": 8964 + }, + { + "epoch": 0.4996934396076027, + "grad_norm": 0.598732590675354, + "learning_rate": 5.139333258675582e-05, + "loss": 1.7623, + "step": 8965 + }, + { + "epoch": 0.49974917786076584, + "grad_norm": 0.5680933594703674, + "learning_rate": 5.138443178636742e-05, + "loss": 1.5633, + "step": 8966 + }, + { + "epoch": 0.49980491611392897, + "grad_norm": 0.5769996047019958, + "learning_rate": 5.13755309420729e-05, + "loss": 1.6215, + "step": 8967 + }, + { + "epoch": 0.49986065436709215, + "grad_norm": 0.5486459732055664, + "learning_rate": 5.1366630054154576e-05, + "loss": 1.6782, + "step": 8968 + }, + { + "epoch": 0.4999163926202553, + "grad_norm": 0.6276679635047913, + "learning_rate": 5.1357729122894706e-05, + "loss": 1.7972, + "step": 8969 + }, + { + "epoch": 0.4999721308734184, + "grad_norm": 0.5534047484397888, + "learning_rate": 5.134882814857559e-05, + "loss": 1.5217, + "step": 8970 + }, + { + "epoch": 0.5000278691265816, + "grad_norm": 0.7427502274513245, + "learning_rate": 5.1339927131479503e-05, + "loss": 1.7474, + "step": 8971 + }, + { + "epoch": 0.5000836073797447, + "grad_norm": 0.5830016136169434, + "learning_rate": 5.133102607188874e-05, + "loss": 1.7703, + "step": 8972 + }, + { + "epoch": 0.5001393456329079, + "grad_norm": 0.5821530818939209, + "learning_rate": 5.132212497008559e-05, + "loss": 1.6809, + "step": 8973 + }, + { + "epoch": 0.500195083886071, + "grad_norm": 0.5597349405288696, + "learning_rate": 5.1313223826352365e-05, + "loss": 1.6982, + "step": 8974 + }, + { + "epoch": 0.5002508221392341, + "grad_norm": 0.5627524256706238, + "learning_rate": 5.1304322640971315e-05, + "loss": 1.5646, + "step": 8975 + }, + { + "epoch": 0.5003065603923973, + "grad_norm": 0.568310558795929, + "learning_rate": 5.1295421414224754e-05, + "loss": 1.6019, + "step": 8976 + }, + { + "epoch": 0.5003622986455605, + "grad_norm": 0.5768476128578186, + "learning_rate": 5.128652014639499e-05, + "loss": 1.6455, + "step": 8977 + }, + { + "epoch": 0.5004180368987236, + "grad_norm": 0.5494751930236816, + "learning_rate": 5.1277618837764294e-05, + "loss": 1.5586, + "step": 8978 + }, + { + "epoch": 0.5004737751518867, + "grad_norm": 0.5893326997756958, + "learning_rate": 5.126871748861499e-05, + "loss": 1.8271, + "step": 8979 + }, + { + "epoch": 0.5005295134050499, + "grad_norm": 0.5742121934890747, + "learning_rate": 5.125981609922935e-05, + "loss": 1.7673, + "step": 8980 + }, + { + "epoch": 0.500585251658213, + "grad_norm": 0.5225714445114136, + "learning_rate": 5.1250914669889714e-05, + "loss": 1.5127, + "step": 8981 + }, + { + "epoch": 0.5006409899113762, + "grad_norm": 0.5902960300445557, + "learning_rate": 5.124201320087833e-05, + "loss": 1.7471, + "step": 8982 + }, + { + "epoch": 0.5006967281645394, + "grad_norm": 0.5950215458869934, + "learning_rate": 5.1233111692477555e-05, + "loss": 1.6188, + "step": 8983 + }, + { + "epoch": 0.5007524664177024, + "grad_norm": 0.5525108575820923, + "learning_rate": 5.122421014496965e-05, + "loss": 1.6802, + "step": 8984 + }, + { + "epoch": 0.5008082046708656, + "grad_norm": 0.5543337464332581, + "learning_rate": 5.1215308558636944e-05, + "loss": 1.5793, + "step": 8985 + }, + { + "epoch": 0.5008639429240288, + "grad_norm": 0.5265454053878784, + "learning_rate": 5.1206406933761716e-05, + "loss": 1.3947, + "step": 8986 + }, + { + "epoch": 0.5009196811771919, + "grad_norm": 0.6150608658790588, + "learning_rate": 5.119750527062632e-05, + "loss": 1.9244, + "step": 8987 + }, + { + "epoch": 0.5009754194303551, + "grad_norm": 0.5269333124160767, + "learning_rate": 5.1188603569513025e-05, + "loss": 1.6002, + "step": 8988 + }, + { + "epoch": 0.5010311576835182, + "grad_norm": 0.6029527187347412, + "learning_rate": 5.117970183070416e-05, + "loss": 1.8124, + "step": 8989 + }, + { + "epoch": 0.5010868959366813, + "grad_norm": 0.5682185292243958, + "learning_rate": 5.1170800054482035e-05, + "loss": 1.6561, + "step": 8990 + }, + { + "epoch": 0.5011426341898445, + "grad_norm": 0.5897371172904968, + "learning_rate": 5.116189824112896e-05, + "loss": 1.7734, + "step": 8991 + }, + { + "epoch": 0.5011983724430077, + "grad_norm": 0.5152097940444946, + "learning_rate": 5.115299639092723e-05, + "loss": 1.4226, + "step": 8992 + }, + { + "epoch": 0.5012541106961708, + "grad_norm": 0.546345591545105, + "learning_rate": 5.114409450415919e-05, + "loss": 1.4967, + "step": 8993 + }, + { + "epoch": 0.501309848949334, + "grad_norm": 0.5303710103034973, + "learning_rate": 5.113519258110715e-05, + "loss": 1.6527, + "step": 8994 + }, + { + "epoch": 0.501365587202497, + "grad_norm": 0.5513923764228821, + "learning_rate": 5.1126290622053405e-05, + "loss": 1.7632, + "step": 8995 + }, + { + "epoch": 0.5014213254556602, + "grad_norm": 0.5321218371391296, + "learning_rate": 5.1117388627280305e-05, + "loss": 1.5339, + "step": 8996 + }, + { + "epoch": 0.5014770637088234, + "grad_norm": 0.5597907900810242, + "learning_rate": 5.1108486597070125e-05, + "loss": 1.6767, + "step": 8997 + }, + { + "epoch": 0.5015328019619865, + "grad_norm": 0.5612991452217102, + "learning_rate": 5.109958453170524e-05, + "loss": 1.7141, + "step": 8998 + }, + { + "epoch": 0.5015885402151496, + "grad_norm": 0.549898087978363, + "learning_rate": 5.109068243146793e-05, + "loss": 1.393, + "step": 8999 + }, + { + "epoch": 0.5016442784683128, + "grad_norm": 0.5984362959861755, + "learning_rate": 5.1081780296640535e-05, + "loss": 1.8804, + "step": 9000 + } + ], + "logging_steps": 1, + "max_steps": 17941, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.714094069481472e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}