diff --git "a/checkpoint-2049/trainer_state.json" "b/checkpoint-2049/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-2049/trainer_state.json" @@ -0,0 +1,14376 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9998373454782042, + "eval_steps": 500, + "global_step": 2049, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00048796356538711777, + "grad_norm": 0.4446243345737457, + "learning_rate": 0.0001, + "loss": 1.8998, + "step": 1 + }, + { + "epoch": 0.0009759271307742355, + "grad_norm": 0.443472683429718, + "learning_rate": 0.0001, + "loss": 2.146, + "step": 2 + }, + { + "epoch": 0.0014638906961613532, + "grad_norm": 0.246729776263237, + "learning_rate": 0.0001, + "loss": 1.8931, + "step": 3 + }, + { + "epoch": 0.001951854261548471, + "grad_norm": 0.3018186688423157, + "learning_rate": 0.0001, + "loss": 1.984, + "step": 4 + }, + { + "epoch": 0.002439817826935589, + "grad_norm": 0.2850761413574219, + "learning_rate": 0.0001, + "loss": 1.863, + "step": 5 + }, + { + "epoch": 0.0029277813923227064, + "grad_norm": 0.23705212771892548, + "learning_rate": 0.0001, + "loss": 1.8384, + "step": 6 + }, + { + "epoch": 0.0034157449577098243, + "grad_norm": 0.24392390251159668, + "learning_rate": 0.0001, + "loss": 1.8743, + "step": 7 + }, + { + "epoch": 0.003903708523096942, + "grad_norm": 0.24215014278888702, + "learning_rate": 0.0001, + "loss": 1.8048, + "step": 8 + }, + { + "epoch": 0.00439167208848406, + "grad_norm": 0.22235405445098877, + "learning_rate": 0.0001, + "loss": 1.8098, + "step": 9 + }, + { + "epoch": 0.004879635653871178, + "grad_norm": 0.1880388706922531, + "learning_rate": 0.0001, + "loss": 1.7519, + "step": 10 + }, + { + "epoch": 0.005367599219258295, + "grad_norm": 0.2197292149066925, + "learning_rate": 0.0001, + "loss": 1.905, + "step": 11 + }, + { + "epoch": 0.005855562784645413, + "grad_norm": 0.20583945512771606, + "learning_rate": 0.0001, + "loss": 1.8143, + "step": 12 + }, + { + "epoch": 0.006343526350032531, + "grad_norm": 0.20737111568450928, + "learning_rate": 0.0001, + "loss": 1.8505, + "step": 13 + }, + { + "epoch": 0.0068314899154196486, + "grad_norm": 0.19384053349494934, + "learning_rate": 0.0001, + "loss": 1.7528, + "step": 14 + }, + { + "epoch": 0.007319453480806766, + "grad_norm": 0.23753000795841217, + "learning_rate": 0.0001, + "loss": 1.7206, + "step": 15 + }, + { + "epoch": 0.007807417046193884, + "grad_norm": 0.1946115642786026, + "learning_rate": 0.0001, + "loss": 1.7562, + "step": 16 + }, + { + "epoch": 0.008295380611581003, + "grad_norm": 0.18985839188098907, + "learning_rate": 0.0001, + "loss": 1.6665, + "step": 17 + }, + { + "epoch": 0.00878334417696812, + "grad_norm": 0.20499983429908752, + "learning_rate": 0.0001, + "loss": 1.9491, + "step": 18 + }, + { + "epoch": 0.009271307742355238, + "grad_norm": 0.1874532699584961, + "learning_rate": 0.0001, + "loss": 1.7975, + "step": 19 + }, + { + "epoch": 0.009759271307742356, + "grad_norm": 0.18048429489135742, + "learning_rate": 0.0001, + "loss": 1.7799, + "step": 20 + }, + { + "epoch": 0.010247234873129472, + "grad_norm": 0.1777779906988144, + "learning_rate": 0.0001, + "loss": 1.7816, + "step": 21 + }, + { + "epoch": 0.01073519843851659, + "grad_norm": 0.17349651455879211, + "learning_rate": 0.0001, + "loss": 1.7431, + "step": 22 + }, + { + "epoch": 0.011223162003903709, + "grad_norm": 0.18479375541210175, + "learning_rate": 0.0001, + "loss": 1.903, + "step": 23 + }, + { + "epoch": 0.011711125569290826, + "grad_norm": 0.1918632984161377, + "learning_rate": 0.0001, + "loss": 1.7957, + "step": 24 + }, + { + "epoch": 0.012199089134677944, + "grad_norm": 0.18239013850688934, + "learning_rate": 0.0001, + "loss": 1.8039, + "step": 25 + }, + { + "epoch": 0.012687052700065062, + "grad_norm": 0.17392802238464355, + "learning_rate": 0.0001, + "loss": 1.7022, + "step": 26 + }, + { + "epoch": 0.013175016265452179, + "grad_norm": 0.1769259124994278, + "learning_rate": 0.0001, + "loss": 1.7131, + "step": 27 + }, + { + "epoch": 0.013662979830839297, + "grad_norm": 0.17371872067451477, + "learning_rate": 0.0001, + "loss": 1.7657, + "step": 28 + }, + { + "epoch": 0.014150943396226415, + "grad_norm": 0.19897091388702393, + "learning_rate": 0.0001, + "loss": 1.8791, + "step": 29 + }, + { + "epoch": 0.014638906961613532, + "grad_norm": 0.17471033334732056, + "learning_rate": 0.0001, + "loss": 1.8765, + "step": 30 + }, + { + "epoch": 0.01512687052700065, + "grad_norm": 0.17650161683559418, + "learning_rate": 0.0001, + "loss": 1.8181, + "step": 31 + }, + { + "epoch": 0.015614834092387769, + "grad_norm": 0.18008925020694733, + "learning_rate": 0.0001, + "loss": 1.8138, + "step": 32 + }, + { + "epoch": 0.016102797657774885, + "grad_norm": 0.18406356871128082, + "learning_rate": 0.0001, + "loss": 1.907, + "step": 33 + }, + { + "epoch": 0.016590761223162005, + "grad_norm": 0.18869489431381226, + "learning_rate": 0.0001, + "loss": 1.9043, + "step": 34 + }, + { + "epoch": 0.017078724788549122, + "grad_norm": 0.18416965007781982, + "learning_rate": 0.0001, + "loss": 1.7695, + "step": 35 + }, + { + "epoch": 0.01756668835393624, + "grad_norm": 0.18121257424354553, + "learning_rate": 0.0001, + "loss": 1.8342, + "step": 36 + }, + { + "epoch": 0.01805465191932336, + "grad_norm": 0.18426860868930817, + "learning_rate": 0.0001, + "loss": 1.818, + "step": 37 + }, + { + "epoch": 0.018542615484710475, + "grad_norm": 0.18800823390483856, + "learning_rate": 0.0001, + "loss": 1.8019, + "step": 38 + }, + { + "epoch": 0.01903057905009759, + "grad_norm": 0.18787121772766113, + "learning_rate": 0.0001, + "loss": 1.8052, + "step": 39 + }, + { + "epoch": 0.01951854261548471, + "grad_norm": 0.18341200053691864, + "learning_rate": 0.0001, + "loss": 1.7288, + "step": 40 + }, + { + "epoch": 0.020006506180871828, + "grad_norm": 0.18460282683372498, + "learning_rate": 0.0001, + "loss": 1.9984, + "step": 41 + }, + { + "epoch": 0.020494469746258945, + "grad_norm": 0.17212441563606262, + "learning_rate": 0.0001, + "loss": 1.7928, + "step": 42 + }, + { + "epoch": 0.020982433311646065, + "grad_norm": 0.18548350036144257, + "learning_rate": 0.0001, + "loss": 1.9719, + "step": 43 + }, + { + "epoch": 0.02147039687703318, + "grad_norm": 0.18035617470741272, + "learning_rate": 0.0001, + "loss": 1.9265, + "step": 44 + }, + { + "epoch": 0.021958360442420298, + "grad_norm": 0.16300201416015625, + "learning_rate": 0.0001, + "loss": 1.6821, + "step": 45 + }, + { + "epoch": 0.022446324007807418, + "grad_norm": 0.1797887086868286, + "learning_rate": 0.0001, + "loss": 1.8276, + "step": 46 + }, + { + "epoch": 0.022934287573194535, + "grad_norm": 0.18614032864570618, + "learning_rate": 0.0001, + "loss": 1.769, + "step": 47 + }, + { + "epoch": 0.02342225113858165, + "grad_norm": 0.18762686848640442, + "learning_rate": 0.0001, + "loss": 1.7716, + "step": 48 + }, + { + "epoch": 0.02391021470396877, + "grad_norm": 0.1779824048280716, + "learning_rate": 0.0001, + "loss": 1.7047, + "step": 49 + }, + { + "epoch": 0.024398178269355888, + "grad_norm": 0.1713806688785553, + "learning_rate": 0.0001, + "loss": 1.7085, + "step": 50 + }, + { + "epoch": 0.024886141834743004, + "grad_norm": 0.17888174951076508, + "learning_rate": 0.0001, + "loss": 1.8539, + "step": 51 + }, + { + "epoch": 0.025374105400130124, + "grad_norm": 0.18366138637065887, + "learning_rate": 0.0001, + "loss": 1.7948, + "step": 52 + }, + { + "epoch": 0.02586206896551724, + "grad_norm": 0.1684766262769699, + "learning_rate": 0.0001, + "loss": 1.7752, + "step": 53 + }, + { + "epoch": 0.026350032530904358, + "grad_norm": 0.18316026031970978, + "learning_rate": 0.0001, + "loss": 1.8153, + "step": 54 + }, + { + "epoch": 0.026837996096291478, + "grad_norm": 0.1712900847196579, + "learning_rate": 0.0001, + "loss": 1.8209, + "step": 55 + }, + { + "epoch": 0.027325959661678594, + "grad_norm": 0.17653001844882965, + "learning_rate": 0.0001, + "loss": 1.7142, + "step": 56 + }, + { + "epoch": 0.02781392322706571, + "grad_norm": 0.17115001380443573, + "learning_rate": 0.0001, + "loss": 1.7014, + "step": 57 + }, + { + "epoch": 0.02830188679245283, + "grad_norm": 0.19934123754501343, + "learning_rate": 0.0001, + "loss": 1.8184, + "step": 58 + }, + { + "epoch": 0.028789850357839947, + "grad_norm": 0.20567697286605835, + "learning_rate": 0.0001, + "loss": 1.9174, + "step": 59 + }, + { + "epoch": 0.029277813923227064, + "grad_norm": 0.17345917224884033, + "learning_rate": 0.0001, + "loss": 1.7448, + "step": 60 + }, + { + "epoch": 0.029765777488614184, + "grad_norm": 0.24353067576885223, + "learning_rate": 0.0001, + "loss": 1.7974, + "step": 61 + }, + { + "epoch": 0.0302537410540013, + "grad_norm": 0.18949398398399353, + "learning_rate": 0.0001, + "loss": 1.8231, + "step": 62 + }, + { + "epoch": 0.03074170461938842, + "grad_norm": 0.22029712796211243, + "learning_rate": 0.0001, + "loss": 1.8535, + "step": 63 + }, + { + "epoch": 0.031229668184775537, + "grad_norm": 0.16962048411369324, + "learning_rate": 0.0001, + "loss": 1.7686, + "step": 64 + }, + { + "epoch": 0.03171763175016266, + "grad_norm": 0.19039765000343323, + "learning_rate": 0.0001, + "loss": 1.8303, + "step": 65 + }, + { + "epoch": 0.03220559531554977, + "grad_norm": 0.20166978240013123, + "learning_rate": 0.0001, + "loss": 1.768, + "step": 66 + }, + { + "epoch": 0.03269355888093689, + "grad_norm": 0.173394113779068, + "learning_rate": 0.0001, + "loss": 1.8253, + "step": 67 + }, + { + "epoch": 0.03318152244632401, + "grad_norm": 0.19260728359222412, + "learning_rate": 0.0001, + "loss": 1.7589, + "step": 68 + }, + { + "epoch": 0.033669486011711124, + "grad_norm": 0.19539032876491547, + "learning_rate": 0.0001, + "loss": 1.749, + "step": 69 + }, + { + "epoch": 0.034157449577098244, + "grad_norm": 0.16770870983600616, + "learning_rate": 0.0001, + "loss": 1.7132, + "step": 70 + }, + { + "epoch": 0.034645413142485364, + "grad_norm": 0.19755178689956665, + "learning_rate": 0.0001, + "loss": 1.8323, + "step": 71 + }, + { + "epoch": 0.03513337670787248, + "grad_norm": 0.18038292229175568, + "learning_rate": 0.0001, + "loss": 1.7599, + "step": 72 + }, + { + "epoch": 0.0356213402732596, + "grad_norm": 0.17995433509349823, + "learning_rate": 0.0001, + "loss": 1.9183, + "step": 73 + }, + { + "epoch": 0.03610930383864672, + "grad_norm": 0.19222807884216309, + "learning_rate": 0.0001, + "loss": 1.8642, + "step": 74 + }, + { + "epoch": 0.03659726740403383, + "grad_norm": 0.16965682804584503, + "learning_rate": 0.0001, + "loss": 1.7271, + "step": 75 + }, + { + "epoch": 0.03708523096942095, + "grad_norm": 0.17662999033927917, + "learning_rate": 0.0001, + "loss": 1.8263, + "step": 76 + }, + { + "epoch": 0.03757319453480807, + "grad_norm": 0.1699201613664627, + "learning_rate": 0.0001, + "loss": 1.6818, + "step": 77 + }, + { + "epoch": 0.03806115810019518, + "grad_norm": 0.17309829592704773, + "learning_rate": 0.0001, + "loss": 1.7424, + "step": 78 + }, + { + "epoch": 0.0385491216655823, + "grad_norm": 0.18537020683288574, + "learning_rate": 0.0001, + "loss": 1.7986, + "step": 79 + }, + { + "epoch": 0.03903708523096942, + "grad_norm": 0.1709861010313034, + "learning_rate": 0.0001, + "loss": 1.6091, + "step": 80 + }, + { + "epoch": 0.039525048796356536, + "grad_norm": 0.17050296068191528, + "learning_rate": 0.0001, + "loss": 1.6904, + "step": 81 + }, + { + "epoch": 0.040013012361743656, + "grad_norm": 0.17640157043933868, + "learning_rate": 0.0001, + "loss": 1.7087, + "step": 82 + }, + { + "epoch": 0.040500975927130776, + "grad_norm": 0.1919400542974472, + "learning_rate": 0.0001, + "loss": 1.8223, + "step": 83 + }, + { + "epoch": 0.04098893949251789, + "grad_norm": 0.19427765905857086, + "learning_rate": 0.0001, + "loss": 1.7443, + "step": 84 + }, + { + "epoch": 0.04147690305790501, + "grad_norm": 0.19496281445026398, + "learning_rate": 0.0001, + "loss": 1.8336, + "step": 85 + }, + { + "epoch": 0.04196486662329213, + "grad_norm": 0.18101565539836884, + "learning_rate": 0.0001, + "loss": 1.8422, + "step": 86 + }, + { + "epoch": 0.04245283018867924, + "grad_norm": 0.19941496849060059, + "learning_rate": 0.0001, + "loss": 1.7168, + "step": 87 + }, + { + "epoch": 0.04294079375406636, + "grad_norm": 0.1963973492383957, + "learning_rate": 0.0001, + "loss": 1.7558, + "step": 88 + }, + { + "epoch": 0.04342875731945348, + "grad_norm": 0.17694450914859772, + "learning_rate": 0.0001, + "loss": 1.6953, + "step": 89 + }, + { + "epoch": 0.043916720884840596, + "grad_norm": 0.19362711906433105, + "learning_rate": 0.0001, + "loss": 1.8165, + "step": 90 + }, + { + "epoch": 0.044404684450227716, + "grad_norm": 0.1736024022102356, + "learning_rate": 0.0001, + "loss": 1.777, + "step": 91 + }, + { + "epoch": 0.044892648015614836, + "grad_norm": 0.17649488151073456, + "learning_rate": 0.0001, + "loss": 1.7507, + "step": 92 + }, + { + "epoch": 0.04538061158100195, + "grad_norm": 0.2002265304327011, + "learning_rate": 0.0001, + "loss": 1.8796, + "step": 93 + }, + { + "epoch": 0.04586857514638907, + "grad_norm": 0.1667991429567337, + "learning_rate": 0.0001, + "loss": 1.7051, + "step": 94 + }, + { + "epoch": 0.04635653871177619, + "grad_norm": 0.1868171989917755, + "learning_rate": 0.0001, + "loss": 1.747, + "step": 95 + }, + { + "epoch": 0.0468445022771633, + "grad_norm": 0.18312174081802368, + "learning_rate": 0.0001, + "loss": 1.7835, + "step": 96 + }, + { + "epoch": 0.04733246584255042, + "grad_norm": 0.1762659102678299, + "learning_rate": 0.0001, + "loss": 1.6517, + "step": 97 + }, + { + "epoch": 0.04782042940793754, + "grad_norm": 0.19766494631767273, + "learning_rate": 0.0001, + "loss": 1.826, + "step": 98 + }, + { + "epoch": 0.048308392973324656, + "grad_norm": 0.17331789433956146, + "learning_rate": 0.0001, + "loss": 1.7506, + "step": 99 + }, + { + "epoch": 0.048796356538711776, + "grad_norm": 0.16851170361042023, + "learning_rate": 0.0001, + "loss": 1.744, + "step": 100 + }, + { + "epoch": 0.049284320104098896, + "grad_norm": 0.17572622001171112, + "learning_rate": 0.0001, + "loss": 1.6986, + "step": 101 + }, + { + "epoch": 0.04977228366948601, + "grad_norm": 0.1850849688053131, + "learning_rate": 0.0001, + "loss": 1.7895, + "step": 102 + }, + { + "epoch": 0.05026024723487313, + "grad_norm": 0.18450362980365753, + "learning_rate": 0.0001, + "loss": 1.8234, + "step": 103 + }, + { + "epoch": 0.05074821080026025, + "grad_norm": 0.1832476705312729, + "learning_rate": 0.0001, + "loss": 1.7986, + "step": 104 + }, + { + "epoch": 0.05123617436564736, + "grad_norm": 0.1809314638376236, + "learning_rate": 0.0001, + "loss": 1.7923, + "step": 105 + }, + { + "epoch": 0.05172413793103448, + "grad_norm": 0.17974039912223816, + "learning_rate": 0.0001, + "loss": 1.7095, + "step": 106 + }, + { + "epoch": 0.0522121014964216, + "grad_norm": 0.16436076164245605, + "learning_rate": 0.0001, + "loss": 1.6873, + "step": 107 + }, + { + "epoch": 0.052700065061808715, + "grad_norm": 0.16344858705997467, + "learning_rate": 0.0001, + "loss": 1.6991, + "step": 108 + }, + { + "epoch": 0.053188028627195835, + "grad_norm": 0.17950277030467987, + "learning_rate": 0.0001, + "loss": 1.8591, + "step": 109 + }, + { + "epoch": 0.053675992192582955, + "grad_norm": 0.18337760865688324, + "learning_rate": 0.0001, + "loss": 1.784, + "step": 110 + }, + { + "epoch": 0.05416395575797007, + "grad_norm": 0.1895488053560257, + "learning_rate": 0.0001, + "loss": 1.7853, + "step": 111 + }, + { + "epoch": 0.05465191932335719, + "grad_norm": 0.17522425949573517, + "learning_rate": 0.0001, + "loss": 1.7127, + "step": 112 + }, + { + "epoch": 0.05513988288874431, + "grad_norm": 0.17943814396858215, + "learning_rate": 0.0001, + "loss": 1.755, + "step": 113 + }, + { + "epoch": 0.05562784645413142, + "grad_norm": 0.1815492808818817, + "learning_rate": 0.0001, + "loss": 1.7687, + "step": 114 + }, + { + "epoch": 0.05611581001951854, + "grad_norm": 0.16954658925533295, + "learning_rate": 0.0001, + "loss": 1.7562, + "step": 115 + }, + { + "epoch": 0.05660377358490566, + "grad_norm": 0.17870648205280304, + "learning_rate": 0.0001, + "loss": 1.841, + "step": 116 + }, + { + "epoch": 0.057091737150292775, + "grad_norm": 0.17044954001903534, + "learning_rate": 0.0001, + "loss": 1.7118, + "step": 117 + }, + { + "epoch": 0.057579700715679895, + "grad_norm": 0.17524173855781555, + "learning_rate": 0.0001, + "loss": 1.6045, + "step": 118 + }, + { + "epoch": 0.058067664281067015, + "grad_norm": 0.17537613213062286, + "learning_rate": 0.0001, + "loss": 1.8018, + "step": 119 + }, + { + "epoch": 0.05855562784645413, + "grad_norm": 0.17819495499134064, + "learning_rate": 0.0001, + "loss": 1.7723, + "step": 120 + }, + { + "epoch": 0.05904359141184125, + "grad_norm": 0.17807795107364655, + "learning_rate": 0.0001, + "loss": 1.8558, + "step": 121 + }, + { + "epoch": 0.05953155497722837, + "grad_norm": 0.1687198132276535, + "learning_rate": 0.0001, + "loss": 1.7673, + "step": 122 + }, + { + "epoch": 0.06001951854261549, + "grad_norm": 0.17069241404533386, + "learning_rate": 0.0001, + "loss": 1.7561, + "step": 123 + }, + { + "epoch": 0.0605074821080026, + "grad_norm": 0.1655956506729126, + "learning_rate": 0.0001, + "loss": 1.6607, + "step": 124 + }, + { + "epoch": 0.06099544567338972, + "grad_norm": 0.1846679002046585, + "learning_rate": 0.0001, + "loss": 1.8676, + "step": 125 + }, + { + "epoch": 0.06148340923877684, + "grad_norm": 0.17344145476818085, + "learning_rate": 0.0001, + "loss": 1.7427, + "step": 126 + }, + { + "epoch": 0.061971372804163954, + "grad_norm": 0.17264996469020844, + "learning_rate": 0.0001, + "loss": 1.7279, + "step": 127 + }, + { + "epoch": 0.062459336369551074, + "grad_norm": 0.18628281354904175, + "learning_rate": 0.0001, + "loss": 1.6708, + "step": 128 + }, + { + "epoch": 0.0629472999349382, + "grad_norm": 0.178174689412117, + "learning_rate": 0.0001, + "loss": 1.7931, + "step": 129 + }, + { + "epoch": 0.06343526350032531, + "grad_norm": 0.17690585553646088, + "learning_rate": 0.0001, + "loss": 1.7647, + "step": 130 + }, + { + "epoch": 0.06392322706571242, + "grad_norm": 0.18117444217205048, + "learning_rate": 0.0001, + "loss": 1.7376, + "step": 131 + }, + { + "epoch": 0.06441119063109954, + "grad_norm": 0.17523089051246643, + "learning_rate": 0.0001, + "loss": 1.8403, + "step": 132 + }, + { + "epoch": 0.06489915419648666, + "grad_norm": 0.16988244652748108, + "learning_rate": 0.0001, + "loss": 1.6958, + "step": 133 + }, + { + "epoch": 0.06538711776187378, + "grad_norm": 0.1890041083097458, + "learning_rate": 0.0001, + "loss": 1.7388, + "step": 134 + }, + { + "epoch": 0.0658750813272609, + "grad_norm": 0.1703094244003296, + "learning_rate": 0.0001, + "loss": 1.6424, + "step": 135 + }, + { + "epoch": 0.06636304489264802, + "grad_norm": 0.17852698266506195, + "learning_rate": 0.0001, + "loss": 1.7786, + "step": 136 + }, + { + "epoch": 0.06685100845803513, + "grad_norm": 0.17648550868034363, + "learning_rate": 0.0001, + "loss": 1.7172, + "step": 137 + }, + { + "epoch": 0.06733897202342225, + "grad_norm": 0.18284566700458527, + "learning_rate": 0.0001, + "loss": 1.7491, + "step": 138 + }, + { + "epoch": 0.06782693558880937, + "grad_norm": 0.1686737835407257, + "learning_rate": 0.0001, + "loss": 1.7218, + "step": 139 + }, + { + "epoch": 0.06831489915419649, + "grad_norm": 0.1741771250963211, + "learning_rate": 0.0001, + "loss": 1.7534, + "step": 140 + }, + { + "epoch": 0.06880286271958361, + "grad_norm": 0.1778876781463623, + "learning_rate": 0.0001, + "loss": 1.7388, + "step": 141 + }, + { + "epoch": 0.06929082628497073, + "grad_norm": 0.1860485076904297, + "learning_rate": 0.0001, + "loss": 1.8109, + "step": 142 + }, + { + "epoch": 0.06977878985035783, + "grad_norm": 0.17966079711914062, + "learning_rate": 0.0001, + "loss": 1.7171, + "step": 143 + }, + { + "epoch": 0.07026675341574495, + "grad_norm": 0.19341900944709778, + "learning_rate": 0.0001, + "loss": 1.7911, + "step": 144 + }, + { + "epoch": 0.07075471698113207, + "grad_norm": 0.1968701183795929, + "learning_rate": 0.0001, + "loss": 1.858, + "step": 145 + }, + { + "epoch": 0.0712426805465192, + "grad_norm": 0.17585061490535736, + "learning_rate": 0.0001, + "loss": 1.6731, + "step": 146 + }, + { + "epoch": 0.07173064411190631, + "grad_norm": 0.17294664680957794, + "learning_rate": 0.0001, + "loss": 1.7284, + "step": 147 + }, + { + "epoch": 0.07221860767729343, + "grad_norm": 0.18245872855186462, + "learning_rate": 0.0001, + "loss": 1.7595, + "step": 148 + }, + { + "epoch": 0.07270657124268054, + "grad_norm": 0.16850219666957855, + "learning_rate": 0.0001, + "loss": 1.73, + "step": 149 + }, + { + "epoch": 0.07319453480806766, + "grad_norm": 0.16891759634017944, + "learning_rate": 0.0001, + "loss": 1.7434, + "step": 150 + }, + { + "epoch": 0.07368249837345478, + "grad_norm": 0.17363204061985016, + "learning_rate": 0.0001, + "loss": 1.738, + "step": 151 + }, + { + "epoch": 0.0741704619388419, + "grad_norm": 0.16307075321674347, + "learning_rate": 0.0001, + "loss": 1.6285, + "step": 152 + }, + { + "epoch": 0.07465842550422902, + "grad_norm": 0.1735111027956009, + "learning_rate": 0.0001, + "loss": 1.5711, + "step": 153 + }, + { + "epoch": 0.07514638906961614, + "grad_norm": 0.18169796466827393, + "learning_rate": 0.0001, + "loss": 1.7395, + "step": 154 + }, + { + "epoch": 0.07563435263500325, + "grad_norm": 0.16926725208759308, + "learning_rate": 0.0001, + "loss": 1.7534, + "step": 155 + }, + { + "epoch": 0.07612231620039037, + "grad_norm": 0.19919319450855255, + "learning_rate": 0.0001, + "loss": 1.6975, + "step": 156 + }, + { + "epoch": 0.07661027976577749, + "grad_norm": 0.19146177172660828, + "learning_rate": 0.0001, + "loss": 1.8272, + "step": 157 + }, + { + "epoch": 0.0770982433311646, + "grad_norm": 0.19453231990337372, + "learning_rate": 0.0001, + "loss": 1.8229, + "step": 158 + }, + { + "epoch": 0.07758620689655173, + "grad_norm": 0.20597495138645172, + "learning_rate": 0.0001, + "loss": 1.8567, + "step": 159 + }, + { + "epoch": 0.07807417046193885, + "grad_norm": 0.18599432706832886, + "learning_rate": 0.0001, + "loss": 1.7587, + "step": 160 + }, + { + "epoch": 0.07856213402732595, + "grad_norm": 0.21232162415981293, + "learning_rate": 0.0001, + "loss": 1.7179, + "step": 161 + }, + { + "epoch": 0.07905009759271307, + "grad_norm": 0.1712743043899536, + "learning_rate": 0.0001, + "loss": 1.678, + "step": 162 + }, + { + "epoch": 0.07953806115810019, + "grad_norm": 0.18402481079101562, + "learning_rate": 0.0001, + "loss": 1.7731, + "step": 163 + }, + { + "epoch": 0.08002602472348731, + "grad_norm": 0.18908202648162842, + "learning_rate": 0.0001, + "loss": 1.841, + "step": 164 + }, + { + "epoch": 0.08051398828887443, + "grad_norm": 0.17370882630348206, + "learning_rate": 0.0001, + "loss": 1.6713, + "step": 165 + }, + { + "epoch": 0.08100195185426155, + "grad_norm": 0.1881919503211975, + "learning_rate": 0.0001, + "loss": 1.8285, + "step": 166 + }, + { + "epoch": 0.08148991541964867, + "grad_norm": 0.1770172417163849, + "learning_rate": 0.0001, + "loss": 1.7292, + "step": 167 + }, + { + "epoch": 0.08197787898503578, + "grad_norm": 0.1822032779455185, + "learning_rate": 0.0001, + "loss": 1.6977, + "step": 168 + }, + { + "epoch": 0.0824658425504229, + "grad_norm": 0.19020989537239075, + "learning_rate": 0.0001, + "loss": 1.6964, + "step": 169 + }, + { + "epoch": 0.08295380611581002, + "grad_norm": 0.17227591574192047, + "learning_rate": 0.0001, + "loss": 1.703, + "step": 170 + }, + { + "epoch": 0.08344176968119714, + "grad_norm": 0.19228717684745789, + "learning_rate": 0.0001, + "loss": 1.7247, + "step": 171 + }, + { + "epoch": 0.08392973324658426, + "grad_norm": 0.1909552961587906, + "learning_rate": 0.0001, + "loss": 1.7973, + "step": 172 + }, + { + "epoch": 0.08441769681197138, + "grad_norm": 0.18189294636249542, + "learning_rate": 0.0001, + "loss": 1.7579, + "step": 173 + }, + { + "epoch": 0.08490566037735849, + "grad_norm": 0.19137217104434967, + "learning_rate": 0.0001, + "loss": 1.7198, + "step": 174 + }, + { + "epoch": 0.0853936239427456, + "grad_norm": 0.18612581491470337, + "learning_rate": 0.0001, + "loss": 1.7585, + "step": 175 + }, + { + "epoch": 0.08588158750813273, + "grad_norm": 0.1759909838438034, + "learning_rate": 0.0001, + "loss": 1.6732, + "step": 176 + }, + { + "epoch": 0.08636955107351985, + "grad_norm": 0.18982531130313873, + "learning_rate": 0.0001, + "loss": 1.8301, + "step": 177 + }, + { + "epoch": 0.08685751463890697, + "grad_norm": 0.16662733256816864, + "learning_rate": 0.0001, + "loss": 1.6799, + "step": 178 + }, + { + "epoch": 0.08734547820429409, + "grad_norm": 0.17956425249576569, + "learning_rate": 0.0001, + "loss": 1.671, + "step": 179 + }, + { + "epoch": 0.08783344176968119, + "grad_norm": 0.18416181206703186, + "learning_rate": 0.0001, + "loss": 1.7922, + "step": 180 + }, + { + "epoch": 0.08832140533506831, + "grad_norm": 0.16633754968643188, + "learning_rate": 0.0001, + "loss": 1.7096, + "step": 181 + }, + { + "epoch": 0.08880936890045543, + "grad_norm": 0.19759412109851837, + "learning_rate": 0.0001, + "loss": 1.8402, + "step": 182 + }, + { + "epoch": 0.08929733246584255, + "grad_norm": 0.17006362974643707, + "learning_rate": 0.0001, + "loss": 1.6922, + "step": 183 + }, + { + "epoch": 0.08978529603122967, + "grad_norm": 0.16919896006584167, + "learning_rate": 0.0001, + "loss": 1.6657, + "step": 184 + }, + { + "epoch": 0.09027325959661679, + "grad_norm": 0.20307502150535583, + "learning_rate": 0.0001, + "loss": 1.8772, + "step": 185 + }, + { + "epoch": 0.0907612231620039, + "grad_norm": 0.17572732269763947, + "learning_rate": 0.0001, + "loss": 1.7666, + "step": 186 + }, + { + "epoch": 0.09124918672739102, + "grad_norm": 0.17327293753623962, + "learning_rate": 0.0001, + "loss": 1.8206, + "step": 187 + }, + { + "epoch": 0.09173715029277814, + "grad_norm": 0.18354281783103943, + "learning_rate": 0.0001, + "loss": 1.8013, + "step": 188 + }, + { + "epoch": 0.09222511385816526, + "grad_norm": 0.16821032762527466, + "learning_rate": 0.0001, + "loss": 1.6893, + "step": 189 + }, + { + "epoch": 0.09271307742355238, + "grad_norm": 0.17506404221057892, + "learning_rate": 0.0001, + "loss": 1.7657, + "step": 190 + }, + { + "epoch": 0.0932010409889395, + "grad_norm": 0.1758153885602951, + "learning_rate": 0.0001, + "loss": 1.7095, + "step": 191 + }, + { + "epoch": 0.0936890045543266, + "grad_norm": 0.18787072598934174, + "learning_rate": 0.0001, + "loss": 1.7312, + "step": 192 + }, + { + "epoch": 0.09417696811971372, + "grad_norm": 0.1803017109632492, + "learning_rate": 0.0001, + "loss": 1.7521, + "step": 193 + }, + { + "epoch": 0.09466493168510084, + "grad_norm": 0.18097610771656036, + "learning_rate": 0.0001, + "loss": 1.6861, + "step": 194 + }, + { + "epoch": 0.09515289525048796, + "grad_norm": 0.1760302186012268, + "learning_rate": 0.0001, + "loss": 1.6703, + "step": 195 + }, + { + "epoch": 0.09564085881587508, + "grad_norm": 0.17225316166877747, + "learning_rate": 0.0001, + "loss": 1.73, + "step": 196 + }, + { + "epoch": 0.0961288223812622, + "grad_norm": 0.1856345683336258, + "learning_rate": 0.0001, + "loss": 1.6828, + "step": 197 + }, + { + "epoch": 0.09661678594664931, + "grad_norm": 0.18595090508460999, + "learning_rate": 0.0001, + "loss": 1.7136, + "step": 198 + }, + { + "epoch": 0.09710474951203643, + "grad_norm": 0.1780211329460144, + "learning_rate": 0.0001, + "loss": 1.8146, + "step": 199 + }, + { + "epoch": 0.09759271307742355, + "grad_norm": 0.17781271040439606, + "learning_rate": 0.0001, + "loss": 1.6679, + "step": 200 + }, + { + "epoch": 0.09808067664281067, + "grad_norm": 0.17124401032924652, + "learning_rate": 0.0001, + "loss": 1.7077, + "step": 201 + }, + { + "epoch": 0.09856864020819779, + "grad_norm": 0.18443076312541962, + "learning_rate": 0.0001, + "loss": 1.8058, + "step": 202 + }, + { + "epoch": 0.09905660377358491, + "grad_norm": 0.1758834272623062, + "learning_rate": 0.0001, + "loss": 1.81, + "step": 203 + }, + { + "epoch": 0.09954456733897202, + "grad_norm": 0.17878177762031555, + "learning_rate": 0.0001, + "loss": 1.7515, + "step": 204 + }, + { + "epoch": 0.10003253090435914, + "grad_norm": 0.18028298020362854, + "learning_rate": 0.0001, + "loss": 1.7733, + "step": 205 + }, + { + "epoch": 0.10052049446974626, + "grad_norm": 0.17935384809970856, + "learning_rate": 0.0001, + "loss": 1.8011, + "step": 206 + }, + { + "epoch": 0.10100845803513338, + "grad_norm": 0.19665150344371796, + "learning_rate": 0.0001, + "loss": 1.7667, + "step": 207 + }, + { + "epoch": 0.1014964216005205, + "grad_norm": 0.16669659316539764, + "learning_rate": 0.0001, + "loss": 1.7046, + "step": 208 + }, + { + "epoch": 0.10198438516590762, + "grad_norm": 0.17783086001873016, + "learning_rate": 0.0001, + "loss": 1.6424, + "step": 209 + }, + { + "epoch": 0.10247234873129472, + "grad_norm": 0.1761302351951599, + "learning_rate": 0.0001, + "loss": 1.726, + "step": 210 + }, + { + "epoch": 0.10296031229668184, + "grad_norm": 0.17417997121810913, + "learning_rate": 0.0001, + "loss": 1.7181, + "step": 211 + }, + { + "epoch": 0.10344827586206896, + "grad_norm": 0.17537769675254822, + "learning_rate": 0.0001, + "loss": 1.6876, + "step": 212 + }, + { + "epoch": 0.10393623942745608, + "grad_norm": 0.16924896836280823, + "learning_rate": 0.0001, + "loss": 1.768, + "step": 213 + }, + { + "epoch": 0.1044242029928432, + "grad_norm": 0.20247921347618103, + "learning_rate": 0.0001, + "loss": 1.9159, + "step": 214 + }, + { + "epoch": 0.10491216655823032, + "grad_norm": 0.16506172716617584, + "learning_rate": 0.0001, + "loss": 1.6491, + "step": 215 + }, + { + "epoch": 0.10540013012361743, + "grad_norm": 0.17558075487613678, + "learning_rate": 0.0001, + "loss": 1.7169, + "step": 216 + }, + { + "epoch": 0.10588809368900455, + "grad_norm": 0.17124514281749725, + "learning_rate": 0.0001, + "loss": 1.6931, + "step": 217 + }, + { + "epoch": 0.10637605725439167, + "grad_norm": 0.16885621845722198, + "learning_rate": 0.0001, + "loss": 1.6946, + "step": 218 + }, + { + "epoch": 0.10686402081977879, + "grad_norm": 0.17787247896194458, + "learning_rate": 0.0001, + "loss": 1.7477, + "step": 219 + }, + { + "epoch": 0.10735198438516591, + "grad_norm": 0.17979493737220764, + "learning_rate": 0.0001, + "loss": 1.7215, + "step": 220 + }, + { + "epoch": 0.10783994795055303, + "grad_norm": 0.187989741563797, + "learning_rate": 0.0001, + "loss": 1.6946, + "step": 221 + }, + { + "epoch": 0.10832791151594014, + "grad_norm": 0.18497705459594727, + "learning_rate": 0.0001, + "loss": 1.7725, + "step": 222 + }, + { + "epoch": 0.10881587508132726, + "grad_norm": 0.1895315796136856, + "learning_rate": 0.0001, + "loss": 1.7455, + "step": 223 + }, + { + "epoch": 0.10930383864671438, + "grad_norm": 0.17897574603557587, + "learning_rate": 0.0001, + "loss": 1.7297, + "step": 224 + }, + { + "epoch": 0.1097918022121015, + "grad_norm": 0.18770314753055573, + "learning_rate": 0.0001, + "loss": 1.7948, + "step": 225 + }, + { + "epoch": 0.11027976577748862, + "grad_norm": 0.1812209188938141, + "learning_rate": 0.0001, + "loss": 1.8229, + "step": 226 + }, + { + "epoch": 0.11076772934287574, + "grad_norm": 0.17030760645866394, + "learning_rate": 0.0001, + "loss": 1.6029, + "step": 227 + }, + { + "epoch": 0.11125569290826284, + "grad_norm": 0.18503767251968384, + "learning_rate": 0.0001, + "loss": 1.644, + "step": 228 + }, + { + "epoch": 0.11174365647364996, + "grad_norm": 0.17443233728408813, + "learning_rate": 0.0001, + "loss": 1.7024, + "step": 229 + }, + { + "epoch": 0.11223162003903708, + "grad_norm": 0.1859743744134903, + "learning_rate": 0.0001, + "loss": 1.7859, + "step": 230 + }, + { + "epoch": 0.1127195836044242, + "grad_norm": 0.1692182421684265, + "learning_rate": 0.0001, + "loss": 1.6996, + "step": 231 + }, + { + "epoch": 0.11320754716981132, + "grad_norm": 0.16695043444633484, + "learning_rate": 0.0001, + "loss": 1.7185, + "step": 232 + }, + { + "epoch": 0.11369551073519844, + "grad_norm": 0.18184787034988403, + "learning_rate": 0.0001, + "loss": 1.712, + "step": 233 + }, + { + "epoch": 0.11418347430058555, + "grad_norm": 0.19107092916965485, + "learning_rate": 0.0001, + "loss": 1.8902, + "step": 234 + }, + { + "epoch": 0.11467143786597267, + "grad_norm": 0.1724960058927536, + "learning_rate": 0.0001, + "loss": 1.7464, + "step": 235 + }, + { + "epoch": 0.11515940143135979, + "grad_norm": 0.17673127353191376, + "learning_rate": 0.0001, + "loss": 1.785, + "step": 236 + }, + { + "epoch": 0.11564736499674691, + "grad_norm": 0.18474438786506653, + "learning_rate": 0.0001, + "loss": 1.8143, + "step": 237 + }, + { + "epoch": 0.11613532856213403, + "grad_norm": 0.17361678183078766, + "learning_rate": 0.0001, + "loss": 1.7558, + "step": 238 + }, + { + "epoch": 0.11662329212752115, + "grad_norm": 0.17701455950737, + "learning_rate": 0.0001, + "loss": 1.5568, + "step": 239 + }, + { + "epoch": 0.11711125569290826, + "grad_norm": 0.18372413516044617, + "learning_rate": 0.0001, + "loss": 1.7913, + "step": 240 + }, + { + "epoch": 0.11759921925829538, + "grad_norm": 0.17780154943466187, + "learning_rate": 0.0001, + "loss": 1.668, + "step": 241 + }, + { + "epoch": 0.1180871828236825, + "grad_norm": 0.17763271927833557, + "learning_rate": 0.0001, + "loss": 1.7006, + "step": 242 + }, + { + "epoch": 0.11857514638906962, + "grad_norm": 0.17323441803455353, + "learning_rate": 0.0001, + "loss": 1.5985, + "step": 243 + }, + { + "epoch": 0.11906310995445674, + "grad_norm": 0.1981297731399536, + "learning_rate": 0.0001, + "loss": 1.7938, + "step": 244 + }, + { + "epoch": 0.11955107351984386, + "grad_norm": 0.1856129914522171, + "learning_rate": 0.0001, + "loss": 1.7469, + "step": 245 + }, + { + "epoch": 0.12003903708523098, + "grad_norm": 0.17878711223602295, + "learning_rate": 0.0001, + "loss": 1.7156, + "step": 246 + }, + { + "epoch": 0.12052700065061808, + "grad_norm": 0.18860337138175964, + "learning_rate": 0.0001, + "loss": 1.6269, + "step": 247 + }, + { + "epoch": 0.1210149642160052, + "grad_norm": 0.17960023880004883, + "learning_rate": 0.0001, + "loss": 1.7484, + "step": 248 + }, + { + "epoch": 0.12150292778139232, + "grad_norm": 0.21390804648399353, + "learning_rate": 0.0001, + "loss": 1.7815, + "step": 249 + }, + { + "epoch": 0.12199089134677944, + "grad_norm": 0.18213345110416412, + "learning_rate": 0.0001, + "loss": 1.8368, + "step": 250 + }, + { + "epoch": 0.12247885491216656, + "grad_norm": 0.19667306542396545, + "learning_rate": 0.0001, + "loss": 1.7547, + "step": 251 + }, + { + "epoch": 0.12296681847755368, + "grad_norm": 0.18796378374099731, + "learning_rate": 0.0001, + "loss": 1.6831, + "step": 252 + }, + { + "epoch": 0.12345478204294079, + "grad_norm": 0.18432985246181488, + "learning_rate": 0.0001, + "loss": 1.8219, + "step": 253 + }, + { + "epoch": 0.12394274560832791, + "grad_norm": 0.19263121485710144, + "learning_rate": 0.0001, + "loss": 1.7033, + "step": 254 + }, + { + "epoch": 0.12443070917371503, + "grad_norm": 0.19383201003074646, + "learning_rate": 0.0001, + "loss": 1.723, + "step": 255 + }, + { + "epoch": 0.12491867273910215, + "grad_norm": 0.17456290125846863, + "learning_rate": 0.0001, + "loss": 1.7354, + "step": 256 + }, + { + "epoch": 0.12540663630448926, + "grad_norm": 0.2073334902524948, + "learning_rate": 0.0001, + "loss": 1.7359, + "step": 257 + }, + { + "epoch": 0.1258945998698764, + "grad_norm": 0.1819145232439041, + "learning_rate": 0.0001, + "loss": 1.661, + "step": 258 + }, + { + "epoch": 0.1263825634352635, + "grad_norm": 0.18823570013046265, + "learning_rate": 0.0001, + "loss": 1.7093, + "step": 259 + }, + { + "epoch": 0.12687052700065063, + "grad_norm": 0.2142113894224167, + "learning_rate": 0.0001, + "loss": 1.7367, + "step": 260 + }, + { + "epoch": 0.12735849056603774, + "grad_norm": 0.17133839428424835, + "learning_rate": 0.0001, + "loss": 1.7257, + "step": 261 + }, + { + "epoch": 0.12784645413142484, + "grad_norm": 0.20852066576480865, + "learning_rate": 0.0001, + "loss": 1.7453, + "step": 262 + }, + { + "epoch": 0.12833441769681198, + "grad_norm": 0.19172458350658417, + "learning_rate": 0.0001, + "loss": 1.817, + "step": 263 + }, + { + "epoch": 0.12882238126219908, + "grad_norm": 0.1805960088968277, + "learning_rate": 0.0001, + "loss": 1.7679, + "step": 264 + }, + { + "epoch": 0.12931034482758622, + "grad_norm": 0.2055218368768692, + "learning_rate": 0.0001, + "loss": 1.7874, + "step": 265 + }, + { + "epoch": 0.12979830839297332, + "grad_norm": 0.16831174492835999, + "learning_rate": 0.0001, + "loss": 1.6342, + "step": 266 + }, + { + "epoch": 0.13028627195836046, + "grad_norm": 0.17563872039318085, + "learning_rate": 0.0001, + "loss": 1.7768, + "step": 267 + }, + { + "epoch": 0.13077423552374756, + "grad_norm": 0.1891409158706665, + "learning_rate": 0.0001, + "loss": 1.7653, + "step": 268 + }, + { + "epoch": 0.13126219908913467, + "grad_norm": 0.2160748541355133, + "learning_rate": 0.0001, + "loss": 1.6957, + "step": 269 + }, + { + "epoch": 0.1317501626545218, + "grad_norm": 0.16802331805229187, + "learning_rate": 0.0001, + "loss": 1.6474, + "step": 270 + }, + { + "epoch": 0.1322381262199089, + "grad_norm": 0.21498991549015045, + "learning_rate": 0.0001, + "loss": 1.7201, + "step": 271 + }, + { + "epoch": 0.13272608978529604, + "grad_norm": 0.1941365897655487, + "learning_rate": 0.0001, + "loss": 1.7387, + "step": 272 + }, + { + "epoch": 0.13321405335068315, + "grad_norm": 0.19020740687847137, + "learning_rate": 0.0001, + "loss": 1.6985, + "step": 273 + }, + { + "epoch": 0.13370201691607025, + "grad_norm": 0.18627683818340302, + "learning_rate": 0.0001, + "loss": 1.7752, + "step": 274 + }, + { + "epoch": 0.1341899804814574, + "grad_norm": 0.1916990429162979, + "learning_rate": 0.0001, + "loss": 1.7438, + "step": 275 + }, + { + "epoch": 0.1346779440468445, + "grad_norm": 0.18649545311927795, + "learning_rate": 0.0001, + "loss": 1.663, + "step": 276 + }, + { + "epoch": 0.13516590761223163, + "grad_norm": 0.17986956238746643, + "learning_rate": 0.0001, + "loss": 1.7905, + "step": 277 + }, + { + "epoch": 0.13565387117761873, + "grad_norm": 0.18601469695568085, + "learning_rate": 0.0001, + "loss": 1.5608, + "step": 278 + }, + { + "epoch": 0.13614183474300587, + "grad_norm": 0.19612380862236023, + "learning_rate": 0.0001, + "loss": 1.7317, + "step": 279 + }, + { + "epoch": 0.13662979830839297, + "grad_norm": 0.17528840899467468, + "learning_rate": 0.0001, + "loss": 1.7114, + "step": 280 + }, + { + "epoch": 0.13711776187378008, + "grad_norm": 0.196456179022789, + "learning_rate": 0.0001, + "loss": 1.674, + "step": 281 + }, + { + "epoch": 0.13760572543916721, + "grad_norm": 0.18218737840652466, + "learning_rate": 0.0001, + "loss": 1.6971, + "step": 282 + }, + { + "epoch": 0.13809368900455432, + "grad_norm": 0.18146923184394836, + "learning_rate": 0.0001, + "loss": 1.7656, + "step": 283 + }, + { + "epoch": 0.13858165256994145, + "grad_norm": 0.17707045376300812, + "learning_rate": 0.0001, + "loss": 1.6322, + "step": 284 + }, + { + "epoch": 0.13906961613532856, + "grad_norm": 0.18990135192871094, + "learning_rate": 0.0001, + "loss": 1.7412, + "step": 285 + }, + { + "epoch": 0.13955757970071567, + "grad_norm": 0.17993967235088348, + "learning_rate": 0.0001, + "loss": 1.6734, + "step": 286 + }, + { + "epoch": 0.1400455432661028, + "grad_norm": 0.20445284247398376, + "learning_rate": 0.0001, + "loss": 1.9164, + "step": 287 + }, + { + "epoch": 0.1405335068314899, + "grad_norm": 0.18881991505622864, + "learning_rate": 0.0001, + "loss": 1.8395, + "step": 288 + }, + { + "epoch": 0.14102147039687704, + "grad_norm": 0.17268231511116028, + "learning_rate": 0.0001, + "loss": 1.6494, + "step": 289 + }, + { + "epoch": 0.14150943396226415, + "grad_norm": 0.17375007271766663, + "learning_rate": 0.0001, + "loss": 1.6968, + "step": 290 + }, + { + "epoch": 0.14199739752765128, + "grad_norm": 0.17844517529010773, + "learning_rate": 0.0001, + "loss": 1.8686, + "step": 291 + }, + { + "epoch": 0.1424853610930384, + "grad_norm": 0.18538935482501984, + "learning_rate": 0.0001, + "loss": 1.8035, + "step": 292 + }, + { + "epoch": 0.1429733246584255, + "grad_norm": 0.18314018845558167, + "learning_rate": 0.0001, + "loss": 1.8051, + "step": 293 + }, + { + "epoch": 0.14346128822381263, + "grad_norm": 0.18008261919021606, + "learning_rate": 0.0001, + "loss": 1.7992, + "step": 294 + }, + { + "epoch": 0.14394925178919973, + "grad_norm": 0.19243541359901428, + "learning_rate": 0.0001, + "loss": 1.7394, + "step": 295 + }, + { + "epoch": 0.14443721535458687, + "grad_norm": 0.18523713946342468, + "learning_rate": 0.0001, + "loss": 1.7845, + "step": 296 + }, + { + "epoch": 0.14492517891997397, + "grad_norm": 0.1781051605939865, + "learning_rate": 0.0001, + "loss": 1.6748, + "step": 297 + }, + { + "epoch": 0.14541314248536108, + "grad_norm": 0.18994836509227753, + "learning_rate": 0.0001, + "loss": 1.704, + "step": 298 + }, + { + "epoch": 0.1459011060507482, + "grad_norm": 0.17285694181919098, + "learning_rate": 0.0001, + "loss": 1.7832, + "step": 299 + }, + { + "epoch": 0.14638906961613532, + "grad_norm": 0.20339974761009216, + "learning_rate": 0.0001, + "loss": 1.7191, + "step": 300 + }, + { + "epoch": 0.14687703318152245, + "grad_norm": 0.17608943581581116, + "learning_rate": 0.0001, + "loss": 1.6315, + "step": 301 + }, + { + "epoch": 0.14736499674690956, + "grad_norm": 0.17653749883174896, + "learning_rate": 0.0001, + "loss": 1.6948, + "step": 302 + }, + { + "epoch": 0.1478529603122967, + "grad_norm": 0.1792931854724884, + "learning_rate": 0.0001, + "loss": 1.7027, + "step": 303 + }, + { + "epoch": 0.1483409238776838, + "grad_norm": 0.18247826397418976, + "learning_rate": 0.0001, + "loss": 1.7433, + "step": 304 + }, + { + "epoch": 0.1488288874430709, + "grad_norm": 0.1712041050195694, + "learning_rate": 0.0001, + "loss": 1.6548, + "step": 305 + }, + { + "epoch": 0.14931685100845804, + "grad_norm": 0.184691920876503, + "learning_rate": 0.0001, + "loss": 1.7226, + "step": 306 + }, + { + "epoch": 0.14980481457384515, + "grad_norm": 0.1834600865840912, + "learning_rate": 0.0001, + "loss": 1.7894, + "step": 307 + }, + { + "epoch": 0.15029277813923228, + "grad_norm": 0.1753443032503128, + "learning_rate": 0.0001, + "loss": 1.636, + "step": 308 + }, + { + "epoch": 0.1507807417046194, + "grad_norm": 0.16590848565101624, + "learning_rate": 0.0001, + "loss": 1.6802, + "step": 309 + }, + { + "epoch": 0.1512687052700065, + "grad_norm": 0.17210128903388977, + "learning_rate": 0.0001, + "loss": 1.758, + "step": 310 + }, + { + "epoch": 0.15175666883539363, + "grad_norm": 0.19016823172569275, + "learning_rate": 0.0001, + "loss": 1.8243, + "step": 311 + }, + { + "epoch": 0.15224463240078073, + "grad_norm": 0.1756354421377182, + "learning_rate": 0.0001, + "loss": 1.7666, + "step": 312 + }, + { + "epoch": 0.15273259596616787, + "grad_norm": 0.19266565144062042, + "learning_rate": 0.0001, + "loss": 1.7856, + "step": 313 + }, + { + "epoch": 0.15322055953155497, + "grad_norm": 0.17626765370368958, + "learning_rate": 0.0001, + "loss": 1.7453, + "step": 314 + }, + { + "epoch": 0.1537085230969421, + "grad_norm": 0.1796361356973648, + "learning_rate": 0.0001, + "loss": 1.8428, + "step": 315 + }, + { + "epoch": 0.1541964866623292, + "grad_norm": 0.1971481889486313, + "learning_rate": 0.0001, + "loss": 1.8298, + "step": 316 + }, + { + "epoch": 0.15468445022771632, + "grad_norm": 0.17479249835014343, + "learning_rate": 0.0001, + "loss": 1.7243, + "step": 317 + }, + { + "epoch": 0.15517241379310345, + "grad_norm": 0.18558745086193085, + "learning_rate": 0.0001, + "loss": 1.8265, + "step": 318 + }, + { + "epoch": 0.15566037735849056, + "grad_norm": 0.17821088433265686, + "learning_rate": 0.0001, + "loss": 1.6735, + "step": 319 + }, + { + "epoch": 0.1561483409238777, + "grad_norm": 0.17939302325248718, + "learning_rate": 0.0001, + "loss": 1.7158, + "step": 320 + }, + { + "epoch": 0.1566363044892648, + "grad_norm": 0.17538347840309143, + "learning_rate": 0.0001, + "loss": 1.7467, + "step": 321 + }, + { + "epoch": 0.1571242680546519, + "grad_norm": 0.1796545684337616, + "learning_rate": 0.0001, + "loss": 1.7148, + "step": 322 + }, + { + "epoch": 0.15761223162003904, + "grad_norm": 0.19828006625175476, + "learning_rate": 0.0001, + "loss": 1.8431, + "step": 323 + }, + { + "epoch": 0.15810019518542615, + "grad_norm": 0.17246133089065552, + "learning_rate": 0.0001, + "loss": 1.7291, + "step": 324 + }, + { + "epoch": 0.15858815875081328, + "grad_norm": 0.1835339218378067, + "learning_rate": 0.0001, + "loss": 1.7319, + "step": 325 + }, + { + "epoch": 0.15907612231620039, + "grad_norm": 0.18122561275959015, + "learning_rate": 0.0001, + "loss": 1.7263, + "step": 326 + }, + { + "epoch": 0.15956408588158752, + "grad_norm": 0.19297321140766144, + "learning_rate": 0.0001, + "loss": 1.8792, + "step": 327 + }, + { + "epoch": 0.16005204944697463, + "grad_norm": 0.1762656420469284, + "learning_rate": 0.0001, + "loss": 1.7495, + "step": 328 + }, + { + "epoch": 0.16054001301236173, + "grad_norm": 0.17146944999694824, + "learning_rate": 0.0001, + "loss": 1.7089, + "step": 329 + }, + { + "epoch": 0.16102797657774887, + "grad_norm": 0.17192597687244415, + "learning_rate": 0.0001, + "loss": 1.694, + "step": 330 + }, + { + "epoch": 0.16151594014313597, + "grad_norm": 0.17271386086940765, + "learning_rate": 0.0001, + "loss": 1.6223, + "step": 331 + }, + { + "epoch": 0.1620039037085231, + "grad_norm": 0.17589011788368225, + "learning_rate": 0.0001, + "loss": 1.7123, + "step": 332 + }, + { + "epoch": 0.1624918672739102, + "grad_norm": 0.17920418083667755, + "learning_rate": 0.0001, + "loss": 1.6938, + "step": 333 + }, + { + "epoch": 0.16297983083929735, + "grad_norm": 0.16645678877830505, + "learning_rate": 0.0001, + "loss": 1.6704, + "step": 334 + }, + { + "epoch": 0.16346779440468445, + "grad_norm": 0.1698988974094391, + "learning_rate": 0.0001, + "loss": 1.7562, + "step": 335 + }, + { + "epoch": 0.16395575797007156, + "grad_norm": 0.17255748808383942, + "learning_rate": 0.0001, + "loss": 1.7408, + "step": 336 + }, + { + "epoch": 0.1644437215354587, + "grad_norm": 0.16908328235149384, + "learning_rate": 0.0001, + "loss": 1.711, + "step": 337 + }, + { + "epoch": 0.1649316851008458, + "grad_norm": 0.17891424894332886, + "learning_rate": 0.0001, + "loss": 1.7199, + "step": 338 + }, + { + "epoch": 0.16541964866623293, + "grad_norm": 0.17500531673431396, + "learning_rate": 0.0001, + "loss": 1.8027, + "step": 339 + }, + { + "epoch": 0.16590761223162004, + "grad_norm": 0.1908222734928131, + "learning_rate": 0.0001, + "loss": 1.7267, + "step": 340 + }, + { + "epoch": 0.16639557579700714, + "grad_norm": 0.16457560658454895, + "learning_rate": 0.0001, + "loss": 1.6551, + "step": 341 + }, + { + "epoch": 0.16688353936239428, + "grad_norm": 0.17455148696899414, + "learning_rate": 0.0001, + "loss": 1.7536, + "step": 342 + }, + { + "epoch": 0.16737150292778138, + "grad_norm": 0.24865932762622833, + "learning_rate": 0.0001, + "loss": 1.7038, + "step": 343 + }, + { + "epoch": 0.16785946649316852, + "grad_norm": 0.16769102215766907, + "learning_rate": 0.0001, + "loss": 1.6666, + "step": 344 + }, + { + "epoch": 0.16834743005855562, + "grad_norm": 0.17845629155635834, + "learning_rate": 0.0001, + "loss": 1.7729, + "step": 345 + }, + { + "epoch": 0.16883539362394276, + "grad_norm": 0.18893101811408997, + "learning_rate": 0.0001, + "loss": 1.6953, + "step": 346 + }, + { + "epoch": 0.16932335718932987, + "grad_norm": 0.17489705979824066, + "learning_rate": 0.0001, + "loss": 1.6451, + "step": 347 + }, + { + "epoch": 0.16981132075471697, + "grad_norm": 0.1895252764225006, + "learning_rate": 0.0001, + "loss": 1.6664, + "step": 348 + }, + { + "epoch": 0.1702992843201041, + "grad_norm": 0.18796460330486298, + "learning_rate": 0.0001, + "loss": 1.8179, + "step": 349 + }, + { + "epoch": 0.1707872478854912, + "grad_norm": 0.18239444494247437, + "learning_rate": 0.0001, + "loss": 1.7895, + "step": 350 + }, + { + "epoch": 0.17127521145087835, + "grad_norm": 0.18578602373600006, + "learning_rate": 0.0001, + "loss": 1.7201, + "step": 351 + }, + { + "epoch": 0.17176317501626545, + "grad_norm": 0.17505811154842377, + "learning_rate": 0.0001, + "loss": 1.6738, + "step": 352 + }, + { + "epoch": 0.17225113858165256, + "grad_norm": 0.16880185902118683, + "learning_rate": 0.0001, + "loss": 1.7064, + "step": 353 + }, + { + "epoch": 0.1727391021470397, + "grad_norm": 0.1847655326128006, + "learning_rate": 0.0001, + "loss": 1.6227, + "step": 354 + }, + { + "epoch": 0.1732270657124268, + "grad_norm": 0.18033885955810547, + "learning_rate": 0.0001, + "loss": 1.7613, + "step": 355 + }, + { + "epoch": 0.17371502927781393, + "grad_norm": 0.2022799551486969, + "learning_rate": 0.0001, + "loss": 1.6975, + "step": 356 + }, + { + "epoch": 0.17420299284320104, + "grad_norm": 0.18487118184566498, + "learning_rate": 0.0001, + "loss": 1.6245, + "step": 357 + }, + { + "epoch": 0.17469095640858817, + "grad_norm": 0.18200282752513885, + "learning_rate": 0.0001, + "loss": 1.8013, + "step": 358 + }, + { + "epoch": 0.17517891997397528, + "grad_norm": 0.16840700805187225, + "learning_rate": 0.0001, + "loss": 1.6904, + "step": 359 + }, + { + "epoch": 0.17566688353936238, + "grad_norm": 0.17556121945381165, + "learning_rate": 0.0001, + "loss": 1.7331, + "step": 360 + }, + { + "epoch": 0.17615484710474952, + "grad_norm": 0.18641792237758636, + "learning_rate": 0.0001, + "loss": 1.8248, + "step": 361 + }, + { + "epoch": 0.17664281067013662, + "grad_norm": 0.16753801703453064, + "learning_rate": 0.0001, + "loss": 1.591, + "step": 362 + }, + { + "epoch": 0.17713077423552376, + "grad_norm": 0.16265541315078735, + "learning_rate": 0.0001, + "loss": 1.5814, + "step": 363 + }, + { + "epoch": 0.17761873780091086, + "grad_norm": 0.17881396412849426, + "learning_rate": 0.0001, + "loss": 1.8452, + "step": 364 + }, + { + "epoch": 0.17810670136629797, + "grad_norm": 0.18160590529441833, + "learning_rate": 0.0001, + "loss": 1.7977, + "step": 365 + }, + { + "epoch": 0.1785946649316851, + "grad_norm": 0.1778435856103897, + "learning_rate": 0.0001, + "loss": 1.7319, + "step": 366 + }, + { + "epoch": 0.1790826284970722, + "grad_norm": 0.17236903309822083, + "learning_rate": 0.0001, + "loss": 1.6572, + "step": 367 + }, + { + "epoch": 0.17957059206245934, + "grad_norm": 0.16980677843093872, + "learning_rate": 0.0001, + "loss": 1.6814, + "step": 368 + }, + { + "epoch": 0.18005855562784645, + "grad_norm": 0.17113539576530457, + "learning_rate": 0.0001, + "loss": 1.5835, + "step": 369 + }, + { + "epoch": 0.18054651919323358, + "grad_norm": 0.22926300764083862, + "learning_rate": 0.0001, + "loss": 1.7127, + "step": 370 + }, + { + "epoch": 0.1810344827586207, + "grad_norm": 0.1766396313905716, + "learning_rate": 0.0001, + "loss": 1.8002, + "step": 371 + }, + { + "epoch": 0.1815224463240078, + "grad_norm": 0.1911155730485916, + "learning_rate": 0.0001, + "loss": 1.7287, + "step": 372 + }, + { + "epoch": 0.18201040988939493, + "grad_norm": 0.1996450275182724, + "learning_rate": 0.0001, + "loss": 1.5601, + "step": 373 + }, + { + "epoch": 0.18249837345478204, + "grad_norm": 0.17531970143318176, + "learning_rate": 0.0001, + "loss": 1.674, + "step": 374 + }, + { + "epoch": 0.18298633702016917, + "grad_norm": 0.19017955660820007, + "learning_rate": 0.0001, + "loss": 1.8052, + "step": 375 + }, + { + "epoch": 0.18347430058555628, + "grad_norm": 0.195291206240654, + "learning_rate": 0.0001, + "loss": 1.6787, + "step": 376 + }, + { + "epoch": 0.18396226415094338, + "grad_norm": 0.18030132353305817, + "learning_rate": 0.0001, + "loss": 1.6931, + "step": 377 + }, + { + "epoch": 0.18445022771633052, + "grad_norm": 0.1725359857082367, + "learning_rate": 0.0001, + "loss": 1.5814, + "step": 378 + }, + { + "epoch": 0.18493819128171762, + "grad_norm": 0.18235339224338531, + "learning_rate": 0.0001, + "loss": 1.7759, + "step": 379 + }, + { + "epoch": 0.18542615484710476, + "grad_norm": 0.19052359461784363, + "learning_rate": 0.0001, + "loss": 1.7898, + "step": 380 + }, + { + "epoch": 0.18591411841249186, + "grad_norm": 0.1713322550058365, + "learning_rate": 0.0001, + "loss": 1.623, + "step": 381 + }, + { + "epoch": 0.186402081977879, + "grad_norm": 0.19699741899967194, + "learning_rate": 0.0001, + "loss": 1.7517, + "step": 382 + }, + { + "epoch": 0.1868900455432661, + "grad_norm": 0.17510955035686493, + "learning_rate": 0.0001, + "loss": 1.7045, + "step": 383 + }, + { + "epoch": 0.1873780091086532, + "grad_norm": 0.17883911728858948, + "learning_rate": 0.0001, + "loss": 1.6763, + "step": 384 + }, + { + "epoch": 0.18786597267404034, + "grad_norm": 0.18562713265419006, + "learning_rate": 0.0001, + "loss": 1.6603, + "step": 385 + }, + { + "epoch": 0.18835393623942745, + "grad_norm": 0.18200963735580444, + "learning_rate": 0.0001, + "loss": 1.7698, + "step": 386 + }, + { + "epoch": 0.18884189980481458, + "grad_norm": 0.192865788936615, + "learning_rate": 0.0001, + "loss": 1.8058, + "step": 387 + }, + { + "epoch": 0.1893298633702017, + "grad_norm": 0.17498141527175903, + "learning_rate": 0.0001, + "loss": 1.657, + "step": 388 + }, + { + "epoch": 0.1898178269355888, + "grad_norm": 0.17550218105316162, + "learning_rate": 0.0001, + "loss": 1.7638, + "step": 389 + }, + { + "epoch": 0.19030579050097593, + "grad_norm": 0.19263967871665955, + "learning_rate": 0.0001, + "loss": 1.7375, + "step": 390 + }, + { + "epoch": 0.19079375406636304, + "grad_norm": 0.1728338897228241, + "learning_rate": 0.0001, + "loss": 1.7467, + "step": 391 + }, + { + "epoch": 0.19128171763175017, + "grad_norm": 0.17929600179195404, + "learning_rate": 0.0001, + "loss": 1.6489, + "step": 392 + }, + { + "epoch": 0.19176968119713728, + "grad_norm": 0.18325988948345184, + "learning_rate": 0.0001, + "loss": 1.8676, + "step": 393 + }, + { + "epoch": 0.1922576447625244, + "grad_norm": 0.17365989089012146, + "learning_rate": 0.0001, + "loss": 1.6916, + "step": 394 + }, + { + "epoch": 0.19274560832791152, + "grad_norm": 0.17361170053482056, + "learning_rate": 0.0001, + "loss": 1.7118, + "step": 395 + }, + { + "epoch": 0.19323357189329862, + "grad_norm": 0.181492879986763, + "learning_rate": 0.0001, + "loss": 1.7197, + "step": 396 + }, + { + "epoch": 0.19372153545868576, + "grad_norm": 0.19113008677959442, + "learning_rate": 0.0001, + "loss": 1.788, + "step": 397 + }, + { + "epoch": 0.19420949902407286, + "grad_norm": 0.173355832695961, + "learning_rate": 0.0001, + "loss": 1.6866, + "step": 398 + }, + { + "epoch": 0.19469746258946, + "grad_norm": 0.1797139197587967, + "learning_rate": 0.0001, + "loss": 1.7505, + "step": 399 + }, + { + "epoch": 0.1951854261548471, + "grad_norm": 0.18337444961071014, + "learning_rate": 0.0001, + "loss": 1.7099, + "step": 400 + }, + { + "epoch": 0.1956733897202342, + "grad_norm": 0.17387695610523224, + "learning_rate": 0.0001, + "loss": 1.737, + "step": 401 + }, + { + "epoch": 0.19616135328562134, + "grad_norm": 0.1695685237646103, + "learning_rate": 0.0001, + "loss": 1.6916, + "step": 402 + }, + { + "epoch": 0.19664931685100845, + "grad_norm": 0.1874959021806717, + "learning_rate": 0.0001, + "loss": 1.6919, + "step": 403 + }, + { + "epoch": 0.19713728041639558, + "grad_norm": 0.17886492609977722, + "learning_rate": 0.0001, + "loss": 1.737, + "step": 404 + }, + { + "epoch": 0.1976252439817827, + "grad_norm": 0.19390465319156647, + "learning_rate": 0.0001, + "loss": 1.8003, + "step": 405 + }, + { + "epoch": 0.19811320754716982, + "grad_norm": 0.17292645573616028, + "learning_rate": 0.0001, + "loss": 1.6714, + "step": 406 + }, + { + "epoch": 0.19860117111255693, + "grad_norm": 0.16998599469661713, + "learning_rate": 0.0001, + "loss": 1.7242, + "step": 407 + }, + { + "epoch": 0.19908913467794404, + "grad_norm": 0.18668459355831146, + "learning_rate": 0.0001, + "loss": 1.7025, + "step": 408 + }, + { + "epoch": 0.19957709824333117, + "grad_norm": 0.16807502508163452, + "learning_rate": 0.0001, + "loss": 1.6738, + "step": 409 + }, + { + "epoch": 0.20006506180871828, + "grad_norm": 0.1849876344203949, + "learning_rate": 0.0001, + "loss": 1.8173, + "step": 410 + }, + { + "epoch": 0.2005530253741054, + "grad_norm": 0.18935902416706085, + "learning_rate": 0.0001, + "loss": 1.7108, + "step": 411 + }, + { + "epoch": 0.20104098893949252, + "grad_norm": 0.17630939185619354, + "learning_rate": 0.0001, + "loss": 1.7023, + "step": 412 + }, + { + "epoch": 0.20152895250487965, + "grad_norm": 0.19990061223506927, + "learning_rate": 0.0001, + "loss": 1.6862, + "step": 413 + }, + { + "epoch": 0.20201691607026676, + "grad_norm": 0.18538086116313934, + "learning_rate": 0.0001, + "loss": 1.796, + "step": 414 + }, + { + "epoch": 0.20250487963565386, + "grad_norm": 0.18812508881092072, + "learning_rate": 0.0001, + "loss": 1.7034, + "step": 415 + }, + { + "epoch": 0.202992843201041, + "grad_norm": 0.19069646298885345, + "learning_rate": 0.0001, + "loss": 1.7504, + "step": 416 + }, + { + "epoch": 0.2034808067664281, + "grad_norm": 0.17794154584407806, + "learning_rate": 0.0001, + "loss": 1.6469, + "step": 417 + }, + { + "epoch": 0.20396877033181524, + "grad_norm": 0.17641998827457428, + "learning_rate": 0.0001, + "loss": 1.7526, + "step": 418 + }, + { + "epoch": 0.20445673389720234, + "grad_norm": 0.19693951308727264, + "learning_rate": 0.0001, + "loss": 1.7007, + "step": 419 + }, + { + "epoch": 0.20494469746258945, + "grad_norm": 0.1921786069869995, + "learning_rate": 0.0001, + "loss": 1.7514, + "step": 420 + }, + { + "epoch": 0.20543266102797658, + "grad_norm": 0.1899469792842865, + "learning_rate": 0.0001, + "loss": 1.7508, + "step": 421 + }, + { + "epoch": 0.2059206245933637, + "grad_norm": 0.16994713246822357, + "learning_rate": 0.0001, + "loss": 1.6313, + "step": 422 + }, + { + "epoch": 0.20640858815875082, + "grad_norm": 0.20480570197105408, + "learning_rate": 0.0001, + "loss": 1.7714, + "step": 423 + }, + { + "epoch": 0.20689655172413793, + "grad_norm": 0.20870919525623322, + "learning_rate": 0.0001, + "loss": 1.7782, + "step": 424 + }, + { + "epoch": 0.20738451528952506, + "grad_norm": 0.18410471081733704, + "learning_rate": 0.0001, + "loss": 1.72, + "step": 425 + }, + { + "epoch": 0.20787247885491217, + "grad_norm": 0.23531974852085114, + "learning_rate": 0.0001, + "loss": 1.8923, + "step": 426 + }, + { + "epoch": 0.20836044242029927, + "grad_norm": 0.18552608788013458, + "learning_rate": 0.0001, + "loss": 1.7272, + "step": 427 + }, + { + "epoch": 0.2088484059856864, + "grad_norm": 0.2085346281528473, + "learning_rate": 0.0001, + "loss": 1.6953, + "step": 428 + }, + { + "epoch": 0.20933636955107351, + "grad_norm": 0.1959279626607895, + "learning_rate": 0.0001, + "loss": 1.6288, + "step": 429 + }, + { + "epoch": 0.20982433311646065, + "grad_norm": 0.17610879242420197, + "learning_rate": 0.0001, + "loss": 1.7151, + "step": 430 + }, + { + "epoch": 0.21031229668184775, + "grad_norm": 0.1928284466266632, + "learning_rate": 0.0001, + "loss": 1.687, + "step": 431 + }, + { + "epoch": 0.21080026024723486, + "grad_norm": 0.199452742934227, + "learning_rate": 0.0001, + "loss": 1.7704, + "step": 432 + }, + { + "epoch": 0.211288223812622, + "grad_norm": 0.18074338138103485, + "learning_rate": 0.0001, + "loss": 1.7899, + "step": 433 + }, + { + "epoch": 0.2117761873780091, + "grad_norm": 0.19121356308460236, + "learning_rate": 0.0001, + "loss": 1.694, + "step": 434 + }, + { + "epoch": 0.21226415094339623, + "grad_norm": 0.18307030200958252, + "learning_rate": 0.0001, + "loss": 1.6335, + "step": 435 + }, + { + "epoch": 0.21275211450878334, + "grad_norm": 0.18400311470031738, + "learning_rate": 0.0001, + "loss": 1.7526, + "step": 436 + }, + { + "epoch": 0.21324007807417047, + "grad_norm": 0.1944567859172821, + "learning_rate": 0.0001, + "loss": 1.7884, + "step": 437 + }, + { + "epoch": 0.21372804163955758, + "grad_norm": 0.18847782909870148, + "learning_rate": 0.0001, + "loss": 1.6859, + "step": 438 + }, + { + "epoch": 0.2142160052049447, + "grad_norm": 0.17663119733333588, + "learning_rate": 0.0001, + "loss": 1.615, + "step": 439 + }, + { + "epoch": 0.21470396877033182, + "grad_norm": 0.18704909086227417, + "learning_rate": 0.0001, + "loss": 1.7352, + "step": 440 + }, + { + "epoch": 0.21519193233571893, + "grad_norm": 0.19525641202926636, + "learning_rate": 0.0001, + "loss": 1.6241, + "step": 441 + }, + { + "epoch": 0.21567989590110606, + "grad_norm": 0.19030174612998962, + "learning_rate": 0.0001, + "loss": 1.7425, + "step": 442 + }, + { + "epoch": 0.21616785946649317, + "grad_norm": 0.18872150778770447, + "learning_rate": 0.0001, + "loss": 1.7177, + "step": 443 + }, + { + "epoch": 0.21665582303188027, + "grad_norm": 0.17374157905578613, + "learning_rate": 0.0001, + "loss": 1.7236, + "step": 444 + }, + { + "epoch": 0.2171437865972674, + "grad_norm": 0.18159011006355286, + "learning_rate": 0.0001, + "loss": 1.6885, + "step": 445 + }, + { + "epoch": 0.2176317501626545, + "grad_norm": 0.18726180493831635, + "learning_rate": 0.0001, + "loss": 1.8226, + "step": 446 + }, + { + "epoch": 0.21811971372804165, + "grad_norm": 0.193464457988739, + "learning_rate": 0.0001, + "loss": 1.7834, + "step": 447 + }, + { + "epoch": 0.21860767729342875, + "grad_norm": 0.19700440764427185, + "learning_rate": 0.0001, + "loss": 1.6766, + "step": 448 + }, + { + "epoch": 0.2190956408588159, + "grad_norm": 0.16808220744132996, + "learning_rate": 0.0001, + "loss": 1.6773, + "step": 449 + }, + { + "epoch": 0.219583604424203, + "grad_norm": 0.1885610967874527, + "learning_rate": 0.0001, + "loss": 1.7195, + "step": 450 + }, + { + "epoch": 0.2200715679895901, + "grad_norm": 0.17235183715820312, + "learning_rate": 0.0001, + "loss": 1.6651, + "step": 451 + }, + { + "epoch": 0.22055953155497723, + "grad_norm": 0.17667032778263092, + "learning_rate": 0.0001, + "loss": 1.647, + "step": 452 + }, + { + "epoch": 0.22104749512036434, + "grad_norm": 0.17659679055213928, + "learning_rate": 0.0001, + "loss": 1.8337, + "step": 453 + }, + { + "epoch": 0.22153545868575147, + "grad_norm": 0.17201969027519226, + "learning_rate": 0.0001, + "loss": 1.7385, + "step": 454 + }, + { + "epoch": 0.22202342225113858, + "grad_norm": 0.17937779426574707, + "learning_rate": 0.0001, + "loss": 1.7864, + "step": 455 + }, + { + "epoch": 0.2225113858165257, + "grad_norm": 0.1681385189294815, + "learning_rate": 0.0001, + "loss": 1.636, + "step": 456 + }, + { + "epoch": 0.22299934938191282, + "grad_norm": 0.17030152678489685, + "learning_rate": 0.0001, + "loss": 1.7613, + "step": 457 + }, + { + "epoch": 0.22348731294729993, + "grad_norm": 0.18430882692337036, + "learning_rate": 0.0001, + "loss": 1.7746, + "step": 458 + }, + { + "epoch": 0.22397527651268706, + "grad_norm": 0.17070208489894867, + "learning_rate": 0.0001, + "loss": 1.619, + "step": 459 + }, + { + "epoch": 0.22446324007807417, + "grad_norm": 0.1672583520412445, + "learning_rate": 0.0001, + "loss": 1.6935, + "step": 460 + }, + { + "epoch": 0.2249512036434613, + "grad_norm": 0.18070879578590393, + "learning_rate": 0.0001, + "loss": 1.7752, + "step": 461 + }, + { + "epoch": 0.2254391672088484, + "grad_norm": 0.17931310832500458, + "learning_rate": 0.0001, + "loss": 1.8331, + "step": 462 + }, + { + "epoch": 0.2259271307742355, + "grad_norm": 0.18687482178211212, + "learning_rate": 0.0001, + "loss": 1.7745, + "step": 463 + }, + { + "epoch": 0.22641509433962265, + "grad_norm": 0.18673428893089294, + "learning_rate": 0.0001, + "loss": 1.8001, + "step": 464 + }, + { + "epoch": 0.22690305790500975, + "grad_norm": 0.18758326768875122, + "learning_rate": 0.0001, + "loss": 1.8024, + "step": 465 + }, + { + "epoch": 0.2273910214703969, + "grad_norm": 0.17651711404323578, + "learning_rate": 0.0001, + "loss": 1.6348, + "step": 466 + }, + { + "epoch": 0.227878985035784, + "grad_norm": 0.17466424405574799, + "learning_rate": 0.0001, + "loss": 1.6529, + "step": 467 + }, + { + "epoch": 0.2283669486011711, + "grad_norm": 0.17049545049667358, + "learning_rate": 0.0001, + "loss": 1.6707, + "step": 468 + }, + { + "epoch": 0.22885491216655823, + "grad_norm": 0.19238895177841187, + "learning_rate": 0.0001, + "loss": 1.7262, + "step": 469 + }, + { + "epoch": 0.22934287573194534, + "grad_norm": 0.183549702167511, + "learning_rate": 0.0001, + "loss": 1.6949, + "step": 470 + }, + { + "epoch": 0.22983083929733247, + "grad_norm": 0.19222155213356018, + "learning_rate": 0.0001, + "loss": 1.7727, + "step": 471 + }, + { + "epoch": 0.23031880286271958, + "grad_norm": 0.18078762292861938, + "learning_rate": 0.0001, + "loss": 1.8166, + "step": 472 + }, + { + "epoch": 0.2308067664281067, + "grad_norm": 0.17769628763198853, + "learning_rate": 0.0001, + "loss": 1.7215, + "step": 473 + }, + { + "epoch": 0.23129472999349382, + "grad_norm": 0.1750006526708603, + "learning_rate": 0.0001, + "loss": 1.7311, + "step": 474 + }, + { + "epoch": 0.23178269355888093, + "grad_norm": 0.1803676038980484, + "learning_rate": 0.0001, + "loss": 1.7596, + "step": 475 + }, + { + "epoch": 0.23227065712426806, + "grad_norm": 0.18478356301784515, + "learning_rate": 0.0001, + "loss": 1.7262, + "step": 476 + }, + { + "epoch": 0.23275862068965517, + "grad_norm": 0.16509763896465302, + "learning_rate": 0.0001, + "loss": 1.623, + "step": 477 + }, + { + "epoch": 0.2332465842550423, + "grad_norm": 0.19317001104354858, + "learning_rate": 0.0001, + "loss": 1.6284, + "step": 478 + }, + { + "epoch": 0.2337345478204294, + "grad_norm": 0.18081186711788177, + "learning_rate": 0.0001, + "loss": 1.6959, + "step": 479 + }, + { + "epoch": 0.2342225113858165, + "grad_norm": 0.18306545913219452, + "learning_rate": 0.0001, + "loss": 1.7328, + "step": 480 + }, + { + "epoch": 0.23471047495120365, + "grad_norm": 0.18552261590957642, + "learning_rate": 0.0001, + "loss": 1.6847, + "step": 481 + }, + { + "epoch": 0.23519843851659075, + "grad_norm": 0.17930322885513306, + "learning_rate": 0.0001, + "loss": 1.7678, + "step": 482 + }, + { + "epoch": 0.23568640208197789, + "grad_norm": 0.17558367550373077, + "learning_rate": 0.0001, + "loss": 1.6756, + "step": 483 + }, + { + "epoch": 0.236174365647365, + "grad_norm": 0.18899041414260864, + "learning_rate": 0.0001, + "loss": 1.7778, + "step": 484 + }, + { + "epoch": 0.23666232921275213, + "grad_norm": 0.17528998851776123, + "learning_rate": 0.0001, + "loss": 1.6651, + "step": 485 + }, + { + "epoch": 0.23715029277813923, + "grad_norm": 0.16732053458690643, + "learning_rate": 0.0001, + "loss": 1.6796, + "step": 486 + }, + { + "epoch": 0.23763825634352634, + "grad_norm": 0.1849820613861084, + "learning_rate": 0.0001, + "loss": 1.737, + "step": 487 + }, + { + "epoch": 0.23812621990891347, + "grad_norm": 0.1789163500070572, + "learning_rate": 0.0001, + "loss": 1.6919, + "step": 488 + }, + { + "epoch": 0.23861418347430058, + "grad_norm": 0.1739804446697235, + "learning_rate": 0.0001, + "loss": 1.8225, + "step": 489 + }, + { + "epoch": 0.2391021470396877, + "grad_norm": 0.18246984481811523, + "learning_rate": 0.0001, + "loss": 1.734, + "step": 490 + }, + { + "epoch": 0.23959011060507482, + "grad_norm": 0.17464157938957214, + "learning_rate": 0.0001, + "loss": 1.7442, + "step": 491 + }, + { + "epoch": 0.24007807417046195, + "grad_norm": 0.19501306116580963, + "learning_rate": 0.0001, + "loss": 1.7521, + "step": 492 + }, + { + "epoch": 0.24056603773584906, + "grad_norm": 0.17958857119083405, + "learning_rate": 0.0001, + "loss": 1.8191, + "step": 493 + }, + { + "epoch": 0.24105400130123616, + "grad_norm": 0.18241986632347107, + "learning_rate": 0.0001, + "loss": 1.7709, + "step": 494 + }, + { + "epoch": 0.2415419648666233, + "grad_norm": 0.18529468774795532, + "learning_rate": 0.0001, + "loss": 1.6871, + "step": 495 + }, + { + "epoch": 0.2420299284320104, + "grad_norm": 0.18519562482833862, + "learning_rate": 0.0001, + "loss": 1.7605, + "step": 496 + }, + { + "epoch": 0.24251789199739754, + "grad_norm": 0.17868764698505402, + "learning_rate": 0.0001, + "loss": 1.725, + "step": 497 + }, + { + "epoch": 0.24300585556278465, + "grad_norm": 0.17040537297725677, + "learning_rate": 0.0001, + "loss": 1.6161, + "step": 498 + }, + { + "epoch": 0.24349381912817175, + "grad_norm": 0.1820056289434433, + "learning_rate": 0.0001, + "loss": 1.7249, + "step": 499 + }, + { + "epoch": 0.24398178269355889, + "grad_norm": 0.1877366453409195, + "learning_rate": 0.0001, + "loss": 1.6976, + "step": 500 + }, + { + "epoch": 0.244469746258946, + "grad_norm": 0.1717415153980255, + "learning_rate": 0.0001, + "loss": 1.6109, + "step": 501 + }, + { + "epoch": 0.24495770982433313, + "grad_norm": 0.17338915169239044, + "learning_rate": 0.0001, + "loss": 1.7433, + "step": 502 + }, + { + "epoch": 0.24544567338972023, + "grad_norm": 0.18489517271518707, + "learning_rate": 0.0001, + "loss": 1.7283, + "step": 503 + }, + { + "epoch": 0.24593363695510737, + "grad_norm": 0.17153921723365784, + "learning_rate": 0.0001, + "loss": 1.7261, + "step": 504 + }, + { + "epoch": 0.24642160052049447, + "grad_norm": 0.19024662673473358, + "learning_rate": 0.0001, + "loss": 1.8498, + "step": 505 + }, + { + "epoch": 0.24690956408588158, + "grad_norm": 0.1675989329814911, + "learning_rate": 0.0001, + "loss": 1.5903, + "step": 506 + }, + { + "epoch": 0.2473975276512687, + "grad_norm": 0.18422546982765198, + "learning_rate": 0.0001, + "loss": 1.7294, + "step": 507 + }, + { + "epoch": 0.24788549121665582, + "grad_norm": 0.17943088710308075, + "learning_rate": 0.0001, + "loss": 1.6842, + "step": 508 + }, + { + "epoch": 0.24837345478204295, + "grad_norm": 0.18048308789730072, + "learning_rate": 0.0001, + "loss": 1.677, + "step": 509 + }, + { + "epoch": 0.24886141834743006, + "grad_norm": 0.17185211181640625, + "learning_rate": 0.0001, + "loss": 1.6738, + "step": 510 + }, + { + "epoch": 0.24934938191281716, + "grad_norm": 0.1717991977930069, + "learning_rate": 0.0001, + "loss": 1.7077, + "step": 511 + }, + { + "epoch": 0.2498373454782043, + "grad_norm": 0.18661388754844666, + "learning_rate": 0.0001, + "loss": 1.8163, + "step": 512 + }, + { + "epoch": 0.2503253090435914, + "grad_norm": 0.19672876596450806, + "learning_rate": 0.0001, + "loss": 1.7733, + "step": 513 + }, + { + "epoch": 0.2508132726089785, + "grad_norm": 0.18052315711975098, + "learning_rate": 0.0001, + "loss": 1.7242, + "step": 514 + }, + { + "epoch": 0.25130123617436567, + "grad_norm": 0.17241713404655457, + "learning_rate": 0.0001, + "loss": 1.6513, + "step": 515 + }, + { + "epoch": 0.2517891997397528, + "grad_norm": 0.1861806958913803, + "learning_rate": 0.0001, + "loss": 1.7189, + "step": 516 + }, + { + "epoch": 0.2522771633051399, + "grad_norm": 0.17267678678035736, + "learning_rate": 0.0001, + "loss": 1.5993, + "step": 517 + }, + { + "epoch": 0.252765126870527, + "grad_norm": 0.16948658227920532, + "learning_rate": 0.0001, + "loss": 1.5733, + "step": 518 + }, + { + "epoch": 0.2532530904359141, + "grad_norm": 0.18075625598430634, + "learning_rate": 0.0001, + "loss": 1.7755, + "step": 519 + }, + { + "epoch": 0.25374105400130126, + "grad_norm": 0.17203836143016815, + "learning_rate": 0.0001, + "loss": 1.6755, + "step": 520 + }, + { + "epoch": 0.25422901756668836, + "grad_norm": 0.1631672978401184, + "learning_rate": 0.0001, + "loss": 1.5949, + "step": 521 + }, + { + "epoch": 0.25471698113207547, + "grad_norm": 0.1776244342327118, + "learning_rate": 0.0001, + "loss": 1.7231, + "step": 522 + }, + { + "epoch": 0.2552049446974626, + "grad_norm": 0.18010790646076202, + "learning_rate": 0.0001, + "loss": 1.7575, + "step": 523 + }, + { + "epoch": 0.2556929082628497, + "grad_norm": 0.16827166080474854, + "learning_rate": 0.0001, + "loss": 1.6907, + "step": 524 + }, + { + "epoch": 0.25618087182823684, + "grad_norm": 0.19028151035308838, + "learning_rate": 0.0001, + "loss": 1.6602, + "step": 525 + }, + { + "epoch": 0.25666883539362395, + "grad_norm": 0.17831748723983765, + "learning_rate": 0.0001, + "loss": 1.7746, + "step": 526 + }, + { + "epoch": 0.25715679895901106, + "grad_norm": 0.19768738746643066, + "learning_rate": 0.0001, + "loss": 1.7111, + "step": 527 + }, + { + "epoch": 0.25764476252439816, + "grad_norm": 0.1869453638792038, + "learning_rate": 0.0001, + "loss": 1.7493, + "step": 528 + }, + { + "epoch": 0.25813272608978527, + "grad_norm": 0.17493435740470886, + "learning_rate": 0.0001, + "loss": 1.6401, + "step": 529 + }, + { + "epoch": 0.25862068965517243, + "grad_norm": 0.1741894632577896, + "learning_rate": 0.0001, + "loss": 1.6737, + "step": 530 + }, + { + "epoch": 0.25910865322055954, + "grad_norm": 0.19671699404716492, + "learning_rate": 0.0001, + "loss": 1.7265, + "step": 531 + }, + { + "epoch": 0.25959661678594664, + "grad_norm": 0.1766589730978012, + "learning_rate": 0.0001, + "loss": 1.6655, + "step": 532 + }, + { + "epoch": 0.26008458035133375, + "grad_norm": 0.17494948208332062, + "learning_rate": 0.0001, + "loss": 1.6571, + "step": 533 + }, + { + "epoch": 0.2605725439167209, + "grad_norm": 0.20303772389888763, + "learning_rate": 0.0001, + "loss": 1.7987, + "step": 534 + }, + { + "epoch": 0.261060507482108, + "grad_norm": 0.18097007274627686, + "learning_rate": 0.0001, + "loss": 1.6341, + "step": 535 + }, + { + "epoch": 0.2615484710474951, + "grad_norm": 0.20877449214458466, + "learning_rate": 0.0001, + "loss": 1.7057, + "step": 536 + }, + { + "epoch": 0.26203643461288223, + "grad_norm": 0.19047099351882935, + "learning_rate": 0.0001, + "loss": 1.7048, + "step": 537 + }, + { + "epoch": 0.26252439817826934, + "grad_norm": 0.18251296877861023, + "learning_rate": 0.0001, + "loss": 1.6979, + "step": 538 + }, + { + "epoch": 0.2630123617436565, + "grad_norm": 0.18078570067882538, + "learning_rate": 0.0001, + "loss": 1.801, + "step": 539 + }, + { + "epoch": 0.2635003253090436, + "grad_norm": 0.18725551664829254, + "learning_rate": 0.0001, + "loss": 1.7638, + "step": 540 + }, + { + "epoch": 0.2639882888744307, + "grad_norm": 0.20769141614437103, + "learning_rate": 0.0001, + "loss": 1.8201, + "step": 541 + }, + { + "epoch": 0.2644762524398178, + "grad_norm": 0.16759508848190308, + "learning_rate": 0.0001, + "loss": 1.6739, + "step": 542 + }, + { + "epoch": 0.2649642160052049, + "grad_norm": 0.20297077298164368, + "learning_rate": 0.0001, + "loss": 1.8241, + "step": 543 + }, + { + "epoch": 0.2654521795705921, + "grad_norm": 0.17038699984550476, + "learning_rate": 0.0001, + "loss": 1.6566, + "step": 544 + }, + { + "epoch": 0.2659401431359792, + "grad_norm": 0.17414064705371857, + "learning_rate": 0.0001, + "loss": 1.5866, + "step": 545 + }, + { + "epoch": 0.2664281067013663, + "grad_norm": 0.1856188178062439, + "learning_rate": 0.0001, + "loss": 1.7166, + "step": 546 + }, + { + "epoch": 0.2669160702667534, + "grad_norm": 0.17565833032131195, + "learning_rate": 0.0001, + "loss": 1.7206, + "step": 547 + }, + { + "epoch": 0.2674040338321405, + "grad_norm": 0.18267709016799927, + "learning_rate": 0.0001, + "loss": 1.6728, + "step": 548 + }, + { + "epoch": 0.26789199739752767, + "grad_norm": 0.18981780111789703, + "learning_rate": 0.0001, + "loss": 1.7425, + "step": 549 + }, + { + "epoch": 0.2683799609629148, + "grad_norm": 0.18254795670509338, + "learning_rate": 0.0001, + "loss": 1.6948, + "step": 550 + }, + { + "epoch": 0.2688679245283019, + "grad_norm": 0.18846552073955536, + "learning_rate": 0.0001, + "loss": 1.6572, + "step": 551 + }, + { + "epoch": 0.269355888093689, + "grad_norm": 0.1776316910982132, + "learning_rate": 0.0001, + "loss": 1.618, + "step": 552 + }, + { + "epoch": 0.2698438516590761, + "grad_norm": 0.1822226643562317, + "learning_rate": 0.0001, + "loss": 1.8876, + "step": 553 + }, + { + "epoch": 0.27033181522446326, + "grad_norm": 0.1873788982629776, + "learning_rate": 0.0001, + "loss": 1.7301, + "step": 554 + }, + { + "epoch": 0.27081977878985036, + "grad_norm": 0.19234952330589294, + "learning_rate": 0.0001, + "loss": 1.7235, + "step": 555 + }, + { + "epoch": 0.27130774235523747, + "grad_norm": 0.17642012238502502, + "learning_rate": 0.0001, + "loss": 1.7258, + "step": 556 + }, + { + "epoch": 0.2717957059206246, + "grad_norm": 0.21255896985530853, + "learning_rate": 0.0001, + "loss": 1.6937, + "step": 557 + }, + { + "epoch": 0.27228366948601174, + "grad_norm": 0.2181590497493744, + "learning_rate": 0.0001, + "loss": 1.9076, + "step": 558 + }, + { + "epoch": 0.27277163305139884, + "grad_norm": 0.16595962643623352, + "learning_rate": 0.0001, + "loss": 1.5664, + "step": 559 + }, + { + "epoch": 0.27325959661678595, + "grad_norm": 0.1832776963710785, + "learning_rate": 0.0001, + "loss": 1.658, + "step": 560 + }, + { + "epoch": 0.27374756018217306, + "grad_norm": 0.18969666957855225, + "learning_rate": 0.0001, + "loss": 1.8031, + "step": 561 + }, + { + "epoch": 0.27423552374756016, + "grad_norm": 0.1813500076532364, + "learning_rate": 0.0001, + "loss": 1.7209, + "step": 562 + }, + { + "epoch": 0.2747234873129473, + "grad_norm": 0.18055056035518646, + "learning_rate": 0.0001, + "loss": 1.7658, + "step": 563 + }, + { + "epoch": 0.27521145087833443, + "grad_norm": 0.17362233996391296, + "learning_rate": 0.0001, + "loss": 1.7746, + "step": 564 + }, + { + "epoch": 0.27569941444372154, + "grad_norm": 0.19305916130542755, + "learning_rate": 0.0001, + "loss": 1.9062, + "step": 565 + }, + { + "epoch": 0.27618737800910864, + "grad_norm": 0.17458635568618774, + "learning_rate": 0.0001, + "loss": 1.6339, + "step": 566 + }, + { + "epoch": 0.27667534157449575, + "grad_norm": 0.18760624527931213, + "learning_rate": 0.0001, + "loss": 1.6433, + "step": 567 + }, + { + "epoch": 0.2771633051398829, + "grad_norm": 0.17057117819786072, + "learning_rate": 0.0001, + "loss": 1.6318, + "step": 568 + }, + { + "epoch": 0.27765126870527, + "grad_norm": 0.17930074036121368, + "learning_rate": 0.0001, + "loss": 1.7227, + "step": 569 + }, + { + "epoch": 0.2781392322706571, + "grad_norm": 0.17012158036231995, + "learning_rate": 0.0001, + "loss": 1.6309, + "step": 570 + }, + { + "epoch": 0.27862719583604423, + "grad_norm": 0.17562495172023773, + "learning_rate": 0.0001, + "loss": 1.6351, + "step": 571 + }, + { + "epoch": 0.27911515940143133, + "grad_norm": 0.18494853377342224, + "learning_rate": 0.0001, + "loss": 1.8355, + "step": 572 + }, + { + "epoch": 0.2796031229668185, + "grad_norm": 0.18261797726154327, + "learning_rate": 0.0001, + "loss": 1.6015, + "step": 573 + }, + { + "epoch": 0.2800910865322056, + "grad_norm": 0.18148979544639587, + "learning_rate": 0.0001, + "loss": 1.797, + "step": 574 + }, + { + "epoch": 0.2805790500975927, + "grad_norm": 0.16941653192043304, + "learning_rate": 0.0001, + "loss": 1.6382, + "step": 575 + }, + { + "epoch": 0.2810670136629798, + "grad_norm": 0.18611697852611542, + "learning_rate": 0.0001, + "loss": 1.6595, + "step": 576 + }, + { + "epoch": 0.281554977228367, + "grad_norm": 0.16945675015449524, + "learning_rate": 0.0001, + "loss": 1.6678, + "step": 577 + }, + { + "epoch": 0.2820429407937541, + "grad_norm": 0.17999336123466492, + "learning_rate": 0.0001, + "loss": 1.7161, + "step": 578 + }, + { + "epoch": 0.2825309043591412, + "grad_norm": 0.185410276055336, + "learning_rate": 0.0001, + "loss": 1.6731, + "step": 579 + }, + { + "epoch": 0.2830188679245283, + "grad_norm": 0.1757509708404541, + "learning_rate": 0.0001, + "loss": 1.7162, + "step": 580 + }, + { + "epoch": 0.2835068314899154, + "grad_norm": 0.1721939593553543, + "learning_rate": 0.0001, + "loss": 1.6374, + "step": 581 + }, + { + "epoch": 0.28399479505530256, + "grad_norm": 0.17961697280406952, + "learning_rate": 0.0001, + "loss": 1.5798, + "step": 582 + }, + { + "epoch": 0.28448275862068967, + "grad_norm": 0.18612822890281677, + "learning_rate": 0.0001, + "loss": 1.7694, + "step": 583 + }, + { + "epoch": 0.2849707221860768, + "grad_norm": 0.18089883029460907, + "learning_rate": 0.0001, + "loss": 1.7426, + "step": 584 + }, + { + "epoch": 0.2854586857514639, + "grad_norm": 0.19402338564395905, + "learning_rate": 0.0001, + "loss": 1.7604, + "step": 585 + }, + { + "epoch": 0.285946649316851, + "grad_norm": 0.18208986520767212, + "learning_rate": 0.0001, + "loss": 1.6998, + "step": 586 + }, + { + "epoch": 0.28643461288223815, + "grad_norm": 0.19270221889019012, + "learning_rate": 0.0001, + "loss": 1.6564, + "step": 587 + }, + { + "epoch": 0.28692257644762525, + "grad_norm": 0.17604075372219086, + "learning_rate": 0.0001, + "loss": 1.653, + "step": 588 + }, + { + "epoch": 0.28741054001301236, + "grad_norm": 0.17964652180671692, + "learning_rate": 0.0001, + "loss": 1.7613, + "step": 589 + }, + { + "epoch": 0.28789850357839947, + "grad_norm": 0.18317797780036926, + "learning_rate": 0.0001, + "loss": 1.6621, + "step": 590 + }, + { + "epoch": 0.2883864671437866, + "grad_norm": 0.18271799385547638, + "learning_rate": 0.0001, + "loss": 1.8067, + "step": 591 + }, + { + "epoch": 0.28887443070917374, + "grad_norm": 0.19613641500473022, + "learning_rate": 0.0001, + "loss": 1.8544, + "step": 592 + }, + { + "epoch": 0.28936239427456084, + "grad_norm": 0.19165842235088348, + "learning_rate": 0.0001, + "loss": 1.8834, + "step": 593 + }, + { + "epoch": 0.28985035783994795, + "grad_norm": 0.18238607048988342, + "learning_rate": 0.0001, + "loss": 1.7776, + "step": 594 + }, + { + "epoch": 0.29033832140533505, + "grad_norm": 0.16585291922092438, + "learning_rate": 0.0001, + "loss": 1.5959, + "step": 595 + }, + { + "epoch": 0.29082628497072216, + "grad_norm": 0.1774480640888214, + "learning_rate": 0.0001, + "loss": 1.6114, + "step": 596 + }, + { + "epoch": 0.2913142485361093, + "grad_norm": 0.17970281839370728, + "learning_rate": 0.0001, + "loss": 1.79, + "step": 597 + }, + { + "epoch": 0.2918022121014964, + "grad_norm": 0.18806995451450348, + "learning_rate": 0.0001, + "loss": 1.7842, + "step": 598 + }, + { + "epoch": 0.29229017566688353, + "grad_norm": 0.16845998167991638, + "learning_rate": 0.0001, + "loss": 1.6788, + "step": 599 + }, + { + "epoch": 0.29277813923227064, + "grad_norm": 0.18506960570812225, + "learning_rate": 0.0001, + "loss": 1.758, + "step": 600 + }, + { + "epoch": 0.2932661027976578, + "grad_norm": 0.1771155744791031, + "learning_rate": 0.0001, + "loss": 1.7259, + "step": 601 + }, + { + "epoch": 0.2937540663630449, + "grad_norm": 0.1760523021221161, + "learning_rate": 0.0001, + "loss": 1.7807, + "step": 602 + }, + { + "epoch": 0.294242029928432, + "grad_norm": 0.1765487641096115, + "learning_rate": 0.0001, + "loss": 1.5886, + "step": 603 + }, + { + "epoch": 0.2947299934938191, + "grad_norm": 0.17646710574626923, + "learning_rate": 0.0001, + "loss": 1.6508, + "step": 604 + }, + { + "epoch": 0.2952179570592062, + "grad_norm": 0.18383362889289856, + "learning_rate": 0.0001, + "loss": 1.7049, + "step": 605 + }, + { + "epoch": 0.2957059206245934, + "grad_norm": 0.18808609247207642, + "learning_rate": 0.0001, + "loss": 1.6948, + "step": 606 + }, + { + "epoch": 0.2961938841899805, + "grad_norm": 0.18178711831569672, + "learning_rate": 0.0001, + "loss": 1.7306, + "step": 607 + }, + { + "epoch": 0.2966818477553676, + "grad_norm": 0.18499815464019775, + "learning_rate": 0.0001, + "loss": 1.6072, + "step": 608 + }, + { + "epoch": 0.2971698113207547, + "grad_norm": 0.18511821329593658, + "learning_rate": 0.0001, + "loss": 1.6383, + "step": 609 + }, + { + "epoch": 0.2976577748861418, + "grad_norm": 0.17731331288814545, + "learning_rate": 0.0001, + "loss": 1.738, + "step": 610 + }, + { + "epoch": 0.298145738451529, + "grad_norm": 0.19273065030574799, + "learning_rate": 0.0001, + "loss": 1.6286, + "step": 611 + }, + { + "epoch": 0.2986337020169161, + "grad_norm": 0.1858029067516327, + "learning_rate": 0.0001, + "loss": 1.6565, + "step": 612 + }, + { + "epoch": 0.2991216655823032, + "grad_norm": 0.18791264295578003, + "learning_rate": 0.0001, + "loss": 1.6857, + "step": 613 + }, + { + "epoch": 0.2996096291476903, + "grad_norm": 0.19478711485862732, + "learning_rate": 0.0001, + "loss": 1.6655, + "step": 614 + }, + { + "epoch": 0.3000975927130774, + "grad_norm": 0.18538743257522583, + "learning_rate": 0.0001, + "loss": 1.701, + "step": 615 + }, + { + "epoch": 0.30058555627846456, + "grad_norm": 0.1899065524339676, + "learning_rate": 0.0001, + "loss": 1.7014, + "step": 616 + }, + { + "epoch": 0.30107351984385167, + "grad_norm": 0.19550780951976776, + "learning_rate": 0.0001, + "loss": 1.8021, + "step": 617 + }, + { + "epoch": 0.3015614834092388, + "grad_norm": 0.1695028841495514, + "learning_rate": 0.0001, + "loss": 1.6423, + "step": 618 + }, + { + "epoch": 0.3020494469746259, + "grad_norm": 0.18605121970176697, + "learning_rate": 0.0001, + "loss": 1.7441, + "step": 619 + }, + { + "epoch": 0.302537410540013, + "grad_norm": 0.20526890456676483, + "learning_rate": 0.0001, + "loss": 1.7878, + "step": 620 + }, + { + "epoch": 0.30302537410540015, + "grad_norm": 0.17033647000789642, + "learning_rate": 0.0001, + "loss": 1.688, + "step": 621 + }, + { + "epoch": 0.30351333767078725, + "grad_norm": 0.1756584197282791, + "learning_rate": 0.0001, + "loss": 1.6914, + "step": 622 + }, + { + "epoch": 0.30400130123617436, + "grad_norm": 0.18451380729675293, + "learning_rate": 0.0001, + "loss": 1.6135, + "step": 623 + }, + { + "epoch": 0.30448926480156147, + "grad_norm": 0.17828862369060516, + "learning_rate": 0.0001, + "loss": 1.677, + "step": 624 + }, + { + "epoch": 0.3049772283669486, + "grad_norm": 0.17056816816329956, + "learning_rate": 0.0001, + "loss": 1.647, + "step": 625 + }, + { + "epoch": 0.30546519193233573, + "grad_norm": 0.1786261945962906, + "learning_rate": 0.0001, + "loss": 1.7212, + "step": 626 + }, + { + "epoch": 0.30595315549772284, + "grad_norm": 0.1788036823272705, + "learning_rate": 0.0001, + "loss": 1.6646, + "step": 627 + }, + { + "epoch": 0.30644111906310995, + "grad_norm": 0.17864547669887543, + "learning_rate": 0.0001, + "loss": 1.7123, + "step": 628 + }, + { + "epoch": 0.30692908262849705, + "grad_norm": 0.19462743401527405, + "learning_rate": 0.0001, + "loss": 1.7975, + "step": 629 + }, + { + "epoch": 0.3074170461938842, + "grad_norm": 0.17800424993038177, + "learning_rate": 0.0001, + "loss": 1.5499, + "step": 630 + }, + { + "epoch": 0.3079050097592713, + "grad_norm": 0.1856238692998886, + "learning_rate": 0.0001, + "loss": 1.9104, + "step": 631 + }, + { + "epoch": 0.3083929733246584, + "grad_norm": 0.17673279345035553, + "learning_rate": 0.0001, + "loss": 1.6382, + "step": 632 + }, + { + "epoch": 0.30888093689004553, + "grad_norm": 0.18032853305339813, + "learning_rate": 0.0001, + "loss": 1.7374, + "step": 633 + }, + { + "epoch": 0.30936890045543264, + "grad_norm": 0.17968174815177917, + "learning_rate": 0.0001, + "loss": 1.662, + "step": 634 + }, + { + "epoch": 0.3098568640208198, + "grad_norm": 0.1789749562740326, + "learning_rate": 0.0001, + "loss": 1.6044, + "step": 635 + }, + { + "epoch": 0.3103448275862069, + "grad_norm": 0.175074502825737, + "learning_rate": 0.0001, + "loss": 1.7047, + "step": 636 + }, + { + "epoch": 0.310832791151594, + "grad_norm": 0.17318876087665558, + "learning_rate": 0.0001, + "loss": 1.6148, + "step": 637 + }, + { + "epoch": 0.3113207547169811, + "grad_norm": 0.20739412307739258, + "learning_rate": 0.0001, + "loss": 1.9162, + "step": 638 + }, + { + "epoch": 0.3118087182823682, + "grad_norm": 0.1787186861038208, + "learning_rate": 0.0001, + "loss": 1.6657, + "step": 639 + }, + { + "epoch": 0.3122966818477554, + "grad_norm": 0.1855590045452118, + "learning_rate": 0.0001, + "loss": 1.7058, + "step": 640 + }, + { + "epoch": 0.3127846454131425, + "grad_norm": 0.17939618229866028, + "learning_rate": 0.0001, + "loss": 1.7663, + "step": 641 + }, + { + "epoch": 0.3132726089785296, + "grad_norm": 0.17440925538539886, + "learning_rate": 0.0001, + "loss": 1.6337, + "step": 642 + }, + { + "epoch": 0.3137605725439167, + "grad_norm": 0.19695165753364563, + "learning_rate": 0.0001, + "loss": 1.6048, + "step": 643 + }, + { + "epoch": 0.3142485361093038, + "grad_norm": 0.16877804696559906, + "learning_rate": 0.0001, + "loss": 1.6677, + "step": 644 + }, + { + "epoch": 0.314736499674691, + "grad_norm": 0.1742711365222931, + "learning_rate": 0.0001, + "loss": 1.6459, + "step": 645 + }, + { + "epoch": 0.3152244632400781, + "grad_norm": 0.18073154985904694, + "learning_rate": 0.0001, + "loss": 1.7392, + "step": 646 + }, + { + "epoch": 0.3157124268054652, + "grad_norm": 0.1714729368686676, + "learning_rate": 0.0001, + "loss": 1.6981, + "step": 647 + }, + { + "epoch": 0.3162003903708523, + "grad_norm": 0.17316888272762299, + "learning_rate": 0.0001, + "loss": 1.6746, + "step": 648 + }, + { + "epoch": 0.31668835393623945, + "grad_norm": 0.1779533475637436, + "learning_rate": 0.0001, + "loss": 1.7709, + "step": 649 + }, + { + "epoch": 0.31717631750162656, + "grad_norm": 0.1709679216146469, + "learning_rate": 0.0001, + "loss": 1.5822, + "step": 650 + }, + { + "epoch": 0.31766428106701367, + "grad_norm": 0.17804761230945587, + "learning_rate": 0.0001, + "loss": 1.7638, + "step": 651 + }, + { + "epoch": 0.31815224463240077, + "grad_norm": 0.18509989976882935, + "learning_rate": 0.0001, + "loss": 1.8712, + "step": 652 + }, + { + "epoch": 0.3186402081977879, + "grad_norm": 0.1751030832529068, + "learning_rate": 0.0001, + "loss": 1.7032, + "step": 653 + }, + { + "epoch": 0.31912817176317504, + "grad_norm": 0.17232050001621246, + "learning_rate": 0.0001, + "loss": 1.6331, + "step": 654 + }, + { + "epoch": 0.31961613532856215, + "grad_norm": 0.17198053002357483, + "learning_rate": 0.0001, + "loss": 1.7067, + "step": 655 + }, + { + "epoch": 0.32010409889394925, + "grad_norm": 0.1797952950000763, + "learning_rate": 0.0001, + "loss": 1.687, + "step": 656 + }, + { + "epoch": 0.32059206245933636, + "grad_norm": 0.1817045360803604, + "learning_rate": 0.0001, + "loss": 1.7448, + "step": 657 + }, + { + "epoch": 0.32108002602472346, + "grad_norm": 0.1710105687379837, + "learning_rate": 0.0001, + "loss": 1.6186, + "step": 658 + }, + { + "epoch": 0.3215679895901106, + "grad_norm": 0.19661752879619598, + "learning_rate": 0.0001, + "loss": 1.7867, + "step": 659 + }, + { + "epoch": 0.32205595315549773, + "grad_norm": 0.1723627746105194, + "learning_rate": 0.0001, + "loss": 1.5887, + "step": 660 + }, + { + "epoch": 0.32254391672088484, + "grad_norm": 0.21364371478557587, + "learning_rate": 0.0001, + "loss": 1.8418, + "step": 661 + }, + { + "epoch": 0.32303188028627194, + "grad_norm": 0.17605622112751007, + "learning_rate": 0.0001, + "loss": 1.6892, + "step": 662 + }, + { + "epoch": 0.32351984385165905, + "grad_norm": 0.17851850390434265, + "learning_rate": 0.0001, + "loss": 1.7639, + "step": 663 + }, + { + "epoch": 0.3240078074170462, + "grad_norm": 0.1816173940896988, + "learning_rate": 0.0001, + "loss": 1.6567, + "step": 664 + }, + { + "epoch": 0.3244957709824333, + "grad_norm": 0.17529702186584473, + "learning_rate": 0.0001, + "loss": 1.6945, + "step": 665 + }, + { + "epoch": 0.3249837345478204, + "grad_norm": 0.16997535526752472, + "learning_rate": 0.0001, + "loss": 1.6833, + "step": 666 + }, + { + "epoch": 0.32547169811320753, + "grad_norm": 0.18423834443092346, + "learning_rate": 0.0001, + "loss": 1.7486, + "step": 667 + }, + { + "epoch": 0.3259596616785947, + "grad_norm": 0.18737761676311493, + "learning_rate": 0.0001, + "loss": 1.7561, + "step": 668 + }, + { + "epoch": 0.3264476252439818, + "grad_norm": 0.17731069028377533, + "learning_rate": 0.0001, + "loss": 1.5679, + "step": 669 + }, + { + "epoch": 0.3269355888093689, + "grad_norm": 0.197565495967865, + "learning_rate": 0.0001, + "loss": 1.7457, + "step": 670 + }, + { + "epoch": 0.327423552374756, + "grad_norm": 0.19319871068000793, + "learning_rate": 0.0001, + "loss": 1.8458, + "step": 671 + }, + { + "epoch": 0.3279115159401431, + "grad_norm": 0.18049995601177216, + "learning_rate": 0.0001, + "loss": 1.7076, + "step": 672 + }, + { + "epoch": 0.3283994795055303, + "grad_norm": 0.18907921016216278, + "learning_rate": 0.0001, + "loss": 1.7031, + "step": 673 + }, + { + "epoch": 0.3288874430709174, + "grad_norm": 0.18252240121364594, + "learning_rate": 0.0001, + "loss": 1.6304, + "step": 674 + }, + { + "epoch": 0.3293754066363045, + "grad_norm": 0.1798553168773651, + "learning_rate": 0.0001, + "loss": 1.6504, + "step": 675 + }, + { + "epoch": 0.3298633702016916, + "grad_norm": 0.1712959110736847, + "learning_rate": 0.0001, + "loss": 1.6827, + "step": 676 + }, + { + "epoch": 0.3303513337670787, + "grad_norm": 0.169499009847641, + "learning_rate": 0.0001, + "loss": 1.67, + "step": 677 + }, + { + "epoch": 0.33083929733246586, + "grad_norm": 0.17921562492847443, + "learning_rate": 0.0001, + "loss": 1.6913, + "step": 678 + }, + { + "epoch": 0.33132726089785297, + "grad_norm": 0.16730189323425293, + "learning_rate": 0.0001, + "loss": 1.6585, + "step": 679 + }, + { + "epoch": 0.3318152244632401, + "grad_norm": 0.1731245219707489, + "learning_rate": 0.0001, + "loss": 1.6891, + "step": 680 + }, + { + "epoch": 0.3323031880286272, + "grad_norm": 0.18989908695220947, + "learning_rate": 0.0001, + "loss": 1.8335, + "step": 681 + }, + { + "epoch": 0.3327911515940143, + "grad_norm": 0.17079797387123108, + "learning_rate": 0.0001, + "loss": 1.6074, + "step": 682 + }, + { + "epoch": 0.33327911515940145, + "grad_norm": 0.1855732947587967, + "learning_rate": 0.0001, + "loss": 1.8051, + "step": 683 + }, + { + "epoch": 0.33376707872478856, + "grad_norm": 0.19362801313400269, + "learning_rate": 0.0001, + "loss": 1.7934, + "step": 684 + }, + { + "epoch": 0.33425504229017566, + "grad_norm": 0.18407447636127472, + "learning_rate": 0.0001, + "loss": 1.7676, + "step": 685 + }, + { + "epoch": 0.33474300585556277, + "grad_norm": 0.17326807975769043, + "learning_rate": 0.0001, + "loss": 1.6867, + "step": 686 + }, + { + "epoch": 0.3352309694209499, + "grad_norm": 0.18629767000675201, + "learning_rate": 0.0001, + "loss": 1.7577, + "step": 687 + }, + { + "epoch": 0.33571893298633704, + "grad_norm": 0.19202108681201935, + "learning_rate": 0.0001, + "loss": 1.7742, + "step": 688 + }, + { + "epoch": 0.33620689655172414, + "grad_norm": 0.1923230141401291, + "learning_rate": 0.0001, + "loss": 1.7646, + "step": 689 + }, + { + "epoch": 0.33669486011711125, + "grad_norm": 0.1855097860097885, + "learning_rate": 0.0001, + "loss": 1.7189, + "step": 690 + }, + { + "epoch": 0.33718282368249836, + "grad_norm": 0.17661595344543457, + "learning_rate": 0.0001, + "loss": 1.6404, + "step": 691 + }, + { + "epoch": 0.3376707872478855, + "grad_norm": 0.19284093379974365, + "learning_rate": 0.0001, + "loss": 1.7621, + "step": 692 + }, + { + "epoch": 0.3381587508132726, + "grad_norm": 0.18006063997745514, + "learning_rate": 0.0001, + "loss": 1.6163, + "step": 693 + }, + { + "epoch": 0.33864671437865973, + "grad_norm": 0.1881456822156906, + "learning_rate": 0.0001, + "loss": 1.732, + "step": 694 + }, + { + "epoch": 0.33913467794404684, + "grad_norm": 0.17196986079216003, + "learning_rate": 0.0001, + "loss": 1.7099, + "step": 695 + }, + { + "epoch": 0.33962264150943394, + "grad_norm": 0.186056986451149, + "learning_rate": 0.0001, + "loss": 1.8247, + "step": 696 + }, + { + "epoch": 0.3401106050748211, + "grad_norm": 0.18548524379730225, + "learning_rate": 0.0001, + "loss": 1.7185, + "step": 697 + }, + { + "epoch": 0.3405985686402082, + "grad_norm": 0.182390958070755, + "learning_rate": 0.0001, + "loss": 1.8278, + "step": 698 + }, + { + "epoch": 0.3410865322055953, + "grad_norm": 0.18355803191661835, + "learning_rate": 0.0001, + "loss": 1.6432, + "step": 699 + }, + { + "epoch": 0.3415744957709824, + "grad_norm": 0.176362544298172, + "learning_rate": 0.0001, + "loss": 1.71, + "step": 700 + }, + { + "epoch": 0.34206245933636953, + "grad_norm": 0.1753791868686676, + "learning_rate": 0.0001, + "loss": 1.7079, + "step": 701 + }, + { + "epoch": 0.3425504229017567, + "grad_norm": 0.17833958566188812, + "learning_rate": 0.0001, + "loss": 1.6155, + "step": 702 + }, + { + "epoch": 0.3430383864671438, + "grad_norm": 0.18626241385936737, + "learning_rate": 0.0001, + "loss": 1.8164, + "step": 703 + }, + { + "epoch": 0.3435263500325309, + "grad_norm": 0.18040528893470764, + "learning_rate": 0.0001, + "loss": 1.7061, + "step": 704 + }, + { + "epoch": 0.344014313597918, + "grad_norm": 0.18248948454856873, + "learning_rate": 0.0001, + "loss": 1.7002, + "step": 705 + }, + { + "epoch": 0.3445022771633051, + "grad_norm": 0.18155597150325775, + "learning_rate": 0.0001, + "loss": 1.7623, + "step": 706 + }, + { + "epoch": 0.3449902407286923, + "grad_norm": 0.18167854845523834, + "learning_rate": 0.0001, + "loss": 1.7209, + "step": 707 + }, + { + "epoch": 0.3454782042940794, + "grad_norm": 0.18228544294834137, + "learning_rate": 0.0001, + "loss": 1.7166, + "step": 708 + }, + { + "epoch": 0.3459661678594665, + "grad_norm": 0.1872456818819046, + "learning_rate": 0.0001, + "loss": 1.8073, + "step": 709 + }, + { + "epoch": 0.3464541314248536, + "grad_norm": 0.17062440514564514, + "learning_rate": 0.0001, + "loss": 1.653, + "step": 710 + }, + { + "epoch": 0.3469420949902407, + "grad_norm": 0.17459101974964142, + "learning_rate": 0.0001, + "loss": 1.6982, + "step": 711 + }, + { + "epoch": 0.34743005855562786, + "grad_norm": 0.1724562644958496, + "learning_rate": 0.0001, + "loss": 1.7638, + "step": 712 + }, + { + "epoch": 0.34791802212101497, + "grad_norm": 0.16791169345378876, + "learning_rate": 0.0001, + "loss": 1.5451, + "step": 713 + }, + { + "epoch": 0.3484059856864021, + "grad_norm": 0.17250396311283112, + "learning_rate": 0.0001, + "loss": 1.6266, + "step": 714 + }, + { + "epoch": 0.3488939492517892, + "grad_norm": 0.17893101274967194, + "learning_rate": 0.0001, + "loss": 1.7786, + "step": 715 + }, + { + "epoch": 0.34938191281717634, + "grad_norm": 0.1739955097436905, + "learning_rate": 0.0001, + "loss": 1.6286, + "step": 716 + }, + { + "epoch": 0.34986987638256345, + "grad_norm": 0.183289036154747, + "learning_rate": 0.0001, + "loss": 1.7026, + "step": 717 + }, + { + "epoch": 0.35035783994795056, + "grad_norm": 0.1769326776266098, + "learning_rate": 0.0001, + "loss": 1.7008, + "step": 718 + }, + { + "epoch": 0.35084580351333766, + "grad_norm": 0.1857866495847702, + "learning_rate": 0.0001, + "loss": 1.6844, + "step": 719 + }, + { + "epoch": 0.35133376707872477, + "grad_norm": 0.18651182949543, + "learning_rate": 0.0001, + "loss": 1.7033, + "step": 720 + }, + { + "epoch": 0.35182173064411193, + "grad_norm": 0.18966244161128998, + "learning_rate": 0.0001, + "loss": 1.7673, + "step": 721 + }, + { + "epoch": 0.35230969420949904, + "grad_norm": 0.1810387372970581, + "learning_rate": 0.0001, + "loss": 1.7161, + "step": 722 + }, + { + "epoch": 0.35279765777488614, + "grad_norm": 0.17334793508052826, + "learning_rate": 0.0001, + "loss": 1.5957, + "step": 723 + }, + { + "epoch": 0.35328562134027325, + "grad_norm": 0.18044047057628632, + "learning_rate": 0.0001, + "loss": 1.6443, + "step": 724 + }, + { + "epoch": 0.35377358490566035, + "grad_norm": 0.18923179805278778, + "learning_rate": 0.0001, + "loss": 1.7244, + "step": 725 + }, + { + "epoch": 0.3542615484710475, + "grad_norm": 0.18003158271312714, + "learning_rate": 0.0001, + "loss": 1.7655, + "step": 726 + }, + { + "epoch": 0.3547495120364346, + "grad_norm": 0.18161289393901825, + "learning_rate": 0.0001, + "loss": 1.7199, + "step": 727 + }, + { + "epoch": 0.35523747560182173, + "grad_norm": 0.19969268143177032, + "learning_rate": 0.0001, + "loss": 1.7138, + "step": 728 + }, + { + "epoch": 0.35572543916720883, + "grad_norm": 0.1782398670911789, + "learning_rate": 0.0001, + "loss": 1.6231, + "step": 729 + }, + { + "epoch": 0.35621340273259594, + "grad_norm": 0.20619311928749084, + "learning_rate": 0.0001, + "loss": 1.7745, + "step": 730 + }, + { + "epoch": 0.3567013662979831, + "grad_norm": 0.1790829598903656, + "learning_rate": 0.0001, + "loss": 1.6251, + "step": 731 + }, + { + "epoch": 0.3571893298633702, + "grad_norm": 0.17978286743164062, + "learning_rate": 0.0001, + "loss": 1.6495, + "step": 732 + }, + { + "epoch": 0.3576772934287573, + "grad_norm": 0.20410868525505066, + "learning_rate": 0.0001, + "loss": 1.7264, + "step": 733 + }, + { + "epoch": 0.3581652569941444, + "grad_norm": 0.18116474151611328, + "learning_rate": 0.0001, + "loss": 1.7379, + "step": 734 + }, + { + "epoch": 0.3586532205595316, + "grad_norm": 0.20212259888648987, + "learning_rate": 0.0001, + "loss": 1.6964, + "step": 735 + }, + { + "epoch": 0.3591411841249187, + "grad_norm": 0.17794452607631683, + "learning_rate": 0.0001, + "loss": 1.6666, + "step": 736 + }, + { + "epoch": 0.3596291476903058, + "grad_norm": 0.17267604172229767, + "learning_rate": 0.0001, + "loss": 1.5783, + "step": 737 + }, + { + "epoch": 0.3601171112556929, + "grad_norm": 0.21285639703273773, + "learning_rate": 0.0001, + "loss": 1.7575, + "step": 738 + }, + { + "epoch": 0.36060507482108, + "grad_norm": 0.1822413057088852, + "learning_rate": 0.0001, + "loss": 1.7244, + "step": 739 + }, + { + "epoch": 0.36109303838646717, + "grad_norm": 0.1909700185060501, + "learning_rate": 0.0001, + "loss": 1.7614, + "step": 740 + }, + { + "epoch": 0.3615810019518543, + "grad_norm": 0.19396358728408813, + "learning_rate": 0.0001, + "loss": 1.701, + "step": 741 + }, + { + "epoch": 0.3620689655172414, + "grad_norm": 0.18860898911952972, + "learning_rate": 0.0001, + "loss": 1.7215, + "step": 742 + }, + { + "epoch": 0.3625569290826285, + "grad_norm": 0.1891864836215973, + "learning_rate": 0.0001, + "loss": 1.7127, + "step": 743 + }, + { + "epoch": 0.3630448926480156, + "grad_norm": 0.18963932991027832, + "learning_rate": 0.0001, + "loss": 1.6591, + "step": 744 + }, + { + "epoch": 0.36353285621340276, + "grad_norm": 0.17823189496994019, + "learning_rate": 0.0001, + "loss": 1.7356, + "step": 745 + }, + { + "epoch": 0.36402081977878986, + "grad_norm": 0.19020548462867737, + "learning_rate": 0.0001, + "loss": 1.7591, + "step": 746 + }, + { + "epoch": 0.36450878334417697, + "grad_norm": 0.1983988732099533, + "learning_rate": 0.0001, + "loss": 1.6688, + "step": 747 + }, + { + "epoch": 0.3649967469095641, + "grad_norm": 0.17455948889255524, + "learning_rate": 0.0001, + "loss": 1.6981, + "step": 748 + }, + { + "epoch": 0.3654847104749512, + "grad_norm": 0.19214113056659698, + "learning_rate": 0.0001, + "loss": 1.6858, + "step": 749 + }, + { + "epoch": 0.36597267404033834, + "grad_norm": 0.19815075397491455, + "learning_rate": 0.0001, + "loss": 1.7088, + "step": 750 + }, + { + "epoch": 0.36646063760572545, + "grad_norm": 0.18052172660827637, + "learning_rate": 0.0001, + "loss": 1.7046, + "step": 751 + }, + { + "epoch": 0.36694860117111255, + "grad_norm": 0.19308723509311676, + "learning_rate": 0.0001, + "loss": 1.7145, + "step": 752 + }, + { + "epoch": 0.36743656473649966, + "grad_norm": 0.20036271214485168, + "learning_rate": 0.0001, + "loss": 1.6666, + "step": 753 + }, + { + "epoch": 0.36792452830188677, + "grad_norm": 0.18619637191295624, + "learning_rate": 0.0001, + "loss": 1.7144, + "step": 754 + }, + { + "epoch": 0.36841249186727393, + "grad_norm": 0.19576376676559448, + "learning_rate": 0.0001, + "loss": 1.7653, + "step": 755 + }, + { + "epoch": 0.36890045543266103, + "grad_norm": 0.18974775075912476, + "learning_rate": 0.0001, + "loss": 1.7836, + "step": 756 + }, + { + "epoch": 0.36938841899804814, + "grad_norm": 0.17752085626125336, + "learning_rate": 0.0001, + "loss": 1.6496, + "step": 757 + }, + { + "epoch": 0.36987638256343525, + "grad_norm": 0.1844092309474945, + "learning_rate": 0.0001, + "loss": 1.6863, + "step": 758 + }, + { + "epoch": 0.3703643461288224, + "grad_norm": 0.18102730810642242, + "learning_rate": 0.0001, + "loss": 1.5805, + "step": 759 + }, + { + "epoch": 0.3708523096942095, + "grad_norm": 0.1773853898048401, + "learning_rate": 0.0001, + "loss": 1.7169, + "step": 760 + }, + { + "epoch": 0.3713402732595966, + "grad_norm": 0.17917506396770477, + "learning_rate": 0.0001, + "loss": 1.705, + "step": 761 + }, + { + "epoch": 0.3718282368249837, + "grad_norm": 0.1869056671857834, + "learning_rate": 0.0001, + "loss": 1.5653, + "step": 762 + }, + { + "epoch": 0.37231620039037083, + "grad_norm": 0.1744174063205719, + "learning_rate": 0.0001, + "loss": 1.7014, + "step": 763 + }, + { + "epoch": 0.372804163955758, + "grad_norm": 0.18072061240673065, + "learning_rate": 0.0001, + "loss": 1.6638, + "step": 764 + }, + { + "epoch": 0.3732921275211451, + "grad_norm": 0.17331485450267792, + "learning_rate": 0.0001, + "loss": 1.6642, + "step": 765 + }, + { + "epoch": 0.3737800910865322, + "grad_norm": 0.1780969500541687, + "learning_rate": 0.0001, + "loss": 1.6563, + "step": 766 + }, + { + "epoch": 0.3742680546519193, + "grad_norm": 0.1959829479455948, + "learning_rate": 0.0001, + "loss": 1.8421, + "step": 767 + }, + { + "epoch": 0.3747560182173064, + "grad_norm": 0.18532420694828033, + "learning_rate": 0.0001, + "loss": 1.7752, + "step": 768 + }, + { + "epoch": 0.3752439817826936, + "grad_norm": 0.1861323118209839, + "learning_rate": 0.0001, + "loss": 1.6672, + "step": 769 + }, + { + "epoch": 0.3757319453480807, + "grad_norm": 0.17399415373802185, + "learning_rate": 0.0001, + "loss": 1.506, + "step": 770 + }, + { + "epoch": 0.3762199089134678, + "grad_norm": 0.1861727237701416, + "learning_rate": 0.0001, + "loss": 1.7164, + "step": 771 + }, + { + "epoch": 0.3767078724788549, + "grad_norm": 0.17571841180324554, + "learning_rate": 0.0001, + "loss": 1.6256, + "step": 772 + }, + { + "epoch": 0.377195836044242, + "grad_norm": 0.1843421310186386, + "learning_rate": 0.0001, + "loss": 1.7273, + "step": 773 + }, + { + "epoch": 0.37768379960962917, + "grad_norm": 0.17336313426494598, + "learning_rate": 0.0001, + "loss": 1.628, + "step": 774 + }, + { + "epoch": 0.3781717631750163, + "grad_norm": 0.173604816198349, + "learning_rate": 0.0001, + "loss": 1.6492, + "step": 775 + }, + { + "epoch": 0.3786597267404034, + "grad_norm": 0.19042102992534637, + "learning_rate": 0.0001, + "loss": 1.7671, + "step": 776 + }, + { + "epoch": 0.3791476903057905, + "grad_norm": 0.19237715005874634, + "learning_rate": 0.0001, + "loss": 1.6948, + "step": 777 + }, + { + "epoch": 0.3796356538711776, + "grad_norm": 0.1934320628643036, + "learning_rate": 0.0001, + "loss": 1.7704, + "step": 778 + }, + { + "epoch": 0.38012361743656475, + "grad_norm": 0.18237414956092834, + "learning_rate": 0.0001, + "loss": 1.7163, + "step": 779 + }, + { + "epoch": 0.38061158100195186, + "grad_norm": 0.1750539243221283, + "learning_rate": 0.0001, + "loss": 1.675, + "step": 780 + }, + { + "epoch": 0.38109954456733897, + "grad_norm": 0.18425478041172028, + "learning_rate": 0.0001, + "loss": 1.803, + "step": 781 + }, + { + "epoch": 0.38158750813272607, + "grad_norm": 0.17386333644390106, + "learning_rate": 0.0001, + "loss": 1.5968, + "step": 782 + }, + { + "epoch": 0.38207547169811323, + "grad_norm": 0.1958070695400238, + "learning_rate": 0.0001, + "loss": 1.7117, + "step": 783 + }, + { + "epoch": 0.38256343526350034, + "grad_norm": 0.18313884735107422, + "learning_rate": 0.0001, + "loss": 1.7634, + "step": 784 + }, + { + "epoch": 0.38305139882888745, + "grad_norm": 0.1904529333114624, + "learning_rate": 0.0001, + "loss": 1.7944, + "step": 785 + }, + { + "epoch": 0.38353936239427455, + "grad_norm": 0.18762192130088806, + "learning_rate": 0.0001, + "loss": 1.6575, + "step": 786 + }, + { + "epoch": 0.38402732595966166, + "grad_norm": 0.1828492432832718, + "learning_rate": 0.0001, + "loss": 1.6451, + "step": 787 + }, + { + "epoch": 0.3845152895250488, + "grad_norm": 0.19027890264987946, + "learning_rate": 0.0001, + "loss": 1.7919, + "step": 788 + }, + { + "epoch": 0.3850032530904359, + "grad_norm": 0.17186413705348969, + "learning_rate": 0.0001, + "loss": 1.6794, + "step": 789 + }, + { + "epoch": 0.38549121665582303, + "grad_norm": 0.1878061145544052, + "learning_rate": 0.0001, + "loss": 1.6987, + "step": 790 + }, + { + "epoch": 0.38597918022121014, + "grad_norm": 0.18121576309204102, + "learning_rate": 0.0001, + "loss": 1.796, + "step": 791 + }, + { + "epoch": 0.38646714378659724, + "grad_norm": 0.19097453355789185, + "learning_rate": 0.0001, + "loss": 1.7155, + "step": 792 + }, + { + "epoch": 0.3869551073519844, + "grad_norm": 0.18126630783081055, + "learning_rate": 0.0001, + "loss": 1.7499, + "step": 793 + }, + { + "epoch": 0.3874430709173715, + "grad_norm": 0.1922173947095871, + "learning_rate": 0.0001, + "loss": 1.7447, + "step": 794 + }, + { + "epoch": 0.3879310344827586, + "grad_norm": 0.17474421858787537, + "learning_rate": 0.0001, + "loss": 1.6234, + "step": 795 + }, + { + "epoch": 0.3884189980481457, + "grad_norm": 0.19023337960243225, + "learning_rate": 0.0001, + "loss": 1.7285, + "step": 796 + }, + { + "epoch": 0.38890696161353283, + "grad_norm": 0.17856378853321075, + "learning_rate": 0.0001, + "loss": 1.598, + "step": 797 + }, + { + "epoch": 0.38939492517892, + "grad_norm": 0.17470918595790863, + "learning_rate": 0.0001, + "loss": 1.7021, + "step": 798 + }, + { + "epoch": 0.3898828887443071, + "grad_norm": 0.20127350091934204, + "learning_rate": 0.0001, + "loss": 1.6433, + "step": 799 + }, + { + "epoch": 0.3903708523096942, + "grad_norm": 0.17676322162151337, + "learning_rate": 0.0001, + "loss": 1.6967, + "step": 800 + }, + { + "epoch": 0.3908588158750813, + "grad_norm": 0.17519530653953552, + "learning_rate": 0.0001, + "loss": 1.7357, + "step": 801 + }, + { + "epoch": 0.3913467794404684, + "grad_norm": 0.19061584770679474, + "learning_rate": 0.0001, + "loss": 1.7182, + "step": 802 + }, + { + "epoch": 0.3918347430058556, + "grad_norm": 0.18246081471443176, + "learning_rate": 0.0001, + "loss": 1.7688, + "step": 803 + }, + { + "epoch": 0.3923227065712427, + "grad_norm": 0.20583999156951904, + "learning_rate": 0.0001, + "loss": 1.8205, + "step": 804 + }, + { + "epoch": 0.3928106701366298, + "grad_norm": 0.18392029404640198, + "learning_rate": 0.0001, + "loss": 1.7499, + "step": 805 + }, + { + "epoch": 0.3932986337020169, + "grad_norm": 0.18296070396900177, + "learning_rate": 0.0001, + "loss": 1.7422, + "step": 806 + }, + { + "epoch": 0.39378659726740406, + "grad_norm": 0.176628977060318, + "learning_rate": 0.0001, + "loss": 1.6818, + "step": 807 + }, + { + "epoch": 0.39427456083279117, + "grad_norm": 0.17783887684345245, + "learning_rate": 0.0001, + "loss": 1.6935, + "step": 808 + }, + { + "epoch": 0.39476252439817827, + "grad_norm": 0.18225261569023132, + "learning_rate": 0.0001, + "loss": 1.7117, + "step": 809 + }, + { + "epoch": 0.3952504879635654, + "grad_norm": 0.18413884937763214, + "learning_rate": 0.0001, + "loss": 1.6266, + "step": 810 + }, + { + "epoch": 0.3957384515289525, + "grad_norm": 0.18847863376140594, + "learning_rate": 0.0001, + "loss": 1.6942, + "step": 811 + }, + { + "epoch": 0.39622641509433965, + "grad_norm": 0.177464559674263, + "learning_rate": 0.0001, + "loss": 1.7731, + "step": 812 + }, + { + "epoch": 0.39671437865972675, + "grad_norm": 0.18517576158046722, + "learning_rate": 0.0001, + "loss": 1.709, + "step": 813 + }, + { + "epoch": 0.39720234222511386, + "grad_norm": 0.18677739799022675, + "learning_rate": 0.0001, + "loss": 1.709, + "step": 814 + }, + { + "epoch": 0.39769030579050096, + "grad_norm": 0.1786472350358963, + "learning_rate": 0.0001, + "loss": 1.6966, + "step": 815 + }, + { + "epoch": 0.39817826935588807, + "grad_norm": 0.18321356177330017, + "learning_rate": 0.0001, + "loss": 1.6611, + "step": 816 + }, + { + "epoch": 0.39866623292127523, + "grad_norm": 0.19883863627910614, + "learning_rate": 0.0001, + "loss": 1.7824, + "step": 817 + }, + { + "epoch": 0.39915419648666234, + "grad_norm": 0.18374767899513245, + "learning_rate": 0.0001, + "loss": 1.8102, + "step": 818 + }, + { + "epoch": 0.39964216005204944, + "grad_norm": 0.1768617182970047, + "learning_rate": 0.0001, + "loss": 1.6278, + "step": 819 + }, + { + "epoch": 0.40013012361743655, + "grad_norm": 0.17839239537715912, + "learning_rate": 0.0001, + "loss": 1.5887, + "step": 820 + }, + { + "epoch": 0.40061808718282366, + "grad_norm": 0.18420036137104034, + "learning_rate": 0.0001, + "loss": 1.7334, + "step": 821 + }, + { + "epoch": 0.4011060507482108, + "grad_norm": 0.18662692606449127, + "learning_rate": 0.0001, + "loss": 1.7035, + "step": 822 + }, + { + "epoch": 0.4015940143135979, + "grad_norm": 0.1809212863445282, + "learning_rate": 0.0001, + "loss": 1.6425, + "step": 823 + }, + { + "epoch": 0.40208197787898503, + "grad_norm": 0.18343691527843475, + "learning_rate": 0.0001, + "loss": 1.6915, + "step": 824 + }, + { + "epoch": 0.40256994144437214, + "grad_norm": 0.19546520709991455, + "learning_rate": 0.0001, + "loss": 1.5398, + "step": 825 + }, + { + "epoch": 0.4030579050097593, + "grad_norm": 0.18498557806015015, + "learning_rate": 0.0001, + "loss": 1.76, + "step": 826 + }, + { + "epoch": 0.4035458685751464, + "grad_norm": 0.1787293255329132, + "learning_rate": 0.0001, + "loss": 1.7072, + "step": 827 + }, + { + "epoch": 0.4040338321405335, + "grad_norm": 0.18626105785369873, + "learning_rate": 0.0001, + "loss": 1.6154, + "step": 828 + }, + { + "epoch": 0.4045217957059206, + "grad_norm": 0.18181754648685455, + "learning_rate": 0.0001, + "loss": 1.6343, + "step": 829 + }, + { + "epoch": 0.4050097592713077, + "grad_norm": 0.1738763153553009, + "learning_rate": 0.0001, + "loss": 1.6003, + "step": 830 + }, + { + "epoch": 0.4054977228366949, + "grad_norm": 0.19205868244171143, + "learning_rate": 0.0001, + "loss": 1.6516, + "step": 831 + }, + { + "epoch": 0.405985686402082, + "grad_norm": 0.17389516532421112, + "learning_rate": 0.0001, + "loss": 1.6675, + "step": 832 + }, + { + "epoch": 0.4064736499674691, + "grad_norm": 0.17901460826396942, + "learning_rate": 0.0001, + "loss": 1.7835, + "step": 833 + }, + { + "epoch": 0.4069616135328562, + "grad_norm": 0.16918572783470154, + "learning_rate": 0.0001, + "loss": 1.5688, + "step": 834 + }, + { + "epoch": 0.4074495770982433, + "grad_norm": 0.17327755689620972, + "learning_rate": 0.0001, + "loss": 1.612, + "step": 835 + }, + { + "epoch": 0.40793754066363047, + "grad_norm": 0.17260931432247162, + "learning_rate": 0.0001, + "loss": 1.5631, + "step": 836 + }, + { + "epoch": 0.4084255042290176, + "grad_norm": 0.18616695702075958, + "learning_rate": 0.0001, + "loss": 1.8026, + "step": 837 + }, + { + "epoch": 0.4089134677944047, + "grad_norm": 0.1833159476518631, + "learning_rate": 0.0001, + "loss": 1.7407, + "step": 838 + }, + { + "epoch": 0.4094014313597918, + "grad_norm": 0.17563556134700775, + "learning_rate": 0.0001, + "loss": 1.6497, + "step": 839 + }, + { + "epoch": 0.4098893949251789, + "grad_norm": 0.1728363335132599, + "learning_rate": 0.0001, + "loss": 1.7369, + "step": 840 + }, + { + "epoch": 0.41037735849056606, + "grad_norm": 0.16742554306983948, + "learning_rate": 0.0001, + "loss": 1.5323, + "step": 841 + }, + { + "epoch": 0.41086532205595316, + "grad_norm": 0.18149816989898682, + "learning_rate": 0.0001, + "loss": 1.6658, + "step": 842 + }, + { + "epoch": 0.41135328562134027, + "grad_norm": 0.1730806678533554, + "learning_rate": 0.0001, + "loss": 1.6736, + "step": 843 + }, + { + "epoch": 0.4118412491867274, + "grad_norm": 0.19350793957710266, + "learning_rate": 0.0001, + "loss": 1.7305, + "step": 844 + }, + { + "epoch": 0.4123292127521145, + "grad_norm": 0.17669609189033508, + "learning_rate": 0.0001, + "loss": 1.7208, + "step": 845 + }, + { + "epoch": 0.41281717631750164, + "grad_norm": 0.18896430730819702, + "learning_rate": 0.0001, + "loss": 1.7677, + "step": 846 + }, + { + "epoch": 0.41330513988288875, + "grad_norm": 0.18296490609645844, + "learning_rate": 0.0001, + "loss": 1.7551, + "step": 847 + }, + { + "epoch": 0.41379310344827586, + "grad_norm": 0.18311992287635803, + "learning_rate": 0.0001, + "loss": 1.6724, + "step": 848 + }, + { + "epoch": 0.41428106701366296, + "grad_norm": 0.1732887476682663, + "learning_rate": 0.0001, + "loss": 1.6779, + "step": 849 + }, + { + "epoch": 0.4147690305790501, + "grad_norm": 0.18442484736442566, + "learning_rate": 0.0001, + "loss": 1.6707, + "step": 850 + }, + { + "epoch": 0.41525699414443723, + "grad_norm": 0.18358947336673737, + "learning_rate": 0.0001, + "loss": 1.7059, + "step": 851 + }, + { + "epoch": 0.41574495770982434, + "grad_norm": 0.17849397659301758, + "learning_rate": 0.0001, + "loss": 1.6633, + "step": 852 + }, + { + "epoch": 0.41623292127521144, + "grad_norm": 0.17558790743350983, + "learning_rate": 0.0001, + "loss": 1.7351, + "step": 853 + }, + { + "epoch": 0.41672088484059855, + "grad_norm": 0.18554963171482086, + "learning_rate": 0.0001, + "loss": 1.722, + "step": 854 + }, + { + "epoch": 0.4172088484059857, + "grad_norm": 0.17529337108135223, + "learning_rate": 0.0001, + "loss": 1.7565, + "step": 855 + }, + { + "epoch": 0.4176968119713728, + "grad_norm": 0.1806408166885376, + "learning_rate": 0.0001, + "loss": 1.6164, + "step": 856 + }, + { + "epoch": 0.4181847755367599, + "grad_norm": 0.17640672624111176, + "learning_rate": 0.0001, + "loss": 1.6622, + "step": 857 + }, + { + "epoch": 0.41867273910214703, + "grad_norm": 0.18511973321437836, + "learning_rate": 0.0001, + "loss": 1.7708, + "step": 858 + }, + { + "epoch": 0.41916070266753414, + "grad_norm": 0.17402327060699463, + "learning_rate": 0.0001, + "loss": 1.5703, + "step": 859 + }, + { + "epoch": 0.4196486662329213, + "grad_norm": 0.1716722548007965, + "learning_rate": 0.0001, + "loss": 1.6326, + "step": 860 + }, + { + "epoch": 0.4201366297983084, + "grad_norm": 0.18517763912677765, + "learning_rate": 0.0001, + "loss": 1.638, + "step": 861 + }, + { + "epoch": 0.4206245933636955, + "grad_norm": 0.18149396777153015, + "learning_rate": 0.0001, + "loss": 1.772, + "step": 862 + }, + { + "epoch": 0.4211125569290826, + "grad_norm": 0.1842370480298996, + "learning_rate": 0.0001, + "loss": 1.7326, + "step": 863 + }, + { + "epoch": 0.4216005204944697, + "grad_norm": 0.1832754909992218, + "learning_rate": 0.0001, + "loss": 1.571, + "step": 864 + }, + { + "epoch": 0.4220884840598569, + "grad_norm": 0.18610063195228577, + "learning_rate": 0.0001, + "loss": 1.6853, + "step": 865 + }, + { + "epoch": 0.422576447625244, + "grad_norm": 0.18227741122245789, + "learning_rate": 0.0001, + "loss": 1.7299, + "step": 866 + }, + { + "epoch": 0.4230644111906311, + "grad_norm": 0.1710875779390335, + "learning_rate": 0.0001, + "loss": 1.6311, + "step": 867 + }, + { + "epoch": 0.4235523747560182, + "grad_norm": 0.1772422045469284, + "learning_rate": 0.0001, + "loss": 1.6997, + "step": 868 + }, + { + "epoch": 0.4240403383214053, + "grad_norm": 0.18706001341342926, + "learning_rate": 0.0001, + "loss": 1.7453, + "step": 869 + }, + { + "epoch": 0.42452830188679247, + "grad_norm": 0.18400168418884277, + "learning_rate": 0.0001, + "loss": 1.7748, + "step": 870 + }, + { + "epoch": 0.4250162654521796, + "grad_norm": 0.1813107579946518, + "learning_rate": 0.0001, + "loss": 1.6386, + "step": 871 + }, + { + "epoch": 0.4255042290175667, + "grad_norm": 0.18432138860225677, + "learning_rate": 0.0001, + "loss": 1.6548, + "step": 872 + }, + { + "epoch": 0.4259921925829538, + "grad_norm": 0.1701667755842209, + "learning_rate": 0.0001, + "loss": 1.7228, + "step": 873 + }, + { + "epoch": 0.42648015614834095, + "grad_norm": 0.17490911483764648, + "learning_rate": 0.0001, + "loss": 1.6574, + "step": 874 + }, + { + "epoch": 0.42696811971372806, + "grad_norm": 0.1863052397966385, + "learning_rate": 0.0001, + "loss": 1.6902, + "step": 875 + }, + { + "epoch": 0.42745608327911516, + "grad_norm": 0.17869678139686584, + "learning_rate": 0.0001, + "loss": 1.7961, + "step": 876 + }, + { + "epoch": 0.42794404684450227, + "grad_norm": 0.17393270134925842, + "learning_rate": 0.0001, + "loss": 1.6968, + "step": 877 + }, + { + "epoch": 0.4284320104098894, + "grad_norm": 0.1801164150238037, + "learning_rate": 0.0001, + "loss": 1.8419, + "step": 878 + }, + { + "epoch": 0.42891997397527654, + "grad_norm": 0.17271965742111206, + "learning_rate": 0.0001, + "loss": 1.6948, + "step": 879 + }, + { + "epoch": 0.42940793754066364, + "grad_norm": 0.18875744938850403, + "learning_rate": 0.0001, + "loss": 1.7529, + "step": 880 + }, + { + "epoch": 0.42989590110605075, + "grad_norm": 0.18350331485271454, + "learning_rate": 0.0001, + "loss": 1.7162, + "step": 881 + }, + { + "epoch": 0.43038386467143785, + "grad_norm": 0.18316605687141418, + "learning_rate": 0.0001, + "loss": 1.7071, + "step": 882 + }, + { + "epoch": 0.43087182823682496, + "grad_norm": 0.17159631848335266, + "learning_rate": 0.0001, + "loss": 1.5494, + "step": 883 + }, + { + "epoch": 0.4313597918022121, + "grad_norm": 0.1835523098707199, + "learning_rate": 0.0001, + "loss": 1.7773, + "step": 884 + }, + { + "epoch": 0.43184775536759923, + "grad_norm": 0.18305568397045135, + "learning_rate": 0.0001, + "loss": 1.6616, + "step": 885 + }, + { + "epoch": 0.43233571893298633, + "grad_norm": 0.18325333297252655, + "learning_rate": 0.0001, + "loss": 1.71, + "step": 886 + }, + { + "epoch": 0.43282368249837344, + "grad_norm": 0.16807565093040466, + "learning_rate": 0.0001, + "loss": 1.5946, + "step": 887 + }, + { + "epoch": 0.43331164606376055, + "grad_norm": 0.17560525238513947, + "learning_rate": 0.0001, + "loss": 1.573, + "step": 888 + }, + { + "epoch": 0.4337996096291477, + "grad_norm": 0.1823277622461319, + "learning_rate": 0.0001, + "loss": 1.616, + "step": 889 + }, + { + "epoch": 0.4342875731945348, + "grad_norm": 0.17946025729179382, + "learning_rate": 0.0001, + "loss": 1.5907, + "step": 890 + }, + { + "epoch": 0.4347755367599219, + "grad_norm": 0.18940189480781555, + "learning_rate": 0.0001, + "loss": 1.6697, + "step": 891 + }, + { + "epoch": 0.435263500325309, + "grad_norm": 0.17899388074874878, + "learning_rate": 0.0001, + "loss": 1.6849, + "step": 892 + }, + { + "epoch": 0.4357514638906962, + "grad_norm": 0.1885358840227127, + "learning_rate": 0.0001, + "loss": 1.6212, + "step": 893 + }, + { + "epoch": 0.4362394274560833, + "grad_norm": 0.1721390187740326, + "learning_rate": 0.0001, + "loss": 1.6514, + "step": 894 + }, + { + "epoch": 0.4367273910214704, + "grad_norm": 0.19019658863544464, + "learning_rate": 0.0001, + "loss": 1.7234, + "step": 895 + }, + { + "epoch": 0.4372153545868575, + "grad_norm": 0.17101971805095673, + "learning_rate": 0.0001, + "loss": 1.6003, + "step": 896 + }, + { + "epoch": 0.4377033181522446, + "grad_norm": 0.192877396941185, + "learning_rate": 0.0001, + "loss": 1.8151, + "step": 897 + }, + { + "epoch": 0.4381912817176318, + "grad_norm": 0.17775356769561768, + "learning_rate": 0.0001, + "loss": 1.5926, + "step": 898 + }, + { + "epoch": 0.4386792452830189, + "grad_norm": 0.19545124471187592, + "learning_rate": 0.0001, + "loss": 1.7123, + "step": 899 + }, + { + "epoch": 0.439167208848406, + "grad_norm": 0.17418169975280762, + "learning_rate": 0.0001, + "loss": 1.6774, + "step": 900 + }, + { + "epoch": 0.4396551724137931, + "grad_norm": 0.19206389784812927, + "learning_rate": 0.0001, + "loss": 1.7278, + "step": 901 + }, + { + "epoch": 0.4401431359791802, + "grad_norm": 0.18674510717391968, + "learning_rate": 0.0001, + "loss": 1.6049, + "step": 902 + }, + { + "epoch": 0.44063109954456736, + "grad_norm": 0.18307790160179138, + "learning_rate": 0.0001, + "loss": 1.6985, + "step": 903 + }, + { + "epoch": 0.44111906310995447, + "grad_norm": 0.1894843429327011, + "learning_rate": 0.0001, + "loss": 1.676, + "step": 904 + }, + { + "epoch": 0.4416070266753416, + "grad_norm": 0.17619220912456512, + "learning_rate": 0.0001, + "loss": 1.6807, + "step": 905 + }, + { + "epoch": 0.4420949902407287, + "grad_norm": 0.1805913895368576, + "learning_rate": 0.0001, + "loss": 1.6704, + "step": 906 + }, + { + "epoch": 0.4425829538061158, + "grad_norm": 0.17293816804885864, + "learning_rate": 0.0001, + "loss": 1.597, + "step": 907 + }, + { + "epoch": 0.44307091737150295, + "grad_norm": 0.17609193921089172, + "learning_rate": 0.0001, + "loss": 1.6562, + "step": 908 + }, + { + "epoch": 0.44355888093689005, + "grad_norm": 0.17432111501693726, + "learning_rate": 0.0001, + "loss": 1.594, + "step": 909 + }, + { + "epoch": 0.44404684450227716, + "grad_norm": 0.17889589071273804, + "learning_rate": 0.0001, + "loss": 1.8029, + "step": 910 + }, + { + "epoch": 0.44453480806766427, + "grad_norm": 0.17299845814704895, + "learning_rate": 0.0001, + "loss": 1.6116, + "step": 911 + }, + { + "epoch": 0.4450227716330514, + "grad_norm": 0.17839674651622772, + "learning_rate": 0.0001, + "loss": 1.7055, + "step": 912 + }, + { + "epoch": 0.44551073519843853, + "grad_norm": 0.1751437783241272, + "learning_rate": 0.0001, + "loss": 1.6218, + "step": 913 + }, + { + "epoch": 0.44599869876382564, + "grad_norm": 0.1901925653219223, + "learning_rate": 0.0001, + "loss": 1.6578, + "step": 914 + }, + { + "epoch": 0.44648666232921275, + "grad_norm": 0.17236626148223877, + "learning_rate": 0.0001, + "loss": 1.6951, + "step": 915 + }, + { + "epoch": 0.44697462589459985, + "grad_norm": 0.17387427389621735, + "learning_rate": 0.0001, + "loss": 1.5922, + "step": 916 + }, + { + "epoch": 0.447462589459987, + "grad_norm": 0.1684548258781433, + "learning_rate": 0.0001, + "loss": 1.5566, + "step": 917 + }, + { + "epoch": 0.4479505530253741, + "grad_norm": 0.18070632219314575, + "learning_rate": 0.0001, + "loss": 1.6904, + "step": 918 + }, + { + "epoch": 0.4484385165907612, + "grad_norm": 0.1905713975429535, + "learning_rate": 0.0001, + "loss": 1.8206, + "step": 919 + }, + { + "epoch": 0.44892648015614833, + "grad_norm": 0.1828422248363495, + "learning_rate": 0.0001, + "loss": 1.7974, + "step": 920 + }, + { + "epoch": 0.44941444372153544, + "grad_norm": 0.17595981061458588, + "learning_rate": 0.0001, + "loss": 1.7308, + "step": 921 + }, + { + "epoch": 0.4499024072869226, + "grad_norm": 0.18210361897945404, + "learning_rate": 0.0001, + "loss": 1.6915, + "step": 922 + }, + { + "epoch": 0.4503903708523097, + "grad_norm": 0.18826089799404144, + "learning_rate": 0.0001, + "loss": 1.7588, + "step": 923 + }, + { + "epoch": 0.4508783344176968, + "grad_norm": 0.17665328085422516, + "learning_rate": 0.0001, + "loss": 1.6797, + "step": 924 + }, + { + "epoch": 0.4513662979830839, + "grad_norm": 0.17838731408119202, + "learning_rate": 0.0001, + "loss": 1.6644, + "step": 925 + }, + { + "epoch": 0.451854261548471, + "grad_norm": 0.18045654892921448, + "learning_rate": 0.0001, + "loss": 1.689, + "step": 926 + }, + { + "epoch": 0.4523422251138582, + "grad_norm": 0.18226969242095947, + "learning_rate": 0.0001, + "loss": 1.8157, + "step": 927 + }, + { + "epoch": 0.4528301886792453, + "grad_norm": 0.17917855083942413, + "learning_rate": 0.0001, + "loss": 1.7772, + "step": 928 + }, + { + "epoch": 0.4533181522446324, + "grad_norm": 0.1778966784477234, + "learning_rate": 0.0001, + "loss": 1.6912, + "step": 929 + }, + { + "epoch": 0.4538061158100195, + "grad_norm": 0.18105091154575348, + "learning_rate": 0.0001, + "loss": 1.7072, + "step": 930 + }, + { + "epoch": 0.4542940793754066, + "grad_norm": 0.17502936720848083, + "learning_rate": 0.0001, + "loss": 1.6462, + "step": 931 + }, + { + "epoch": 0.4547820429407938, + "grad_norm": 0.1830134093761444, + "learning_rate": 0.0001, + "loss": 1.6876, + "step": 932 + }, + { + "epoch": 0.4552700065061809, + "grad_norm": 0.18607327342033386, + "learning_rate": 0.0001, + "loss": 1.7082, + "step": 933 + }, + { + "epoch": 0.455757970071568, + "grad_norm": 0.18888945877552032, + "learning_rate": 0.0001, + "loss": 1.7509, + "step": 934 + }, + { + "epoch": 0.4562459336369551, + "grad_norm": 0.1867811232805252, + "learning_rate": 0.0001, + "loss": 1.7233, + "step": 935 + }, + { + "epoch": 0.4567338972023422, + "grad_norm": 0.1898915022611618, + "learning_rate": 0.0001, + "loss": 1.6237, + "step": 936 + }, + { + "epoch": 0.45722186076772936, + "grad_norm": 0.1797095388174057, + "learning_rate": 0.0001, + "loss": 1.7404, + "step": 937 + }, + { + "epoch": 0.45770982433311647, + "grad_norm": 0.17534306645393372, + "learning_rate": 0.0001, + "loss": 1.6726, + "step": 938 + }, + { + "epoch": 0.4581977878985036, + "grad_norm": 0.19073282182216644, + "learning_rate": 0.0001, + "loss": 1.8081, + "step": 939 + }, + { + "epoch": 0.4586857514638907, + "grad_norm": 0.1878473460674286, + "learning_rate": 0.0001, + "loss": 1.6855, + "step": 940 + }, + { + "epoch": 0.45917371502927784, + "grad_norm": 0.18376657366752625, + "learning_rate": 0.0001, + "loss": 1.6833, + "step": 941 + }, + { + "epoch": 0.45966167859466495, + "grad_norm": 0.18948735296726227, + "learning_rate": 0.0001, + "loss": 1.7525, + "step": 942 + }, + { + "epoch": 0.46014964216005205, + "grad_norm": 0.18738175928592682, + "learning_rate": 0.0001, + "loss": 1.752, + "step": 943 + }, + { + "epoch": 0.46063760572543916, + "grad_norm": 0.1765458881855011, + "learning_rate": 0.0001, + "loss": 1.696, + "step": 944 + }, + { + "epoch": 0.46112556929082626, + "grad_norm": 0.18650664389133453, + "learning_rate": 0.0001, + "loss": 1.7409, + "step": 945 + }, + { + "epoch": 0.4616135328562134, + "grad_norm": 0.1759469360113144, + "learning_rate": 0.0001, + "loss": 1.6119, + "step": 946 + }, + { + "epoch": 0.46210149642160053, + "grad_norm": 0.18343883752822876, + "learning_rate": 0.0001, + "loss": 1.7091, + "step": 947 + }, + { + "epoch": 0.46258945998698764, + "grad_norm": 0.1964959353208542, + "learning_rate": 0.0001, + "loss": 1.7388, + "step": 948 + }, + { + "epoch": 0.46307742355237475, + "grad_norm": 0.18265226483345032, + "learning_rate": 0.0001, + "loss": 1.7036, + "step": 949 + }, + { + "epoch": 0.46356538711776185, + "grad_norm": 0.18132254481315613, + "learning_rate": 0.0001, + "loss": 1.688, + "step": 950 + }, + { + "epoch": 0.464053350683149, + "grad_norm": 0.18742497265338898, + "learning_rate": 0.0001, + "loss": 1.6212, + "step": 951 + }, + { + "epoch": 0.4645413142485361, + "grad_norm": 0.1776818335056305, + "learning_rate": 0.0001, + "loss": 1.5739, + "step": 952 + }, + { + "epoch": 0.4650292778139232, + "grad_norm": 0.193990558385849, + "learning_rate": 0.0001, + "loss": 1.6852, + "step": 953 + }, + { + "epoch": 0.46551724137931033, + "grad_norm": 0.1853352040052414, + "learning_rate": 0.0001, + "loss": 1.6057, + "step": 954 + }, + { + "epoch": 0.46600520494469744, + "grad_norm": 0.2000368982553482, + "learning_rate": 0.0001, + "loss": 1.7329, + "step": 955 + }, + { + "epoch": 0.4664931685100846, + "grad_norm": 0.20909981429576874, + "learning_rate": 0.0001, + "loss": 1.687, + "step": 956 + }, + { + "epoch": 0.4669811320754717, + "grad_norm": 0.21065653860569, + "learning_rate": 0.0001, + "loss": 1.7239, + "step": 957 + }, + { + "epoch": 0.4674690956408588, + "grad_norm": 0.1819789707660675, + "learning_rate": 0.0001, + "loss": 1.7258, + "step": 958 + }, + { + "epoch": 0.4679570592062459, + "grad_norm": 0.20444951951503754, + "learning_rate": 0.0001, + "loss": 1.679, + "step": 959 + }, + { + "epoch": 0.468445022771633, + "grad_norm": 0.19722609221935272, + "learning_rate": 0.0001, + "loss": 1.6114, + "step": 960 + }, + { + "epoch": 0.4689329863370202, + "grad_norm": 0.18290160596370697, + "learning_rate": 0.0001, + "loss": 1.7676, + "step": 961 + }, + { + "epoch": 0.4694209499024073, + "grad_norm": 0.20910906791687012, + "learning_rate": 0.0001, + "loss": 1.6688, + "step": 962 + }, + { + "epoch": 0.4699089134677944, + "grad_norm": 0.2053229659795761, + "learning_rate": 0.0001, + "loss": 1.7208, + "step": 963 + }, + { + "epoch": 0.4703968770331815, + "grad_norm": 0.18317236006259918, + "learning_rate": 0.0001, + "loss": 1.6808, + "step": 964 + }, + { + "epoch": 0.47088484059856867, + "grad_norm": 0.20331262052059174, + "learning_rate": 0.0001, + "loss": 1.7621, + "step": 965 + }, + { + "epoch": 0.47137280416395577, + "grad_norm": 0.194210484623909, + "learning_rate": 0.0001, + "loss": 1.7045, + "step": 966 + }, + { + "epoch": 0.4718607677293429, + "grad_norm": 0.18274177610874176, + "learning_rate": 0.0001, + "loss": 1.7462, + "step": 967 + }, + { + "epoch": 0.47234873129473, + "grad_norm": 0.211595356464386, + "learning_rate": 0.0001, + "loss": 1.7322, + "step": 968 + }, + { + "epoch": 0.4728366948601171, + "grad_norm": 0.1885220855474472, + "learning_rate": 0.0001, + "loss": 1.6825, + "step": 969 + }, + { + "epoch": 0.47332465842550425, + "grad_norm": 0.17875580489635468, + "learning_rate": 0.0001, + "loss": 1.6192, + "step": 970 + }, + { + "epoch": 0.47381262199089136, + "grad_norm": 0.1805390864610672, + "learning_rate": 0.0001, + "loss": 1.6668, + "step": 971 + }, + { + "epoch": 0.47430058555627846, + "grad_norm": 0.19222760200500488, + "learning_rate": 0.0001, + "loss": 1.7478, + "step": 972 + }, + { + "epoch": 0.47478854912166557, + "grad_norm": 0.18637999892234802, + "learning_rate": 0.0001, + "loss": 1.7773, + "step": 973 + }, + { + "epoch": 0.4752765126870527, + "grad_norm": 0.18341195583343506, + "learning_rate": 0.0001, + "loss": 1.7021, + "step": 974 + }, + { + "epoch": 0.47576447625243984, + "grad_norm": 0.17885076999664307, + "learning_rate": 0.0001, + "loss": 1.6424, + "step": 975 + }, + { + "epoch": 0.47625243981782694, + "grad_norm": 0.1952183097600937, + "learning_rate": 0.0001, + "loss": 1.9142, + "step": 976 + }, + { + "epoch": 0.47674040338321405, + "grad_norm": 0.18243496119976044, + "learning_rate": 0.0001, + "loss": 1.6983, + "step": 977 + }, + { + "epoch": 0.47722836694860116, + "grad_norm": 0.18224705755710602, + "learning_rate": 0.0001, + "loss": 1.5847, + "step": 978 + }, + { + "epoch": 0.47771633051398826, + "grad_norm": 0.25170522928237915, + "learning_rate": 0.0001, + "loss": 1.9113, + "step": 979 + }, + { + "epoch": 0.4782042940793754, + "grad_norm": 0.18615500628948212, + "learning_rate": 0.0001, + "loss": 1.7893, + "step": 980 + }, + { + "epoch": 0.47869225764476253, + "grad_norm": 0.18177960813045502, + "learning_rate": 0.0001, + "loss": 1.753, + "step": 981 + }, + { + "epoch": 0.47918022121014964, + "grad_norm": 0.17566373944282532, + "learning_rate": 0.0001, + "loss": 1.749, + "step": 982 + }, + { + "epoch": 0.47966818477553674, + "grad_norm": 0.18363641202449799, + "learning_rate": 0.0001, + "loss": 1.7202, + "step": 983 + }, + { + "epoch": 0.4801561483409239, + "grad_norm": 0.18019676208496094, + "learning_rate": 0.0001, + "loss": 1.7756, + "step": 984 + }, + { + "epoch": 0.480644111906311, + "grad_norm": 0.18838275969028473, + "learning_rate": 0.0001, + "loss": 1.6533, + "step": 985 + }, + { + "epoch": 0.4811320754716981, + "grad_norm": 0.17840002477169037, + "learning_rate": 0.0001, + "loss": 1.6495, + "step": 986 + }, + { + "epoch": 0.4816200390370852, + "grad_norm": 0.18629398941993713, + "learning_rate": 0.0001, + "loss": 1.746, + "step": 987 + }, + { + "epoch": 0.48210800260247233, + "grad_norm": 0.19068728387355804, + "learning_rate": 0.0001, + "loss": 1.7956, + "step": 988 + }, + { + "epoch": 0.4825959661678595, + "grad_norm": 0.17752403020858765, + "learning_rate": 0.0001, + "loss": 1.6085, + "step": 989 + }, + { + "epoch": 0.4830839297332466, + "grad_norm": 0.17869940400123596, + "learning_rate": 0.0001, + "loss": 1.687, + "step": 990 + }, + { + "epoch": 0.4835718932986337, + "grad_norm": 0.19462576508522034, + "learning_rate": 0.0001, + "loss": 1.766, + "step": 991 + }, + { + "epoch": 0.4840598568640208, + "grad_norm": 0.17635509371757507, + "learning_rate": 0.0001, + "loss": 1.6512, + "step": 992 + }, + { + "epoch": 0.4845478204294079, + "grad_norm": 0.18457075953483582, + "learning_rate": 0.0001, + "loss": 1.6829, + "step": 993 + }, + { + "epoch": 0.4850357839947951, + "grad_norm": 0.19008415937423706, + "learning_rate": 0.0001, + "loss": 1.8335, + "step": 994 + }, + { + "epoch": 0.4855237475601822, + "grad_norm": 0.1748104840517044, + "learning_rate": 0.0001, + "loss": 1.6822, + "step": 995 + }, + { + "epoch": 0.4860117111255693, + "grad_norm": 0.18871375918388367, + "learning_rate": 0.0001, + "loss": 1.7749, + "step": 996 + }, + { + "epoch": 0.4864996746909564, + "grad_norm": 0.19204716384410858, + "learning_rate": 0.0001, + "loss": 1.7027, + "step": 997 + }, + { + "epoch": 0.4869876382563435, + "grad_norm": 0.17363031208515167, + "learning_rate": 0.0001, + "loss": 1.6329, + "step": 998 + }, + { + "epoch": 0.48747560182173066, + "grad_norm": 0.18046556413173676, + "learning_rate": 0.0001, + "loss": 1.6251, + "step": 999 + }, + { + "epoch": 0.48796356538711777, + "grad_norm": 0.18280474841594696, + "learning_rate": 0.0001, + "loss": 1.7468, + "step": 1000 + }, + { + "epoch": 0.4884515289525049, + "grad_norm": 0.1856307089328766, + "learning_rate": 0.0001, + "loss": 1.8059, + "step": 1001 + }, + { + "epoch": 0.488939492517892, + "grad_norm": 0.18734587728977203, + "learning_rate": 0.0001, + "loss": 1.7482, + "step": 1002 + }, + { + "epoch": 0.4894274560832791, + "grad_norm": 0.18201518058776855, + "learning_rate": 0.0001, + "loss": 1.6618, + "step": 1003 + }, + { + "epoch": 0.48991541964866625, + "grad_norm": 0.18317224085330963, + "learning_rate": 0.0001, + "loss": 1.6556, + "step": 1004 + }, + { + "epoch": 0.49040338321405336, + "grad_norm": 0.18233336508274078, + "learning_rate": 0.0001, + "loss": 1.7073, + "step": 1005 + }, + { + "epoch": 0.49089134677944046, + "grad_norm": 0.19454477727413177, + "learning_rate": 0.0001, + "loss": 1.5993, + "step": 1006 + }, + { + "epoch": 0.49137931034482757, + "grad_norm": 0.1874353140592575, + "learning_rate": 0.0001, + "loss": 1.6976, + "step": 1007 + }, + { + "epoch": 0.49186727391021473, + "grad_norm": 0.18378609418869019, + "learning_rate": 0.0001, + "loss": 1.7292, + "step": 1008 + }, + { + "epoch": 0.49235523747560184, + "grad_norm": 0.18301472067832947, + "learning_rate": 0.0001, + "loss": 1.6702, + "step": 1009 + }, + { + "epoch": 0.49284320104098894, + "grad_norm": 0.18581345677375793, + "learning_rate": 0.0001, + "loss": 1.769, + "step": 1010 + }, + { + "epoch": 0.49333116460637605, + "grad_norm": 0.18604816496372223, + "learning_rate": 0.0001, + "loss": 1.7022, + "step": 1011 + }, + { + "epoch": 0.49381912817176316, + "grad_norm": 0.1670636236667633, + "learning_rate": 0.0001, + "loss": 1.6245, + "step": 1012 + }, + { + "epoch": 0.4943070917371503, + "grad_norm": 0.18545298278331757, + "learning_rate": 0.0001, + "loss": 1.777, + "step": 1013 + }, + { + "epoch": 0.4947950553025374, + "grad_norm": 0.18108947575092316, + "learning_rate": 0.0001, + "loss": 1.7066, + "step": 1014 + }, + { + "epoch": 0.49528301886792453, + "grad_norm": 0.18042118847370148, + "learning_rate": 0.0001, + "loss": 1.6393, + "step": 1015 + }, + { + "epoch": 0.49577098243331164, + "grad_norm": 0.19193610548973083, + "learning_rate": 0.0001, + "loss": 1.8438, + "step": 1016 + }, + { + "epoch": 0.49625894599869874, + "grad_norm": 0.18542861938476562, + "learning_rate": 0.0001, + "loss": 1.8076, + "step": 1017 + }, + { + "epoch": 0.4967469095640859, + "grad_norm": 0.17646706104278564, + "learning_rate": 0.0001, + "loss": 1.4699, + "step": 1018 + }, + { + "epoch": 0.497234873129473, + "grad_norm": 0.18862095475196838, + "learning_rate": 0.0001, + "loss": 1.7165, + "step": 1019 + }, + { + "epoch": 0.4977228366948601, + "grad_norm": 0.18618489801883698, + "learning_rate": 0.0001, + "loss": 1.7683, + "step": 1020 + }, + { + "epoch": 0.4982108002602472, + "grad_norm": 0.18750105798244476, + "learning_rate": 0.0001, + "loss": 1.6681, + "step": 1021 + }, + { + "epoch": 0.49869876382563433, + "grad_norm": 0.1942930370569229, + "learning_rate": 0.0001, + "loss": 1.6555, + "step": 1022 + }, + { + "epoch": 0.4991867273910215, + "grad_norm": 0.18165245652198792, + "learning_rate": 0.0001, + "loss": 1.7059, + "step": 1023 + }, + { + "epoch": 0.4996746909564086, + "grad_norm": 0.18349111080169678, + "learning_rate": 0.0001, + "loss": 1.7165, + "step": 1024 + }, + { + "epoch": 0.5001626545217958, + "grad_norm": 0.17459173500537872, + "learning_rate": 0.0001, + "loss": 1.6784, + "step": 1025 + }, + { + "epoch": 0.5006506180871828, + "grad_norm": 0.19236469268798828, + "learning_rate": 0.0001, + "loss": 1.6727, + "step": 1026 + }, + { + "epoch": 0.50113858165257, + "grad_norm": 0.18120145797729492, + "learning_rate": 0.0001, + "loss": 1.7109, + "step": 1027 + }, + { + "epoch": 0.501626545217957, + "grad_norm": 0.18319325149059296, + "learning_rate": 0.0001, + "loss": 1.6353, + "step": 1028 + }, + { + "epoch": 0.5021145087833442, + "grad_norm": 0.1807912439107895, + "learning_rate": 0.0001, + "loss": 1.6866, + "step": 1029 + }, + { + "epoch": 0.5026024723487313, + "grad_norm": 0.1748090237379074, + "learning_rate": 0.0001, + "loss": 1.6196, + "step": 1030 + }, + { + "epoch": 0.5030904359141184, + "grad_norm": 0.1822468489408493, + "learning_rate": 0.0001, + "loss": 1.7539, + "step": 1031 + }, + { + "epoch": 0.5035783994795056, + "grad_norm": 0.18360479176044464, + "learning_rate": 0.0001, + "loss": 1.6853, + "step": 1032 + }, + { + "epoch": 0.5040663630448926, + "grad_norm": 0.18836341798305511, + "learning_rate": 0.0001, + "loss": 1.6796, + "step": 1033 + }, + { + "epoch": 0.5045543266102798, + "grad_norm": 0.18044047057628632, + "learning_rate": 0.0001, + "loss": 1.6929, + "step": 1034 + }, + { + "epoch": 0.5050422901756669, + "grad_norm": 0.18836145102977753, + "learning_rate": 0.0001, + "loss": 1.8204, + "step": 1035 + }, + { + "epoch": 0.505530253741054, + "grad_norm": 0.1829444319009781, + "learning_rate": 0.0001, + "loss": 1.7364, + "step": 1036 + }, + { + "epoch": 0.5060182173064411, + "grad_norm": 0.1847165822982788, + "learning_rate": 0.0001, + "loss": 1.6792, + "step": 1037 + }, + { + "epoch": 0.5065061808718282, + "grad_norm": 0.17972713708877563, + "learning_rate": 0.0001, + "loss": 1.5694, + "step": 1038 + }, + { + "epoch": 0.5069941444372154, + "grad_norm": 0.1910099983215332, + "learning_rate": 0.0001, + "loss": 1.6189, + "step": 1039 + }, + { + "epoch": 0.5074821080026025, + "grad_norm": 0.18901146948337555, + "learning_rate": 0.0001, + "loss": 1.7515, + "step": 1040 + }, + { + "epoch": 0.5079700715679896, + "grad_norm": 0.18210864067077637, + "learning_rate": 0.0001, + "loss": 1.729, + "step": 1041 + }, + { + "epoch": 0.5084580351333767, + "grad_norm": 0.18417298793792725, + "learning_rate": 0.0001, + "loss": 1.7392, + "step": 1042 + }, + { + "epoch": 0.5089459986987638, + "grad_norm": 0.18548882007598877, + "learning_rate": 0.0001, + "loss": 1.7452, + "step": 1043 + }, + { + "epoch": 0.5094339622641509, + "grad_norm": 0.17644409835338593, + "learning_rate": 0.0001, + "loss": 1.5658, + "step": 1044 + }, + { + "epoch": 0.5099219258295381, + "grad_norm": 0.18809697031974792, + "learning_rate": 0.0001, + "loss": 1.6806, + "step": 1045 + }, + { + "epoch": 0.5104098893949252, + "grad_norm": 0.18309113383293152, + "learning_rate": 0.0001, + "loss": 1.7068, + "step": 1046 + }, + { + "epoch": 0.5108978529603123, + "grad_norm": 0.1873452365398407, + "learning_rate": 0.0001, + "loss": 1.7401, + "step": 1047 + }, + { + "epoch": 0.5113858165256994, + "grad_norm": 0.18118296563625336, + "learning_rate": 0.0001, + "loss": 1.6853, + "step": 1048 + }, + { + "epoch": 0.5118737800910865, + "grad_norm": 0.19551081955432892, + "learning_rate": 0.0001, + "loss": 1.6851, + "step": 1049 + }, + { + "epoch": 0.5123617436564737, + "grad_norm": 0.19051168859004974, + "learning_rate": 0.0001, + "loss": 1.7153, + "step": 1050 + }, + { + "epoch": 0.5128497072218607, + "grad_norm": 0.1723107546567917, + "learning_rate": 0.0001, + "loss": 1.6446, + "step": 1051 + }, + { + "epoch": 0.5133376707872479, + "grad_norm": 0.18448057770729065, + "learning_rate": 0.0001, + "loss": 1.6798, + "step": 1052 + }, + { + "epoch": 0.513825634352635, + "grad_norm": 0.1888912320137024, + "learning_rate": 0.0001, + "loss": 1.7696, + "step": 1053 + }, + { + "epoch": 0.5143135979180221, + "grad_norm": 0.19481922686100006, + "learning_rate": 0.0001, + "loss": 1.6657, + "step": 1054 + }, + { + "epoch": 0.5148015614834093, + "grad_norm": 0.17614057660102844, + "learning_rate": 0.0001, + "loss": 1.6758, + "step": 1055 + }, + { + "epoch": 0.5152895250487963, + "grad_norm": 0.1752062737941742, + "learning_rate": 0.0001, + "loss": 1.644, + "step": 1056 + }, + { + "epoch": 0.5157774886141835, + "grad_norm": 0.1882951855659485, + "learning_rate": 0.0001, + "loss": 1.6644, + "step": 1057 + }, + { + "epoch": 0.5162654521795705, + "grad_norm": 0.20255088806152344, + "learning_rate": 0.0001, + "loss": 1.7119, + "step": 1058 + }, + { + "epoch": 0.5167534157449577, + "grad_norm": 0.181501105427742, + "learning_rate": 0.0001, + "loss": 1.662, + "step": 1059 + }, + { + "epoch": 0.5172413793103449, + "grad_norm": 0.1865651160478592, + "learning_rate": 0.0001, + "loss": 1.7279, + "step": 1060 + }, + { + "epoch": 0.5177293428757319, + "grad_norm": 0.1911836862564087, + "learning_rate": 0.0001, + "loss": 1.6795, + "step": 1061 + }, + { + "epoch": 0.5182173064411191, + "grad_norm": 0.18534213304519653, + "learning_rate": 0.0001, + "loss": 1.7126, + "step": 1062 + }, + { + "epoch": 0.5187052700065062, + "grad_norm": 0.1829744428396225, + "learning_rate": 0.0001, + "loss": 1.6598, + "step": 1063 + }, + { + "epoch": 0.5191932335718933, + "grad_norm": 0.17899416387081146, + "learning_rate": 0.0001, + "loss": 1.6293, + "step": 1064 + }, + { + "epoch": 0.5196811971372804, + "grad_norm": 0.17233431339263916, + "learning_rate": 0.0001, + "loss": 1.6195, + "step": 1065 + }, + { + "epoch": 0.5201691607026675, + "grad_norm": 0.1891251802444458, + "learning_rate": 0.0001, + "loss": 1.72, + "step": 1066 + }, + { + "epoch": 0.5206571242680547, + "grad_norm": 0.19288107752799988, + "learning_rate": 0.0001, + "loss": 1.8331, + "step": 1067 + }, + { + "epoch": 0.5211450878334418, + "grad_norm": 0.18534426391124725, + "learning_rate": 0.0001, + "loss": 1.6229, + "step": 1068 + }, + { + "epoch": 0.5216330513988289, + "grad_norm": 0.19013041257858276, + "learning_rate": 0.0001, + "loss": 1.7331, + "step": 1069 + }, + { + "epoch": 0.522121014964216, + "grad_norm": 0.18765857815742493, + "learning_rate": 0.0001, + "loss": 1.6951, + "step": 1070 + }, + { + "epoch": 0.5226089785296031, + "grad_norm": 0.17150448262691498, + "learning_rate": 0.0001, + "loss": 1.6581, + "step": 1071 + }, + { + "epoch": 0.5230969420949902, + "grad_norm": 0.20504555106163025, + "learning_rate": 0.0001, + "loss": 1.7247, + "step": 1072 + }, + { + "epoch": 0.5235849056603774, + "grad_norm": 0.17816084623336792, + "learning_rate": 0.0001, + "loss": 1.5654, + "step": 1073 + }, + { + "epoch": 0.5240728692257645, + "grad_norm": 0.1842648684978485, + "learning_rate": 0.0001, + "loss": 1.601, + "step": 1074 + }, + { + "epoch": 0.5245608327911516, + "grad_norm": 0.18370290100574493, + "learning_rate": 0.0001, + "loss": 1.6369, + "step": 1075 + }, + { + "epoch": 0.5250487963565387, + "grad_norm": 0.18270552158355713, + "learning_rate": 0.0001, + "loss": 1.6541, + "step": 1076 + }, + { + "epoch": 0.5255367599219258, + "grad_norm": 0.1808508038520813, + "learning_rate": 0.0001, + "loss": 1.6598, + "step": 1077 + }, + { + "epoch": 0.526024723487313, + "grad_norm": 0.17794300615787506, + "learning_rate": 0.0001, + "loss": 1.7294, + "step": 1078 + }, + { + "epoch": 0.5265126870527, + "grad_norm": 0.18382461369037628, + "learning_rate": 0.0001, + "loss": 1.6901, + "step": 1079 + }, + { + "epoch": 0.5270006506180872, + "grad_norm": 0.1806422621011734, + "learning_rate": 0.0001, + "loss": 1.6193, + "step": 1080 + }, + { + "epoch": 0.5274886141834743, + "grad_norm": 0.18108539283275604, + "learning_rate": 0.0001, + "loss": 1.6911, + "step": 1081 + }, + { + "epoch": 0.5279765777488614, + "grad_norm": 0.18681305646896362, + "learning_rate": 0.0001, + "loss": 1.726, + "step": 1082 + }, + { + "epoch": 0.5284645413142486, + "grad_norm": 0.18909889459609985, + "learning_rate": 0.0001, + "loss": 1.6857, + "step": 1083 + }, + { + "epoch": 0.5289525048796356, + "grad_norm": 0.18421509861946106, + "learning_rate": 0.0001, + "loss": 1.6564, + "step": 1084 + }, + { + "epoch": 0.5294404684450228, + "grad_norm": 0.18811306357383728, + "learning_rate": 0.0001, + "loss": 1.7817, + "step": 1085 + }, + { + "epoch": 0.5299284320104098, + "grad_norm": 0.17478449642658234, + "learning_rate": 0.0001, + "loss": 1.681, + "step": 1086 + }, + { + "epoch": 0.530416395575797, + "grad_norm": 0.1789132058620453, + "learning_rate": 0.0001, + "loss": 1.6906, + "step": 1087 + }, + { + "epoch": 0.5309043591411842, + "grad_norm": 0.18358959257602692, + "learning_rate": 0.0001, + "loss": 1.6347, + "step": 1088 + }, + { + "epoch": 0.5313923227065712, + "grad_norm": 0.18565410375595093, + "learning_rate": 0.0001, + "loss": 1.7078, + "step": 1089 + }, + { + "epoch": 0.5318802862719584, + "grad_norm": 0.19210746884346008, + "learning_rate": 0.0001, + "loss": 1.6195, + "step": 1090 + }, + { + "epoch": 0.5323682498373454, + "grad_norm": 0.18205370008945465, + "learning_rate": 0.0001, + "loss": 1.6541, + "step": 1091 + }, + { + "epoch": 0.5328562134027326, + "grad_norm": 0.19181987643241882, + "learning_rate": 0.0001, + "loss": 1.8033, + "step": 1092 + }, + { + "epoch": 0.5333441769681198, + "grad_norm": 0.20362940430641174, + "learning_rate": 0.0001, + "loss": 1.7497, + "step": 1093 + }, + { + "epoch": 0.5338321405335068, + "grad_norm": 0.1858234405517578, + "learning_rate": 0.0001, + "loss": 1.6342, + "step": 1094 + }, + { + "epoch": 0.534320104098894, + "grad_norm": 0.19925346970558167, + "learning_rate": 0.0001, + "loss": 1.686, + "step": 1095 + }, + { + "epoch": 0.534808067664281, + "grad_norm": 0.19114282727241516, + "learning_rate": 0.0001, + "loss": 1.7186, + "step": 1096 + }, + { + "epoch": 0.5352960312296682, + "grad_norm": 0.1771971732378006, + "learning_rate": 0.0001, + "loss": 1.776, + "step": 1097 + }, + { + "epoch": 0.5357839947950553, + "grad_norm": 0.18942809104919434, + "learning_rate": 0.0001, + "loss": 1.7179, + "step": 1098 + }, + { + "epoch": 0.5362719583604424, + "grad_norm": 0.1868084967136383, + "learning_rate": 0.0001, + "loss": 1.6454, + "step": 1099 + }, + { + "epoch": 0.5367599219258296, + "grad_norm": 0.18689820170402527, + "learning_rate": 0.0001, + "loss": 1.6196, + "step": 1100 + }, + { + "epoch": 0.5372478854912166, + "grad_norm": 0.1820572018623352, + "learning_rate": 0.0001, + "loss": 1.6673, + "step": 1101 + }, + { + "epoch": 0.5377358490566038, + "grad_norm": 0.17870689928531647, + "learning_rate": 0.0001, + "loss": 1.5968, + "step": 1102 + }, + { + "epoch": 0.5382238126219909, + "grad_norm": 0.18118569254875183, + "learning_rate": 0.0001, + "loss": 1.7227, + "step": 1103 + }, + { + "epoch": 0.538711776187378, + "grad_norm": 0.1880924552679062, + "learning_rate": 0.0001, + "loss": 1.6108, + "step": 1104 + }, + { + "epoch": 0.5391997397527651, + "grad_norm": 0.18598206341266632, + "learning_rate": 0.0001, + "loss": 1.6542, + "step": 1105 + }, + { + "epoch": 0.5396877033181522, + "grad_norm": 0.1872934103012085, + "learning_rate": 0.0001, + "loss": 1.737, + "step": 1106 + }, + { + "epoch": 0.5401756668835394, + "grad_norm": 0.1890784651041031, + "learning_rate": 0.0001, + "loss": 1.6661, + "step": 1107 + }, + { + "epoch": 0.5406636304489265, + "grad_norm": 0.18039381504058838, + "learning_rate": 0.0001, + "loss": 1.6276, + "step": 1108 + }, + { + "epoch": 0.5411515940143136, + "grad_norm": 0.18550348281860352, + "learning_rate": 0.0001, + "loss": 1.6828, + "step": 1109 + }, + { + "epoch": 0.5416395575797007, + "grad_norm": 0.17449964582920074, + "learning_rate": 0.0001, + "loss": 1.5034, + "step": 1110 + }, + { + "epoch": 0.5421275211450879, + "grad_norm": 0.18202394247055054, + "learning_rate": 0.0001, + "loss": 1.6561, + "step": 1111 + }, + { + "epoch": 0.5426154847104749, + "grad_norm": 0.19365155696868896, + "learning_rate": 0.0001, + "loss": 1.5, + "step": 1112 + }, + { + "epoch": 0.5431034482758621, + "grad_norm": 0.17744717001914978, + "learning_rate": 0.0001, + "loss": 1.5921, + "step": 1113 + }, + { + "epoch": 0.5435914118412492, + "grad_norm": 0.17965885996818542, + "learning_rate": 0.0001, + "loss": 1.6819, + "step": 1114 + }, + { + "epoch": 0.5440793754066363, + "grad_norm": 0.17675574123859406, + "learning_rate": 0.0001, + "loss": 1.6471, + "step": 1115 + }, + { + "epoch": 0.5445673389720235, + "grad_norm": 0.17376431822776794, + "learning_rate": 0.0001, + "loss": 1.7007, + "step": 1116 + }, + { + "epoch": 0.5450553025374105, + "grad_norm": 0.18188650906085968, + "learning_rate": 0.0001, + "loss": 1.774, + "step": 1117 + }, + { + "epoch": 0.5455432661027977, + "grad_norm": 0.17877081036567688, + "learning_rate": 0.0001, + "loss": 1.6535, + "step": 1118 + }, + { + "epoch": 0.5460312296681847, + "grad_norm": 0.17933769524097443, + "learning_rate": 0.0001, + "loss": 1.7362, + "step": 1119 + }, + { + "epoch": 0.5465191932335719, + "grad_norm": 0.1805192083120346, + "learning_rate": 0.0001, + "loss": 1.7321, + "step": 1120 + }, + { + "epoch": 0.5470071567989591, + "grad_norm": 0.17312046885490417, + "learning_rate": 0.0001, + "loss": 1.6415, + "step": 1121 + }, + { + "epoch": 0.5474951203643461, + "grad_norm": 0.18119437992572784, + "learning_rate": 0.0001, + "loss": 1.7104, + "step": 1122 + }, + { + "epoch": 0.5479830839297333, + "grad_norm": 0.182356595993042, + "learning_rate": 0.0001, + "loss": 1.6866, + "step": 1123 + }, + { + "epoch": 0.5484710474951203, + "grad_norm": 0.1846156120300293, + "learning_rate": 0.0001, + "loss": 1.6612, + "step": 1124 + }, + { + "epoch": 0.5489590110605075, + "grad_norm": 0.17960377037525177, + "learning_rate": 0.0001, + "loss": 1.6848, + "step": 1125 + }, + { + "epoch": 0.5494469746258946, + "grad_norm": 0.17133495211601257, + "learning_rate": 0.0001, + "loss": 1.5885, + "step": 1126 + }, + { + "epoch": 0.5499349381912817, + "grad_norm": 0.18075834214687347, + "learning_rate": 0.0001, + "loss": 1.7428, + "step": 1127 + }, + { + "epoch": 0.5504229017566689, + "grad_norm": 0.18319405615329742, + "learning_rate": 0.0001, + "loss": 1.5856, + "step": 1128 + }, + { + "epoch": 0.5509108653220559, + "grad_norm": 0.17644239962100983, + "learning_rate": 0.0001, + "loss": 1.6198, + "step": 1129 + }, + { + "epoch": 0.5513988288874431, + "grad_norm": 0.18394580483436584, + "learning_rate": 0.0001, + "loss": 1.6435, + "step": 1130 + }, + { + "epoch": 0.5518867924528302, + "grad_norm": 0.1763201355934143, + "learning_rate": 0.0001, + "loss": 1.6975, + "step": 1131 + }, + { + "epoch": 0.5523747560182173, + "grad_norm": 0.16742850840091705, + "learning_rate": 0.0001, + "loss": 1.5377, + "step": 1132 + }, + { + "epoch": 0.5528627195836044, + "grad_norm": 0.1892685890197754, + "learning_rate": 0.0001, + "loss": 1.6111, + "step": 1133 + }, + { + "epoch": 0.5533506831489915, + "grad_norm": 0.18346691131591797, + "learning_rate": 0.0001, + "loss": 1.6844, + "step": 1134 + }, + { + "epoch": 0.5538386467143787, + "grad_norm": 0.1796543449163437, + "learning_rate": 0.0001, + "loss": 1.7746, + "step": 1135 + }, + { + "epoch": 0.5543266102797658, + "grad_norm": 0.18673722445964813, + "learning_rate": 0.0001, + "loss": 1.6464, + "step": 1136 + }, + { + "epoch": 0.5548145738451529, + "grad_norm": 0.17763900756835938, + "learning_rate": 0.0001, + "loss": 1.6237, + "step": 1137 + }, + { + "epoch": 0.55530253741054, + "grad_norm": 0.17686204612255096, + "learning_rate": 0.0001, + "loss": 1.5131, + "step": 1138 + }, + { + "epoch": 0.5557905009759271, + "grad_norm": 0.18360872566699982, + "learning_rate": 0.0001, + "loss": 1.6699, + "step": 1139 + }, + { + "epoch": 0.5562784645413142, + "grad_norm": 0.1827259063720703, + "learning_rate": 0.0001, + "loss": 1.746, + "step": 1140 + }, + { + "epoch": 0.5567664281067014, + "grad_norm": 0.17962484061717987, + "learning_rate": 0.0001, + "loss": 1.6284, + "step": 1141 + }, + { + "epoch": 0.5572543916720885, + "grad_norm": 0.18114878237247467, + "learning_rate": 0.0001, + "loss": 1.6737, + "step": 1142 + }, + { + "epoch": 0.5577423552374756, + "grad_norm": 0.18968282639980316, + "learning_rate": 0.0001, + "loss": 1.7798, + "step": 1143 + }, + { + "epoch": 0.5582303188028627, + "grad_norm": 0.18505877256393433, + "learning_rate": 0.0001, + "loss": 1.708, + "step": 1144 + }, + { + "epoch": 0.5587182823682498, + "grad_norm": 0.1776040643453598, + "learning_rate": 0.0001, + "loss": 1.7424, + "step": 1145 + }, + { + "epoch": 0.559206245933637, + "grad_norm": 0.17982693016529083, + "learning_rate": 0.0001, + "loss": 1.6197, + "step": 1146 + }, + { + "epoch": 0.559694209499024, + "grad_norm": 0.19187504053115845, + "learning_rate": 0.0001, + "loss": 1.7451, + "step": 1147 + }, + { + "epoch": 0.5601821730644112, + "grad_norm": 0.17975229024887085, + "learning_rate": 0.0001, + "loss": 1.6236, + "step": 1148 + }, + { + "epoch": 0.5606701366297983, + "grad_norm": 0.18996664881706238, + "learning_rate": 0.0001, + "loss": 1.7377, + "step": 1149 + }, + { + "epoch": 0.5611581001951854, + "grad_norm": 0.18252383172512054, + "learning_rate": 0.0001, + "loss": 1.628, + "step": 1150 + }, + { + "epoch": 0.5616460637605726, + "grad_norm": 0.18448345363140106, + "learning_rate": 0.0001, + "loss": 1.7109, + "step": 1151 + }, + { + "epoch": 0.5621340273259596, + "grad_norm": 0.17741243541240692, + "learning_rate": 0.0001, + "loss": 1.6088, + "step": 1152 + }, + { + "epoch": 0.5626219908913468, + "grad_norm": 0.19825778901576996, + "learning_rate": 0.0001, + "loss": 1.5972, + "step": 1153 + }, + { + "epoch": 0.563109954456734, + "grad_norm": 0.18595324456691742, + "learning_rate": 0.0001, + "loss": 1.672, + "step": 1154 + }, + { + "epoch": 0.563597918022121, + "grad_norm": 0.18176652491092682, + "learning_rate": 0.0001, + "loss": 1.6216, + "step": 1155 + }, + { + "epoch": 0.5640858815875082, + "grad_norm": 0.1950223743915558, + "learning_rate": 0.0001, + "loss": 1.705, + "step": 1156 + }, + { + "epoch": 0.5645738451528952, + "grad_norm": 0.1990990787744522, + "learning_rate": 0.0001, + "loss": 1.7031, + "step": 1157 + }, + { + "epoch": 0.5650618087182824, + "grad_norm": 0.1937246173620224, + "learning_rate": 0.0001, + "loss": 1.6838, + "step": 1158 + }, + { + "epoch": 0.5655497722836695, + "grad_norm": 0.1884077787399292, + "learning_rate": 0.0001, + "loss": 1.5994, + "step": 1159 + }, + { + "epoch": 0.5660377358490566, + "grad_norm": 0.19293847680091858, + "learning_rate": 0.0001, + "loss": 1.7657, + "step": 1160 + }, + { + "epoch": 0.5665256994144438, + "grad_norm": 0.18362392485141754, + "learning_rate": 0.0001, + "loss": 1.7443, + "step": 1161 + }, + { + "epoch": 0.5670136629798308, + "grad_norm": 0.17800559103488922, + "learning_rate": 0.0001, + "loss": 1.6433, + "step": 1162 + }, + { + "epoch": 0.567501626545218, + "grad_norm": 0.1774267852306366, + "learning_rate": 0.0001, + "loss": 1.6468, + "step": 1163 + }, + { + "epoch": 0.5679895901106051, + "grad_norm": 0.18834517896175385, + "learning_rate": 0.0001, + "loss": 1.7715, + "step": 1164 + }, + { + "epoch": 0.5684775536759922, + "grad_norm": 0.1841384768486023, + "learning_rate": 0.0001, + "loss": 1.7604, + "step": 1165 + }, + { + "epoch": 0.5689655172413793, + "grad_norm": 0.18285635113716125, + "learning_rate": 0.0001, + "loss": 1.6941, + "step": 1166 + }, + { + "epoch": 0.5694534808067664, + "grad_norm": 0.1796160191297531, + "learning_rate": 0.0001, + "loss": 1.6835, + "step": 1167 + }, + { + "epoch": 0.5699414443721535, + "grad_norm": 0.18359331786632538, + "learning_rate": 0.0001, + "loss": 1.6658, + "step": 1168 + }, + { + "epoch": 0.5704294079375407, + "grad_norm": 0.17833665013313293, + "learning_rate": 0.0001, + "loss": 1.6455, + "step": 1169 + }, + { + "epoch": 0.5709173715029278, + "grad_norm": 0.17929013073444366, + "learning_rate": 0.0001, + "loss": 1.5912, + "step": 1170 + }, + { + "epoch": 0.5714053350683149, + "grad_norm": 0.18901382386684418, + "learning_rate": 0.0001, + "loss": 1.7305, + "step": 1171 + }, + { + "epoch": 0.571893298633702, + "grad_norm": 0.18040084838867188, + "learning_rate": 0.0001, + "loss": 1.6239, + "step": 1172 + }, + { + "epoch": 0.5723812621990891, + "grad_norm": 0.1832232028245926, + "learning_rate": 0.0001, + "loss": 1.6594, + "step": 1173 + }, + { + "epoch": 0.5728692257644763, + "grad_norm": 0.1900448203086853, + "learning_rate": 0.0001, + "loss": 1.7176, + "step": 1174 + }, + { + "epoch": 0.5733571893298633, + "grad_norm": 0.1859886199235916, + "learning_rate": 0.0001, + "loss": 1.6823, + "step": 1175 + }, + { + "epoch": 0.5738451528952505, + "grad_norm": 0.1816965788602829, + "learning_rate": 0.0001, + "loss": 1.6936, + "step": 1176 + }, + { + "epoch": 0.5743331164606376, + "grad_norm": 0.1927751749753952, + "learning_rate": 0.0001, + "loss": 1.7069, + "step": 1177 + }, + { + "epoch": 0.5748210800260247, + "grad_norm": 0.20290379226207733, + "learning_rate": 0.0001, + "loss": 1.7987, + "step": 1178 + }, + { + "epoch": 0.5753090435914119, + "grad_norm": 0.1756032556295395, + "learning_rate": 0.0001, + "loss": 1.656, + "step": 1179 + }, + { + "epoch": 0.5757970071567989, + "grad_norm": 0.19676676392555237, + "learning_rate": 0.0001, + "loss": 1.8415, + "step": 1180 + }, + { + "epoch": 0.5762849707221861, + "grad_norm": 0.18112622201442719, + "learning_rate": 0.0001, + "loss": 1.6081, + "step": 1181 + }, + { + "epoch": 0.5767729342875731, + "grad_norm": 0.20109887421131134, + "learning_rate": 0.0001, + "loss": 1.7772, + "step": 1182 + }, + { + "epoch": 0.5772608978529603, + "grad_norm": 0.191656693816185, + "learning_rate": 0.0001, + "loss": 1.6869, + "step": 1183 + }, + { + "epoch": 0.5777488614183475, + "grad_norm": 0.17886236310005188, + "learning_rate": 0.0001, + "loss": 1.5931, + "step": 1184 + }, + { + "epoch": 0.5782368249837345, + "grad_norm": 0.18148286640644073, + "learning_rate": 0.0001, + "loss": 1.6056, + "step": 1185 + }, + { + "epoch": 0.5787247885491217, + "grad_norm": 0.20596817135810852, + "learning_rate": 0.0001, + "loss": 1.6129, + "step": 1186 + }, + { + "epoch": 0.5792127521145087, + "grad_norm": 0.17900511622428894, + "learning_rate": 0.0001, + "loss": 1.6487, + "step": 1187 + }, + { + "epoch": 0.5797007156798959, + "grad_norm": 0.1893642693758011, + "learning_rate": 0.0001, + "loss": 1.7566, + "step": 1188 + }, + { + "epoch": 0.5801886792452831, + "grad_norm": 0.19354504346847534, + "learning_rate": 0.0001, + "loss": 1.6665, + "step": 1189 + }, + { + "epoch": 0.5806766428106701, + "grad_norm": 0.18692192435264587, + "learning_rate": 0.0001, + "loss": 1.7069, + "step": 1190 + }, + { + "epoch": 0.5811646063760573, + "grad_norm": 0.204212948679924, + "learning_rate": 0.0001, + "loss": 1.7943, + "step": 1191 + }, + { + "epoch": 0.5816525699414443, + "grad_norm": 0.18666908144950867, + "learning_rate": 0.0001, + "loss": 1.7031, + "step": 1192 + }, + { + "epoch": 0.5821405335068315, + "grad_norm": 0.1859620362520218, + "learning_rate": 0.0001, + "loss": 1.7443, + "step": 1193 + }, + { + "epoch": 0.5826284970722186, + "grad_norm": 0.1774389147758484, + "learning_rate": 0.0001, + "loss": 1.6697, + "step": 1194 + }, + { + "epoch": 0.5831164606376057, + "grad_norm": 0.17645440995693207, + "learning_rate": 0.0001, + "loss": 1.7566, + "step": 1195 + }, + { + "epoch": 0.5836044242029929, + "grad_norm": 0.17927305400371552, + "learning_rate": 0.0001, + "loss": 1.5341, + "step": 1196 + }, + { + "epoch": 0.5840923877683799, + "grad_norm": 0.19179411232471466, + "learning_rate": 0.0001, + "loss": 1.706, + "step": 1197 + }, + { + "epoch": 0.5845803513337671, + "grad_norm": 0.18921273946762085, + "learning_rate": 0.0001, + "loss": 1.6651, + "step": 1198 + }, + { + "epoch": 0.5850683148991542, + "grad_norm": 0.20988748967647552, + "learning_rate": 0.0001, + "loss": 1.8307, + "step": 1199 + }, + { + "epoch": 0.5855562784645413, + "grad_norm": 0.1767909973859787, + "learning_rate": 0.0001, + "loss": 1.7116, + "step": 1200 + }, + { + "epoch": 0.5860442420299284, + "grad_norm": 0.18889738619327545, + "learning_rate": 0.0001, + "loss": 1.6623, + "step": 1201 + }, + { + "epoch": 0.5865322055953156, + "grad_norm": 0.17658928036689758, + "learning_rate": 0.0001, + "loss": 1.6147, + "step": 1202 + }, + { + "epoch": 0.5870201691607027, + "grad_norm": 0.181167870759964, + "learning_rate": 0.0001, + "loss": 1.658, + "step": 1203 + }, + { + "epoch": 0.5875081327260898, + "grad_norm": 0.18597833812236786, + "learning_rate": 0.0001, + "loss": 1.6392, + "step": 1204 + }, + { + "epoch": 0.5879960962914769, + "grad_norm": 0.1838957518339157, + "learning_rate": 0.0001, + "loss": 1.699, + "step": 1205 + }, + { + "epoch": 0.588484059856864, + "grad_norm": 0.18274423480033875, + "learning_rate": 0.0001, + "loss": 1.7159, + "step": 1206 + }, + { + "epoch": 0.5889720234222512, + "grad_norm": 0.19154992699623108, + "learning_rate": 0.0001, + "loss": 1.7488, + "step": 1207 + }, + { + "epoch": 0.5894599869876382, + "grad_norm": 0.17971384525299072, + "learning_rate": 0.0001, + "loss": 1.5762, + "step": 1208 + }, + { + "epoch": 0.5899479505530254, + "grad_norm": 0.17908671498298645, + "learning_rate": 0.0001, + "loss": 1.6836, + "step": 1209 + }, + { + "epoch": 0.5904359141184125, + "grad_norm": 0.17960527539253235, + "learning_rate": 0.0001, + "loss": 1.6949, + "step": 1210 + }, + { + "epoch": 0.5909238776837996, + "grad_norm": 0.18325302004814148, + "learning_rate": 0.0001, + "loss": 1.636, + "step": 1211 + }, + { + "epoch": 0.5914118412491868, + "grad_norm": 0.18727539479732513, + "learning_rate": 0.0001, + "loss": 1.6365, + "step": 1212 + }, + { + "epoch": 0.5918998048145738, + "grad_norm": 0.1794605702161789, + "learning_rate": 0.0001, + "loss": 1.6403, + "step": 1213 + }, + { + "epoch": 0.592387768379961, + "grad_norm": 0.17613062262535095, + "learning_rate": 0.0001, + "loss": 1.5417, + "step": 1214 + }, + { + "epoch": 0.592875731945348, + "grad_norm": 0.1804569661617279, + "learning_rate": 0.0001, + "loss": 1.6316, + "step": 1215 + }, + { + "epoch": 0.5933636955107352, + "grad_norm": 0.1809697449207306, + "learning_rate": 0.0001, + "loss": 1.6587, + "step": 1216 + }, + { + "epoch": 0.5938516590761224, + "grad_norm": 0.19974054396152496, + "learning_rate": 0.0001, + "loss": 1.6771, + "step": 1217 + }, + { + "epoch": 0.5943396226415094, + "grad_norm": 0.1866472214460373, + "learning_rate": 0.0001, + "loss": 1.7422, + "step": 1218 + }, + { + "epoch": 0.5948275862068966, + "grad_norm": 0.20864078402519226, + "learning_rate": 0.0001, + "loss": 1.6236, + "step": 1219 + }, + { + "epoch": 0.5953155497722836, + "grad_norm": 0.18205468356609344, + "learning_rate": 0.0001, + "loss": 1.7434, + "step": 1220 + }, + { + "epoch": 0.5958035133376708, + "grad_norm": 0.19407768547534943, + "learning_rate": 0.0001, + "loss": 1.752, + "step": 1221 + }, + { + "epoch": 0.596291476903058, + "grad_norm": 0.1877565234899521, + "learning_rate": 0.0001, + "loss": 1.711, + "step": 1222 + }, + { + "epoch": 0.596779440468445, + "grad_norm": 0.18702515959739685, + "learning_rate": 0.0001, + "loss": 1.7298, + "step": 1223 + }, + { + "epoch": 0.5972674040338322, + "grad_norm": 0.17825458943843842, + "learning_rate": 0.0001, + "loss": 1.7, + "step": 1224 + }, + { + "epoch": 0.5977553675992192, + "grad_norm": 0.18612068891525269, + "learning_rate": 0.0001, + "loss": 1.7307, + "step": 1225 + }, + { + "epoch": 0.5982433311646064, + "grad_norm": 0.1892668455839157, + "learning_rate": 0.0001, + "loss": 1.7661, + "step": 1226 + }, + { + "epoch": 0.5987312947299935, + "grad_norm": 0.18714402616024017, + "learning_rate": 0.0001, + "loss": 1.7071, + "step": 1227 + }, + { + "epoch": 0.5992192582953806, + "grad_norm": 0.21308167278766632, + "learning_rate": 0.0001, + "loss": 1.6704, + "step": 1228 + }, + { + "epoch": 0.5997072218607677, + "grad_norm": 0.21097207069396973, + "learning_rate": 0.0001, + "loss": 1.8387, + "step": 1229 + }, + { + "epoch": 0.6001951854261548, + "grad_norm": 0.19734272360801697, + "learning_rate": 0.0001, + "loss": 1.6462, + "step": 1230 + }, + { + "epoch": 0.600683148991542, + "grad_norm": 0.17935802042484283, + "learning_rate": 0.0001, + "loss": 1.6129, + "step": 1231 + }, + { + "epoch": 0.6011711125569291, + "grad_norm": 0.1758161038160324, + "learning_rate": 0.0001, + "loss": 1.5895, + "step": 1232 + }, + { + "epoch": 0.6016590761223162, + "grad_norm": 0.19049344956874847, + "learning_rate": 0.0001, + "loss": 1.5845, + "step": 1233 + }, + { + "epoch": 0.6021470396877033, + "grad_norm": 0.19208753108978271, + "learning_rate": 0.0001, + "loss": 1.5719, + "step": 1234 + }, + { + "epoch": 0.6026350032530904, + "grad_norm": 0.18235936760902405, + "learning_rate": 0.0001, + "loss": 1.7095, + "step": 1235 + }, + { + "epoch": 0.6031229668184775, + "grad_norm": 0.19607332348823547, + "learning_rate": 0.0001, + "loss": 1.6312, + "step": 1236 + }, + { + "epoch": 0.6036109303838647, + "grad_norm": 0.18990549445152283, + "learning_rate": 0.0001, + "loss": 1.5929, + "step": 1237 + }, + { + "epoch": 0.6040988939492518, + "grad_norm": 0.1892758309841156, + "learning_rate": 0.0001, + "loss": 1.6401, + "step": 1238 + }, + { + "epoch": 0.6045868575146389, + "grad_norm": 0.19703449308872223, + "learning_rate": 0.0001, + "loss": 1.7751, + "step": 1239 + }, + { + "epoch": 0.605074821080026, + "grad_norm": 0.18029844760894775, + "learning_rate": 0.0001, + "loss": 1.5848, + "step": 1240 + }, + { + "epoch": 0.6055627846454131, + "grad_norm": 0.18993432819843292, + "learning_rate": 0.0001, + "loss": 1.6906, + "step": 1241 + }, + { + "epoch": 0.6060507482108003, + "grad_norm": 0.18150927126407623, + "learning_rate": 0.0001, + "loss": 1.6235, + "step": 1242 + }, + { + "epoch": 0.6065387117761873, + "grad_norm": 0.1799500286579132, + "learning_rate": 0.0001, + "loss": 1.6826, + "step": 1243 + }, + { + "epoch": 0.6070266753415745, + "grad_norm": 0.18196311593055725, + "learning_rate": 0.0001, + "loss": 1.5883, + "step": 1244 + }, + { + "epoch": 0.6075146389069617, + "grad_norm": 0.19016975164413452, + "learning_rate": 0.0001, + "loss": 1.685, + "step": 1245 + }, + { + "epoch": 0.6080026024723487, + "grad_norm": 0.18666522204875946, + "learning_rate": 0.0001, + "loss": 1.7852, + "step": 1246 + }, + { + "epoch": 0.6084905660377359, + "grad_norm": 0.18055546283721924, + "learning_rate": 0.0001, + "loss": 1.6591, + "step": 1247 + }, + { + "epoch": 0.6089785296031229, + "grad_norm": 0.18201833963394165, + "learning_rate": 0.0001, + "loss": 1.7492, + "step": 1248 + }, + { + "epoch": 0.6094664931685101, + "grad_norm": 0.18506184220314026, + "learning_rate": 0.0001, + "loss": 1.7207, + "step": 1249 + }, + { + "epoch": 0.6099544567338973, + "grad_norm": 0.17904452979564667, + "learning_rate": 0.0001, + "loss": 1.6761, + "step": 1250 + }, + { + "epoch": 0.6104424202992843, + "grad_norm": 0.18653587996959686, + "learning_rate": 0.0001, + "loss": 1.6684, + "step": 1251 + }, + { + "epoch": 0.6109303838646715, + "grad_norm": 0.19012029469013214, + "learning_rate": 0.0001, + "loss": 1.7476, + "step": 1252 + }, + { + "epoch": 0.6114183474300585, + "grad_norm": 0.17272864282131195, + "learning_rate": 0.0001, + "loss": 1.6047, + "step": 1253 + }, + { + "epoch": 0.6119063109954457, + "grad_norm": 0.19090582430362701, + "learning_rate": 0.0001, + "loss": 1.7275, + "step": 1254 + }, + { + "epoch": 0.6123942745608328, + "grad_norm": 0.18830102682113647, + "learning_rate": 0.0001, + "loss": 1.6075, + "step": 1255 + }, + { + "epoch": 0.6128822381262199, + "grad_norm": 0.1959345042705536, + "learning_rate": 0.0001, + "loss": 1.77, + "step": 1256 + }, + { + "epoch": 0.613370201691607, + "grad_norm": 0.18547998368740082, + "learning_rate": 0.0001, + "loss": 1.6256, + "step": 1257 + }, + { + "epoch": 0.6138581652569941, + "grad_norm": 0.18706414103507996, + "learning_rate": 0.0001, + "loss": 1.6926, + "step": 1258 + }, + { + "epoch": 0.6143461288223813, + "grad_norm": 0.18563984334468842, + "learning_rate": 0.0001, + "loss": 1.6137, + "step": 1259 + }, + { + "epoch": 0.6148340923877684, + "grad_norm": 0.18717099726200104, + "learning_rate": 0.0001, + "loss": 1.717, + "step": 1260 + }, + { + "epoch": 0.6153220559531555, + "grad_norm": 0.18817085027694702, + "learning_rate": 0.0001, + "loss": 1.765, + "step": 1261 + }, + { + "epoch": 0.6158100195185426, + "grad_norm": 0.18568897247314453, + "learning_rate": 0.0001, + "loss": 1.6533, + "step": 1262 + }, + { + "epoch": 0.6162979830839297, + "grad_norm": 0.18605171144008636, + "learning_rate": 0.0001, + "loss": 1.602, + "step": 1263 + }, + { + "epoch": 0.6167859466493169, + "grad_norm": 0.19337786734104156, + "learning_rate": 0.0001, + "loss": 1.7735, + "step": 1264 + }, + { + "epoch": 0.617273910214704, + "grad_norm": 0.1964695155620575, + "learning_rate": 0.0001, + "loss": 1.6991, + "step": 1265 + }, + { + "epoch": 0.6177618737800911, + "grad_norm": 0.19506755471229553, + "learning_rate": 0.0001, + "loss": 1.6753, + "step": 1266 + }, + { + "epoch": 0.6182498373454782, + "grad_norm": 0.19231939315795898, + "learning_rate": 0.0001, + "loss": 1.6501, + "step": 1267 + }, + { + "epoch": 0.6187378009108653, + "grad_norm": 0.1804661899805069, + "learning_rate": 0.0001, + "loss": 1.7394, + "step": 1268 + }, + { + "epoch": 0.6192257644762524, + "grad_norm": 0.1843184381723404, + "learning_rate": 0.0001, + "loss": 1.7195, + "step": 1269 + }, + { + "epoch": 0.6197137280416396, + "grad_norm": 0.18432582914829254, + "learning_rate": 0.0001, + "loss": 1.6494, + "step": 1270 + }, + { + "epoch": 0.6202016916070267, + "grad_norm": 0.18056923151016235, + "learning_rate": 0.0001, + "loss": 1.6848, + "step": 1271 + }, + { + "epoch": 0.6206896551724138, + "grad_norm": 0.18145865201950073, + "learning_rate": 0.0001, + "loss": 1.5716, + "step": 1272 + }, + { + "epoch": 0.6211776187378009, + "grad_norm": 0.1727244108915329, + "learning_rate": 0.0001, + "loss": 1.5528, + "step": 1273 + }, + { + "epoch": 0.621665582303188, + "grad_norm": 0.17910513281822205, + "learning_rate": 0.0001, + "loss": 1.58, + "step": 1274 + }, + { + "epoch": 0.6221535458685752, + "grad_norm": 0.19149386882781982, + "learning_rate": 0.0001, + "loss": 1.7281, + "step": 1275 + }, + { + "epoch": 0.6226415094339622, + "grad_norm": 0.1800622195005417, + "learning_rate": 0.0001, + "loss": 1.7371, + "step": 1276 + }, + { + "epoch": 0.6231294729993494, + "grad_norm": 0.19336798787117004, + "learning_rate": 0.0001, + "loss": 1.7774, + "step": 1277 + }, + { + "epoch": 0.6236174365647364, + "grad_norm": 0.18681703507900238, + "learning_rate": 0.0001, + "loss": 1.7692, + "step": 1278 + }, + { + "epoch": 0.6241054001301236, + "grad_norm": 0.1942637413740158, + "learning_rate": 0.0001, + "loss": 1.6347, + "step": 1279 + }, + { + "epoch": 0.6245933636955108, + "grad_norm": 0.18045265972614288, + "learning_rate": 0.0001, + "loss": 1.6797, + "step": 1280 + }, + { + "epoch": 0.6250813272608978, + "grad_norm": 0.20641352236270905, + "learning_rate": 0.0001, + "loss": 1.669, + "step": 1281 + }, + { + "epoch": 0.625569290826285, + "grad_norm": 0.1820315718650818, + "learning_rate": 0.0001, + "loss": 1.6098, + "step": 1282 + }, + { + "epoch": 0.626057254391672, + "grad_norm": 0.1736179143190384, + "learning_rate": 0.0001, + "loss": 1.4763, + "step": 1283 + }, + { + "epoch": 0.6265452179570592, + "grad_norm": 0.18899646401405334, + "learning_rate": 0.0001, + "loss": 1.6033, + "step": 1284 + }, + { + "epoch": 0.6270331815224464, + "grad_norm": 0.18059246242046356, + "learning_rate": 0.0001, + "loss": 1.6594, + "step": 1285 + }, + { + "epoch": 0.6275211450878334, + "grad_norm": 0.1879289597272873, + "learning_rate": 0.0001, + "loss": 1.7003, + "step": 1286 + }, + { + "epoch": 0.6280091086532206, + "grad_norm": 0.1910688430070877, + "learning_rate": 0.0001, + "loss": 1.64, + "step": 1287 + }, + { + "epoch": 0.6284970722186076, + "grad_norm": 0.18205375969409943, + "learning_rate": 0.0001, + "loss": 1.6633, + "step": 1288 + }, + { + "epoch": 0.6289850357839948, + "grad_norm": 0.1857621818780899, + "learning_rate": 0.0001, + "loss": 1.6572, + "step": 1289 + }, + { + "epoch": 0.629472999349382, + "grad_norm": 0.18360133469104767, + "learning_rate": 0.0001, + "loss": 1.6645, + "step": 1290 + }, + { + "epoch": 0.629960962914769, + "grad_norm": 0.1781957596540451, + "learning_rate": 0.0001, + "loss": 1.6996, + "step": 1291 + }, + { + "epoch": 0.6304489264801562, + "grad_norm": 0.16980469226837158, + "learning_rate": 0.0001, + "loss": 1.5369, + "step": 1292 + }, + { + "epoch": 0.6309368900455433, + "grad_norm": 0.19171112775802612, + "learning_rate": 0.0001, + "loss": 1.7421, + "step": 1293 + }, + { + "epoch": 0.6314248536109304, + "grad_norm": 0.1753898411989212, + "learning_rate": 0.0001, + "loss": 1.67, + "step": 1294 + }, + { + "epoch": 0.6319128171763175, + "grad_norm": 0.1746547669172287, + "learning_rate": 0.0001, + "loss": 1.6704, + "step": 1295 + }, + { + "epoch": 0.6324007807417046, + "grad_norm": 0.19930396974086761, + "learning_rate": 0.0001, + "loss": 1.7406, + "step": 1296 + }, + { + "epoch": 0.6328887443070917, + "grad_norm": 0.18658863008022308, + "learning_rate": 0.0001, + "loss": 1.6432, + "step": 1297 + }, + { + "epoch": 0.6333767078724789, + "grad_norm": 0.19694779813289642, + "learning_rate": 0.0001, + "loss": 1.6941, + "step": 1298 + }, + { + "epoch": 0.633864671437866, + "grad_norm": 0.18741701543331146, + "learning_rate": 0.0001, + "loss": 1.7258, + "step": 1299 + }, + { + "epoch": 0.6343526350032531, + "grad_norm": 0.18276375532150269, + "learning_rate": 0.0001, + "loss": 1.5375, + "step": 1300 + }, + { + "epoch": 0.6348405985686402, + "grad_norm": 0.19398179650306702, + "learning_rate": 0.0001, + "loss": 1.7516, + "step": 1301 + }, + { + "epoch": 0.6353285621340273, + "grad_norm": 0.18019351363182068, + "learning_rate": 0.0001, + "loss": 1.6373, + "step": 1302 + }, + { + "epoch": 0.6358165256994145, + "grad_norm": 0.17721763253211975, + "learning_rate": 0.0001, + "loss": 1.518, + "step": 1303 + }, + { + "epoch": 0.6363044892648015, + "grad_norm": 0.20623503625392914, + "learning_rate": 0.0001, + "loss": 1.7557, + "step": 1304 + }, + { + "epoch": 0.6367924528301887, + "grad_norm": 0.17641960084438324, + "learning_rate": 0.0001, + "loss": 1.6621, + "step": 1305 + }, + { + "epoch": 0.6372804163955758, + "grad_norm": 0.18402546644210815, + "learning_rate": 0.0001, + "loss": 1.7142, + "step": 1306 + }, + { + "epoch": 0.6377683799609629, + "grad_norm": 0.18915514647960663, + "learning_rate": 0.0001, + "loss": 1.7845, + "step": 1307 + }, + { + "epoch": 0.6382563435263501, + "grad_norm": 0.17517401278018951, + "learning_rate": 0.0001, + "loss": 1.6117, + "step": 1308 + }, + { + "epoch": 0.6387443070917371, + "grad_norm": 0.1806933432817459, + "learning_rate": 0.0001, + "loss": 1.6734, + "step": 1309 + }, + { + "epoch": 0.6392322706571243, + "grad_norm": 0.1815006136894226, + "learning_rate": 0.0001, + "loss": 1.6428, + "step": 1310 + }, + { + "epoch": 0.6397202342225113, + "grad_norm": 0.18762600421905518, + "learning_rate": 0.0001, + "loss": 1.6741, + "step": 1311 + }, + { + "epoch": 0.6402081977878985, + "grad_norm": 0.17323340475559235, + "learning_rate": 0.0001, + "loss": 1.5812, + "step": 1312 + }, + { + "epoch": 0.6406961613532857, + "grad_norm": 0.18929678201675415, + "learning_rate": 0.0001, + "loss": 1.6747, + "step": 1313 + }, + { + "epoch": 0.6411841249186727, + "grad_norm": 0.18279722332954407, + "learning_rate": 0.0001, + "loss": 1.6429, + "step": 1314 + }, + { + "epoch": 0.6416720884840599, + "grad_norm": 0.1740237921476364, + "learning_rate": 0.0001, + "loss": 1.6276, + "step": 1315 + }, + { + "epoch": 0.6421600520494469, + "grad_norm": 0.18474610149860382, + "learning_rate": 0.0001, + "loss": 1.6444, + "step": 1316 + }, + { + "epoch": 0.6426480156148341, + "grad_norm": 0.18610845506191254, + "learning_rate": 0.0001, + "loss": 1.7036, + "step": 1317 + }, + { + "epoch": 0.6431359791802213, + "grad_norm": 0.18621689081192017, + "learning_rate": 0.0001, + "loss": 1.7495, + "step": 1318 + }, + { + "epoch": 0.6436239427456083, + "grad_norm": 0.1806156188249588, + "learning_rate": 0.0001, + "loss": 1.6756, + "step": 1319 + }, + { + "epoch": 0.6441119063109955, + "grad_norm": 0.18515653908252716, + "learning_rate": 0.0001, + "loss": 1.5946, + "step": 1320 + }, + { + "epoch": 0.6445998698763825, + "grad_norm": 0.17863605916500092, + "learning_rate": 0.0001, + "loss": 1.6519, + "step": 1321 + }, + { + "epoch": 0.6450878334417697, + "grad_norm": 0.17926158010959625, + "learning_rate": 0.0001, + "loss": 1.6493, + "step": 1322 + }, + { + "epoch": 0.6455757970071568, + "grad_norm": 0.19456753134727478, + "learning_rate": 0.0001, + "loss": 1.7178, + "step": 1323 + }, + { + "epoch": 0.6460637605725439, + "grad_norm": 0.17429687082767487, + "learning_rate": 0.0001, + "loss": 1.5933, + "step": 1324 + }, + { + "epoch": 0.646551724137931, + "grad_norm": 0.18286961317062378, + "learning_rate": 0.0001, + "loss": 1.7709, + "step": 1325 + }, + { + "epoch": 0.6470396877033181, + "grad_norm": 0.18863536417484283, + "learning_rate": 0.0001, + "loss": 1.707, + "step": 1326 + }, + { + "epoch": 0.6475276512687053, + "grad_norm": 0.18341319262981415, + "learning_rate": 0.0001, + "loss": 1.6812, + "step": 1327 + }, + { + "epoch": 0.6480156148340924, + "grad_norm": 0.2063741534948349, + "learning_rate": 0.0001, + "loss": 1.7651, + "step": 1328 + }, + { + "epoch": 0.6485035783994795, + "grad_norm": 0.18247577548027039, + "learning_rate": 0.0001, + "loss": 1.6966, + "step": 1329 + }, + { + "epoch": 0.6489915419648666, + "grad_norm": 0.18382957577705383, + "learning_rate": 0.0001, + "loss": 1.6692, + "step": 1330 + }, + { + "epoch": 0.6494795055302537, + "grad_norm": 0.19317232072353363, + "learning_rate": 0.0001, + "loss": 1.6594, + "step": 1331 + }, + { + "epoch": 0.6499674690956408, + "grad_norm": 0.18904589116573334, + "learning_rate": 0.0001, + "loss": 1.7119, + "step": 1332 + }, + { + "epoch": 0.650455432661028, + "grad_norm": 0.19716934859752655, + "learning_rate": 0.0001, + "loss": 1.639, + "step": 1333 + }, + { + "epoch": 0.6509433962264151, + "grad_norm": 0.18755610287189484, + "learning_rate": 0.0001, + "loss": 1.7698, + "step": 1334 + }, + { + "epoch": 0.6514313597918022, + "grad_norm": 0.18484559655189514, + "learning_rate": 0.0001, + "loss": 1.6491, + "step": 1335 + }, + { + "epoch": 0.6519193233571894, + "grad_norm": 0.1828099489212036, + "learning_rate": 0.0001, + "loss": 1.7007, + "step": 1336 + }, + { + "epoch": 0.6524072869225764, + "grad_norm": 0.1784173548221588, + "learning_rate": 0.0001, + "loss": 1.7014, + "step": 1337 + }, + { + "epoch": 0.6528952504879636, + "grad_norm": 0.18998552858829498, + "learning_rate": 0.0001, + "loss": 1.7141, + "step": 1338 + }, + { + "epoch": 0.6533832140533506, + "grad_norm": 0.18050242960453033, + "learning_rate": 0.0001, + "loss": 1.5419, + "step": 1339 + }, + { + "epoch": 0.6538711776187378, + "grad_norm": 0.17193758487701416, + "learning_rate": 0.0001, + "loss": 1.6478, + "step": 1340 + }, + { + "epoch": 0.654359141184125, + "grad_norm": 0.1884956806898117, + "learning_rate": 0.0001, + "loss": 1.6398, + "step": 1341 + }, + { + "epoch": 0.654847104749512, + "grad_norm": 0.18393546342849731, + "learning_rate": 0.0001, + "loss": 1.6343, + "step": 1342 + }, + { + "epoch": 0.6553350683148992, + "grad_norm": 0.17768503725528717, + "learning_rate": 0.0001, + "loss": 1.6551, + "step": 1343 + }, + { + "epoch": 0.6558230318802862, + "grad_norm": 0.1898057609796524, + "learning_rate": 0.0001, + "loss": 1.6254, + "step": 1344 + }, + { + "epoch": 0.6563109954456734, + "grad_norm": 0.18605998158454895, + "learning_rate": 0.0001, + "loss": 1.6671, + "step": 1345 + }, + { + "epoch": 0.6567989590110606, + "grad_norm": 0.17978940904140472, + "learning_rate": 0.0001, + "loss": 1.6983, + "step": 1346 + }, + { + "epoch": 0.6572869225764476, + "grad_norm": 0.18616552650928497, + "learning_rate": 0.0001, + "loss": 1.7164, + "step": 1347 + }, + { + "epoch": 0.6577748861418348, + "grad_norm": 0.1831643134355545, + "learning_rate": 0.0001, + "loss": 1.6738, + "step": 1348 + }, + { + "epoch": 0.6582628497072218, + "grad_norm": 0.18316183984279633, + "learning_rate": 0.0001, + "loss": 1.6432, + "step": 1349 + }, + { + "epoch": 0.658750813272609, + "grad_norm": 0.1927359700202942, + "learning_rate": 0.0001, + "loss": 1.7889, + "step": 1350 + }, + { + "epoch": 0.6592387768379961, + "grad_norm": 0.17629116773605347, + "learning_rate": 0.0001, + "loss": 1.6653, + "step": 1351 + }, + { + "epoch": 0.6597267404033832, + "grad_norm": 0.17519547045230865, + "learning_rate": 0.0001, + "loss": 1.7317, + "step": 1352 + }, + { + "epoch": 0.6602147039687704, + "grad_norm": 0.1796695441007614, + "learning_rate": 0.0001, + "loss": 1.6736, + "step": 1353 + }, + { + "epoch": 0.6607026675341574, + "grad_norm": 0.17477834224700928, + "learning_rate": 0.0001, + "loss": 1.593, + "step": 1354 + }, + { + "epoch": 0.6611906310995446, + "grad_norm": 0.1775176078081131, + "learning_rate": 0.0001, + "loss": 1.6762, + "step": 1355 + }, + { + "epoch": 0.6616785946649317, + "grad_norm": 0.18086951971054077, + "learning_rate": 0.0001, + "loss": 1.7373, + "step": 1356 + }, + { + "epoch": 0.6621665582303188, + "grad_norm": 0.18718330562114716, + "learning_rate": 0.0001, + "loss": 1.74, + "step": 1357 + }, + { + "epoch": 0.6626545217957059, + "grad_norm": 0.19314613938331604, + "learning_rate": 0.0001, + "loss": 1.7523, + "step": 1358 + }, + { + "epoch": 0.663142485361093, + "grad_norm": 0.18622739613056183, + "learning_rate": 0.0001, + "loss": 1.6772, + "step": 1359 + }, + { + "epoch": 0.6636304489264802, + "grad_norm": 0.18634377419948578, + "learning_rate": 0.0001, + "loss": 1.7391, + "step": 1360 + }, + { + "epoch": 0.6641184124918673, + "grad_norm": 0.19122491776943207, + "learning_rate": 0.0001, + "loss": 1.6955, + "step": 1361 + }, + { + "epoch": 0.6646063760572544, + "grad_norm": 0.18408794701099396, + "learning_rate": 0.0001, + "loss": 1.6577, + "step": 1362 + }, + { + "epoch": 0.6650943396226415, + "grad_norm": 0.17737893760204315, + "learning_rate": 0.0001, + "loss": 1.6305, + "step": 1363 + }, + { + "epoch": 0.6655823031880286, + "grad_norm": 0.19772779941558838, + "learning_rate": 0.0001, + "loss": 1.5194, + "step": 1364 + }, + { + "epoch": 0.6660702667534157, + "grad_norm": 0.18935418128967285, + "learning_rate": 0.0001, + "loss": 1.759, + "step": 1365 + }, + { + "epoch": 0.6665582303188029, + "grad_norm": 0.1936458796262741, + "learning_rate": 0.0001, + "loss": 1.672, + "step": 1366 + }, + { + "epoch": 0.66704619388419, + "grad_norm": 0.18454033136367798, + "learning_rate": 0.0001, + "loss": 1.6954, + "step": 1367 + }, + { + "epoch": 0.6675341574495771, + "grad_norm": 0.18430058658123016, + "learning_rate": 0.0001, + "loss": 1.5498, + "step": 1368 + }, + { + "epoch": 0.6680221210149642, + "grad_norm": 0.1890435814857483, + "learning_rate": 0.0001, + "loss": 1.6311, + "step": 1369 + }, + { + "epoch": 0.6685100845803513, + "grad_norm": 0.20457454025745392, + "learning_rate": 0.0001, + "loss": 1.8362, + "step": 1370 + }, + { + "epoch": 0.6689980481457385, + "grad_norm": 0.18559999763965607, + "learning_rate": 0.0001, + "loss": 1.6873, + "step": 1371 + }, + { + "epoch": 0.6694860117111255, + "grad_norm": 0.1795533299446106, + "learning_rate": 0.0001, + "loss": 1.5727, + "step": 1372 + }, + { + "epoch": 0.6699739752765127, + "grad_norm": 0.17595787346363068, + "learning_rate": 0.0001, + "loss": 1.6365, + "step": 1373 + }, + { + "epoch": 0.6704619388418998, + "grad_norm": 0.18084342777729034, + "learning_rate": 0.0001, + "loss": 1.728, + "step": 1374 + }, + { + "epoch": 0.6709499024072869, + "grad_norm": 0.18217813968658447, + "learning_rate": 0.0001, + "loss": 1.5737, + "step": 1375 + }, + { + "epoch": 0.6714378659726741, + "grad_norm": 0.19856606423854828, + "learning_rate": 0.0001, + "loss": 1.7181, + "step": 1376 + }, + { + "epoch": 0.6719258295380611, + "grad_norm": 0.18344613909721375, + "learning_rate": 0.0001, + "loss": 1.6762, + "step": 1377 + }, + { + "epoch": 0.6724137931034483, + "grad_norm": 0.1860368400812149, + "learning_rate": 0.0001, + "loss": 1.7163, + "step": 1378 + }, + { + "epoch": 0.6729017566688354, + "grad_norm": 0.1970399022102356, + "learning_rate": 0.0001, + "loss": 1.6938, + "step": 1379 + }, + { + "epoch": 0.6733897202342225, + "grad_norm": 0.18704956769943237, + "learning_rate": 0.0001, + "loss": 1.68, + "step": 1380 + }, + { + "epoch": 0.6738776837996097, + "grad_norm": 0.18241243064403534, + "learning_rate": 0.0001, + "loss": 1.556, + "step": 1381 + }, + { + "epoch": 0.6743656473649967, + "grad_norm": 0.1883855015039444, + "learning_rate": 0.0001, + "loss": 1.6499, + "step": 1382 + }, + { + "epoch": 0.6748536109303839, + "grad_norm": 0.19078750908374786, + "learning_rate": 0.0001, + "loss": 1.6102, + "step": 1383 + }, + { + "epoch": 0.675341574495771, + "grad_norm": 0.18574097752571106, + "learning_rate": 0.0001, + "loss": 1.6377, + "step": 1384 + }, + { + "epoch": 0.6758295380611581, + "grad_norm": 0.18113356828689575, + "learning_rate": 0.0001, + "loss": 1.6558, + "step": 1385 + }, + { + "epoch": 0.6763175016265452, + "grad_norm": 0.1948838084936142, + "learning_rate": 0.0001, + "loss": 1.6773, + "step": 1386 + }, + { + "epoch": 0.6768054651919323, + "grad_norm": 0.18788839876651764, + "learning_rate": 0.0001, + "loss": 1.6001, + "step": 1387 + }, + { + "epoch": 0.6772934287573195, + "grad_norm": 0.18287061154842377, + "learning_rate": 0.0001, + "loss": 1.6765, + "step": 1388 + }, + { + "epoch": 0.6777813923227066, + "grad_norm": 0.1944217085838318, + "learning_rate": 0.0001, + "loss": 1.7176, + "step": 1389 + }, + { + "epoch": 0.6782693558880937, + "grad_norm": 0.19344620406627655, + "learning_rate": 0.0001, + "loss": 1.7028, + "step": 1390 + }, + { + "epoch": 0.6787573194534808, + "grad_norm": 0.18965065479278564, + "learning_rate": 0.0001, + "loss": 1.6866, + "step": 1391 + }, + { + "epoch": 0.6792452830188679, + "grad_norm": 0.18736335635185242, + "learning_rate": 0.0001, + "loss": 1.6367, + "step": 1392 + }, + { + "epoch": 0.679733246584255, + "grad_norm": 0.19293658435344696, + "learning_rate": 0.0001, + "loss": 1.6772, + "step": 1393 + }, + { + "epoch": 0.6802212101496422, + "grad_norm": 0.1829851269721985, + "learning_rate": 0.0001, + "loss": 1.5907, + "step": 1394 + }, + { + "epoch": 0.6807091737150293, + "grad_norm": 0.18881766498088837, + "learning_rate": 0.0001, + "loss": 1.6568, + "step": 1395 + }, + { + "epoch": 0.6811971372804164, + "grad_norm": 0.1797613501548767, + "learning_rate": 0.0001, + "loss": 1.7498, + "step": 1396 + }, + { + "epoch": 0.6816851008458035, + "grad_norm": 0.19345468282699585, + "learning_rate": 0.0001, + "loss": 1.7604, + "step": 1397 + }, + { + "epoch": 0.6821730644111906, + "grad_norm": 0.1797412633895874, + "learning_rate": 0.0001, + "loss": 1.6099, + "step": 1398 + }, + { + "epoch": 0.6826610279765778, + "grad_norm": 0.17828069627285004, + "learning_rate": 0.0001, + "loss": 1.5906, + "step": 1399 + }, + { + "epoch": 0.6831489915419648, + "grad_norm": 0.17425017058849335, + "learning_rate": 0.0001, + "loss": 1.655, + "step": 1400 + }, + { + "epoch": 0.683636955107352, + "grad_norm": 0.1832888275384903, + "learning_rate": 0.0001, + "loss": 1.6294, + "step": 1401 + }, + { + "epoch": 0.6841249186727391, + "grad_norm": 0.17417742311954498, + "learning_rate": 0.0001, + "loss": 1.6001, + "step": 1402 + }, + { + "epoch": 0.6846128822381262, + "grad_norm": 0.17659293115139008, + "learning_rate": 0.0001, + "loss": 1.6074, + "step": 1403 + }, + { + "epoch": 0.6851008458035134, + "grad_norm": 0.18741555511951447, + "learning_rate": 0.0001, + "loss": 1.6052, + "step": 1404 + }, + { + "epoch": 0.6855888093689004, + "grad_norm": 0.20556053519248962, + "learning_rate": 0.0001, + "loss": 1.8142, + "step": 1405 + }, + { + "epoch": 0.6860767729342876, + "grad_norm": 0.1742892563343048, + "learning_rate": 0.0001, + "loss": 1.5287, + "step": 1406 + }, + { + "epoch": 0.6865647364996746, + "grad_norm": 0.17847485840320587, + "learning_rate": 0.0001, + "loss": 1.5856, + "step": 1407 + }, + { + "epoch": 0.6870527000650618, + "grad_norm": 0.17445972561836243, + "learning_rate": 0.0001, + "loss": 1.6276, + "step": 1408 + }, + { + "epoch": 0.687540663630449, + "grad_norm": 0.20477156341075897, + "learning_rate": 0.0001, + "loss": 1.8143, + "step": 1409 + }, + { + "epoch": 0.688028627195836, + "grad_norm": 0.19618012011051178, + "learning_rate": 0.0001, + "loss": 1.7586, + "step": 1410 + }, + { + "epoch": 0.6885165907612232, + "grad_norm": 0.18680322170257568, + "learning_rate": 0.0001, + "loss": 1.5995, + "step": 1411 + }, + { + "epoch": 0.6890045543266102, + "grad_norm": 0.18768328428268433, + "learning_rate": 0.0001, + "loss": 1.6376, + "step": 1412 + }, + { + "epoch": 0.6894925178919974, + "grad_norm": 0.1906110793352127, + "learning_rate": 0.0001, + "loss": 1.6941, + "step": 1413 + }, + { + "epoch": 0.6899804814573846, + "grad_norm": 0.182110995054245, + "learning_rate": 0.0001, + "loss": 1.7078, + "step": 1414 + }, + { + "epoch": 0.6904684450227716, + "grad_norm": 0.182440385222435, + "learning_rate": 0.0001, + "loss": 1.5876, + "step": 1415 + }, + { + "epoch": 0.6909564085881588, + "grad_norm": 0.1977446973323822, + "learning_rate": 0.0001, + "loss": 1.7053, + "step": 1416 + }, + { + "epoch": 0.6914443721535458, + "grad_norm": 0.18622663617134094, + "learning_rate": 0.0001, + "loss": 1.6786, + "step": 1417 + }, + { + "epoch": 0.691932335718933, + "grad_norm": 0.1755010485649109, + "learning_rate": 0.0001, + "loss": 1.6419, + "step": 1418 + }, + { + "epoch": 0.6924202992843201, + "grad_norm": 0.18327659368515015, + "learning_rate": 0.0001, + "loss": 1.6468, + "step": 1419 + }, + { + "epoch": 0.6929082628497072, + "grad_norm": 0.19392666220664978, + "learning_rate": 0.0001, + "loss": 1.7399, + "step": 1420 + }, + { + "epoch": 0.6933962264150944, + "grad_norm": 0.17711861431598663, + "learning_rate": 0.0001, + "loss": 1.5096, + "step": 1421 + }, + { + "epoch": 0.6938841899804814, + "grad_norm": 0.19558769464492798, + "learning_rate": 0.0001, + "loss": 1.6734, + "step": 1422 + }, + { + "epoch": 0.6943721535458686, + "grad_norm": 0.18583737313747406, + "learning_rate": 0.0001, + "loss": 1.5908, + "step": 1423 + }, + { + "epoch": 0.6948601171112557, + "grad_norm": 0.19811202585697174, + "learning_rate": 0.0001, + "loss": 1.685, + "step": 1424 + }, + { + "epoch": 0.6953480806766428, + "grad_norm": 0.1776018738746643, + "learning_rate": 0.0001, + "loss": 1.5341, + "step": 1425 + }, + { + "epoch": 0.6958360442420299, + "grad_norm": 0.18943732976913452, + "learning_rate": 0.0001, + "loss": 1.7079, + "step": 1426 + }, + { + "epoch": 0.6963240078074171, + "grad_norm": 0.1881314069032669, + "learning_rate": 0.0001, + "loss": 1.7115, + "step": 1427 + }, + { + "epoch": 0.6968119713728042, + "grad_norm": 0.17805875837802887, + "learning_rate": 0.0001, + "loss": 1.7172, + "step": 1428 + }, + { + "epoch": 0.6972999349381913, + "grad_norm": 0.18958143889904022, + "learning_rate": 0.0001, + "loss": 1.6772, + "step": 1429 + }, + { + "epoch": 0.6977878985035784, + "grad_norm": 0.18394924700260162, + "learning_rate": 0.0001, + "loss": 1.7144, + "step": 1430 + }, + { + "epoch": 0.6982758620689655, + "grad_norm": 0.1808011531829834, + "learning_rate": 0.0001, + "loss": 1.6236, + "step": 1431 + }, + { + "epoch": 0.6987638256343527, + "grad_norm": 0.19299408793449402, + "learning_rate": 0.0001, + "loss": 1.7708, + "step": 1432 + }, + { + "epoch": 0.6992517891997397, + "grad_norm": 0.18791189789772034, + "learning_rate": 0.0001, + "loss": 1.6471, + "step": 1433 + }, + { + "epoch": 0.6997397527651269, + "grad_norm": 0.18509036302566528, + "learning_rate": 0.0001, + "loss": 1.6443, + "step": 1434 + }, + { + "epoch": 0.700227716330514, + "grad_norm": 0.181147500872612, + "learning_rate": 0.0001, + "loss": 1.6402, + "step": 1435 + }, + { + "epoch": 0.7007156798959011, + "grad_norm": 0.18755219876766205, + "learning_rate": 0.0001, + "loss": 1.616, + "step": 1436 + }, + { + "epoch": 0.7012036434612883, + "grad_norm": 0.1845817118883133, + "learning_rate": 0.0001, + "loss": 1.6785, + "step": 1437 + }, + { + "epoch": 0.7016916070266753, + "grad_norm": 0.1759112924337387, + "learning_rate": 0.0001, + "loss": 1.5915, + "step": 1438 + }, + { + "epoch": 0.7021795705920625, + "grad_norm": 0.19621609151363373, + "learning_rate": 0.0001, + "loss": 1.7141, + "step": 1439 + }, + { + "epoch": 0.7026675341574495, + "grad_norm": 0.19262106716632843, + "learning_rate": 0.0001, + "loss": 1.6559, + "step": 1440 + }, + { + "epoch": 0.7031554977228367, + "grad_norm": 0.18579107522964478, + "learning_rate": 0.0001, + "loss": 1.7255, + "step": 1441 + }, + { + "epoch": 0.7036434612882239, + "grad_norm": 0.1896117478609085, + "learning_rate": 0.0001, + "loss": 1.6592, + "step": 1442 + }, + { + "epoch": 0.7041314248536109, + "grad_norm": 0.19127638638019562, + "learning_rate": 0.0001, + "loss": 1.6569, + "step": 1443 + }, + { + "epoch": 0.7046193884189981, + "grad_norm": 0.18615250289440155, + "learning_rate": 0.0001, + "loss": 1.6458, + "step": 1444 + }, + { + "epoch": 0.7051073519843851, + "grad_norm": 0.20700882375240326, + "learning_rate": 0.0001, + "loss": 1.7548, + "step": 1445 + }, + { + "epoch": 0.7055953155497723, + "grad_norm": 0.18569111824035645, + "learning_rate": 0.0001, + "loss": 1.6141, + "step": 1446 + }, + { + "epoch": 0.7060832791151594, + "grad_norm": 0.1929357945919037, + "learning_rate": 0.0001, + "loss": 1.6579, + "step": 1447 + }, + { + "epoch": 0.7065712426805465, + "grad_norm": 0.19292321801185608, + "learning_rate": 0.0001, + "loss": 1.7026, + "step": 1448 + }, + { + "epoch": 0.7070592062459337, + "grad_norm": 0.1860547810792923, + "learning_rate": 0.0001, + "loss": 1.7159, + "step": 1449 + }, + { + "epoch": 0.7075471698113207, + "grad_norm": 0.18196798861026764, + "learning_rate": 0.0001, + "loss": 1.6683, + "step": 1450 + }, + { + "epoch": 0.7080351333767079, + "grad_norm": 0.1784311681985855, + "learning_rate": 0.0001, + "loss": 1.591, + "step": 1451 + }, + { + "epoch": 0.708523096942095, + "grad_norm": 0.1859826296567917, + "learning_rate": 0.0001, + "loss": 1.686, + "step": 1452 + }, + { + "epoch": 0.7090110605074821, + "grad_norm": 0.18661972880363464, + "learning_rate": 0.0001, + "loss": 1.6035, + "step": 1453 + }, + { + "epoch": 0.7094990240728692, + "grad_norm": 0.18923242390155792, + "learning_rate": 0.0001, + "loss": 1.7313, + "step": 1454 + }, + { + "epoch": 0.7099869876382563, + "grad_norm": 0.1933595836162567, + "learning_rate": 0.0001, + "loss": 1.7847, + "step": 1455 + }, + { + "epoch": 0.7104749512036435, + "grad_norm": 0.19189570844173431, + "learning_rate": 0.0001, + "loss": 1.6822, + "step": 1456 + }, + { + "epoch": 0.7109629147690306, + "grad_norm": 0.18658806383609772, + "learning_rate": 0.0001, + "loss": 1.7259, + "step": 1457 + }, + { + "epoch": 0.7114508783344177, + "grad_norm": 0.19276390969753265, + "learning_rate": 0.0001, + "loss": 1.7137, + "step": 1458 + }, + { + "epoch": 0.7119388418998048, + "grad_norm": 0.19343958795070648, + "learning_rate": 0.0001, + "loss": 1.85, + "step": 1459 + }, + { + "epoch": 0.7124268054651919, + "grad_norm": 0.18348975479602814, + "learning_rate": 0.0001, + "loss": 1.6551, + "step": 1460 + }, + { + "epoch": 0.712914769030579, + "grad_norm": 0.18842749297618866, + "learning_rate": 0.0001, + "loss": 1.6364, + "step": 1461 + }, + { + "epoch": 0.7134027325959662, + "grad_norm": 0.1927134394645691, + "learning_rate": 0.0001, + "loss": 1.7096, + "step": 1462 + }, + { + "epoch": 0.7138906961613533, + "grad_norm": 0.18802672624588013, + "learning_rate": 0.0001, + "loss": 1.6056, + "step": 1463 + }, + { + "epoch": 0.7143786597267404, + "grad_norm": 0.19619612395763397, + "learning_rate": 0.0001, + "loss": 1.774, + "step": 1464 + }, + { + "epoch": 0.7148666232921275, + "grad_norm": 0.18309350311756134, + "learning_rate": 0.0001, + "loss": 1.7173, + "step": 1465 + }, + { + "epoch": 0.7153545868575146, + "grad_norm": 0.190412700176239, + "learning_rate": 0.0001, + "loss": 1.6976, + "step": 1466 + }, + { + "epoch": 0.7158425504229018, + "grad_norm": 0.1945875585079193, + "learning_rate": 0.0001, + "loss": 1.6, + "step": 1467 + }, + { + "epoch": 0.7163305139882888, + "grad_norm": 0.19393402338027954, + "learning_rate": 0.0001, + "loss": 1.6569, + "step": 1468 + }, + { + "epoch": 0.716818477553676, + "grad_norm": 0.19331271946430206, + "learning_rate": 0.0001, + "loss": 1.6866, + "step": 1469 + }, + { + "epoch": 0.7173064411190632, + "grad_norm": 0.18295609951019287, + "learning_rate": 0.0001, + "loss": 1.6991, + "step": 1470 + }, + { + "epoch": 0.7177944046844502, + "grad_norm": 0.1889050453901291, + "learning_rate": 0.0001, + "loss": 1.6893, + "step": 1471 + }, + { + "epoch": 0.7182823682498374, + "grad_norm": 0.18971437215805054, + "learning_rate": 0.0001, + "loss": 1.6638, + "step": 1472 + }, + { + "epoch": 0.7187703318152244, + "grad_norm": 0.19291648268699646, + "learning_rate": 0.0001, + "loss": 1.5201, + "step": 1473 + }, + { + "epoch": 0.7192582953806116, + "grad_norm": 0.18243971467018127, + "learning_rate": 0.0001, + "loss": 1.6197, + "step": 1474 + }, + { + "epoch": 0.7197462589459988, + "grad_norm": 0.2129974663257599, + "learning_rate": 0.0001, + "loss": 1.6374, + "step": 1475 + }, + { + "epoch": 0.7202342225113858, + "grad_norm": 0.23039822280406952, + "learning_rate": 0.0001, + "loss": 1.7185, + "step": 1476 + }, + { + "epoch": 0.720722186076773, + "grad_norm": 0.19735904037952423, + "learning_rate": 0.0001, + "loss": 1.7196, + "step": 1477 + }, + { + "epoch": 0.72121014964216, + "grad_norm": 0.19186371564865112, + "learning_rate": 0.0001, + "loss": 1.6343, + "step": 1478 + }, + { + "epoch": 0.7216981132075472, + "grad_norm": 0.1926344484090805, + "learning_rate": 0.0001, + "loss": 1.5566, + "step": 1479 + }, + { + "epoch": 0.7221860767729343, + "grad_norm": 0.18684318661689758, + "learning_rate": 0.0001, + "loss": 1.6809, + "step": 1480 + }, + { + "epoch": 0.7226740403383214, + "grad_norm": 0.17722366750240326, + "learning_rate": 0.0001, + "loss": 1.5618, + "step": 1481 + }, + { + "epoch": 0.7231620039037086, + "grad_norm": 0.17876088619232178, + "learning_rate": 0.0001, + "loss": 1.5692, + "step": 1482 + }, + { + "epoch": 0.7236499674690956, + "grad_norm": 0.19231794774532318, + "learning_rate": 0.0001, + "loss": 1.7443, + "step": 1483 + }, + { + "epoch": 0.7241379310344828, + "grad_norm": 0.19395712018013, + "learning_rate": 0.0001, + "loss": 1.6295, + "step": 1484 + }, + { + "epoch": 0.7246258945998699, + "grad_norm": 0.18045175075531006, + "learning_rate": 0.0001, + "loss": 1.636, + "step": 1485 + }, + { + "epoch": 0.725113858165257, + "grad_norm": 0.19451959431171417, + "learning_rate": 0.0001, + "loss": 1.7643, + "step": 1486 + }, + { + "epoch": 0.7256018217306441, + "grad_norm": 0.18595635890960693, + "learning_rate": 0.0001, + "loss": 1.7065, + "step": 1487 + }, + { + "epoch": 0.7260897852960312, + "grad_norm": 0.1827252060174942, + "learning_rate": 0.0001, + "loss": 1.6617, + "step": 1488 + }, + { + "epoch": 0.7265777488614183, + "grad_norm": 0.18135575950145721, + "learning_rate": 0.0001, + "loss": 1.5711, + "step": 1489 + }, + { + "epoch": 0.7270657124268055, + "grad_norm": 0.17621076107025146, + "learning_rate": 0.0001, + "loss": 1.4973, + "step": 1490 + }, + { + "epoch": 0.7275536759921926, + "grad_norm": 0.17968687415122986, + "learning_rate": 0.0001, + "loss": 1.5858, + "step": 1491 + }, + { + "epoch": 0.7280416395575797, + "grad_norm": 0.19981656968593597, + "learning_rate": 0.0001, + "loss": 1.6636, + "step": 1492 + }, + { + "epoch": 0.7285296031229668, + "grad_norm": 0.18831577897071838, + "learning_rate": 0.0001, + "loss": 1.6519, + "step": 1493 + }, + { + "epoch": 0.7290175666883539, + "grad_norm": 0.18130916357040405, + "learning_rate": 0.0001, + "loss": 1.7075, + "step": 1494 + }, + { + "epoch": 0.7295055302537411, + "grad_norm": 0.18603913486003876, + "learning_rate": 0.0001, + "loss": 1.6399, + "step": 1495 + }, + { + "epoch": 0.7299934938191281, + "grad_norm": 0.18082112073898315, + "learning_rate": 0.0001, + "loss": 1.5542, + "step": 1496 + }, + { + "epoch": 0.7304814573845153, + "grad_norm": 0.19021904468536377, + "learning_rate": 0.0001, + "loss": 1.6491, + "step": 1497 + }, + { + "epoch": 0.7309694209499024, + "grad_norm": 0.18982046842575073, + "learning_rate": 0.0001, + "loss": 1.7551, + "step": 1498 + }, + { + "epoch": 0.7314573845152895, + "grad_norm": 0.17388087511062622, + "learning_rate": 0.0001, + "loss": 1.5713, + "step": 1499 + }, + { + "epoch": 0.7319453480806767, + "grad_norm": 0.18898619711399078, + "learning_rate": 0.0001, + "loss": 1.6261, + "step": 1500 + }, + { + "epoch": 0.7324333116460637, + "grad_norm": 0.1856594830751419, + "learning_rate": 0.0001, + "loss": 1.7416, + "step": 1501 + }, + { + "epoch": 0.7329212752114509, + "grad_norm": 0.1952165812253952, + "learning_rate": 0.0001, + "loss": 1.8623, + "step": 1502 + }, + { + "epoch": 0.733409238776838, + "grad_norm": 0.18985500931739807, + "learning_rate": 0.0001, + "loss": 1.6428, + "step": 1503 + }, + { + "epoch": 0.7338972023422251, + "grad_norm": 0.19051025807857513, + "learning_rate": 0.0001, + "loss": 1.6933, + "step": 1504 + }, + { + "epoch": 0.7343851659076123, + "grad_norm": 0.19088402390480042, + "learning_rate": 0.0001, + "loss": 1.7145, + "step": 1505 + }, + { + "epoch": 0.7348731294729993, + "grad_norm": 0.20936012268066406, + "learning_rate": 0.0001, + "loss": 1.854, + "step": 1506 + }, + { + "epoch": 0.7353610930383865, + "grad_norm": 0.17536798119544983, + "learning_rate": 0.0001, + "loss": 1.6205, + "step": 1507 + }, + { + "epoch": 0.7358490566037735, + "grad_norm": 0.1752844750881195, + "learning_rate": 0.0001, + "loss": 1.6902, + "step": 1508 + }, + { + "epoch": 0.7363370201691607, + "grad_norm": 0.17726869881153107, + "learning_rate": 0.0001, + "loss": 1.6562, + "step": 1509 + }, + { + "epoch": 0.7368249837345479, + "grad_norm": 0.17950955033302307, + "learning_rate": 0.0001, + "loss": 1.6322, + "step": 1510 + }, + { + "epoch": 0.7373129472999349, + "grad_norm": 0.18425075709819794, + "learning_rate": 0.0001, + "loss": 1.6205, + "step": 1511 + }, + { + "epoch": 0.7378009108653221, + "grad_norm": 0.18774688243865967, + "learning_rate": 0.0001, + "loss": 1.6976, + "step": 1512 + }, + { + "epoch": 0.7382888744307091, + "grad_norm": 0.18913634121418, + "learning_rate": 0.0001, + "loss": 1.6605, + "step": 1513 + }, + { + "epoch": 0.7387768379960963, + "grad_norm": 0.18106456100940704, + "learning_rate": 0.0001, + "loss": 1.7201, + "step": 1514 + }, + { + "epoch": 0.7392648015614834, + "grad_norm": 0.1875046044588089, + "learning_rate": 0.0001, + "loss": 1.7115, + "step": 1515 + }, + { + "epoch": 0.7397527651268705, + "grad_norm": 0.1848473697900772, + "learning_rate": 0.0001, + "loss": 1.6694, + "step": 1516 + }, + { + "epoch": 0.7402407286922577, + "grad_norm": 0.18459069728851318, + "learning_rate": 0.0001, + "loss": 1.6262, + "step": 1517 + }, + { + "epoch": 0.7407286922576448, + "grad_norm": 0.1830248087644577, + "learning_rate": 0.0001, + "loss": 1.6973, + "step": 1518 + }, + { + "epoch": 0.7412166558230319, + "grad_norm": 0.18021319806575775, + "learning_rate": 0.0001, + "loss": 1.7161, + "step": 1519 + }, + { + "epoch": 0.741704619388419, + "grad_norm": 0.1852198839187622, + "learning_rate": 0.0001, + "loss": 1.6411, + "step": 1520 + }, + { + "epoch": 0.7421925829538061, + "grad_norm": 0.18931609392166138, + "learning_rate": 0.0001, + "loss": 1.7385, + "step": 1521 + }, + { + "epoch": 0.7426805465191932, + "grad_norm": 0.18388409912586212, + "learning_rate": 0.0001, + "loss": 1.6473, + "step": 1522 + }, + { + "epoch": 0.7431685100845804, + "grad_norm": 0.19164806604385376, + "learning_rate": 0.0001, + "loss": 1.7588, + "step": 1523 + }, + { + "epoch": 0.7436564736499675, + "grad_norm": 0.19370153546333313, + "learning_rate": 0.0001, + "loss": 1.6054, + "step": 1524 + }, + { + "epoch": 0.7441444372153546, + "grad_norm": 0.17590291798114777, + "learning_rate": 0.0001, + "loss": 1.6625, + "step": 1525 + }, + { + "epoch": 0.7446324007807417, + "grad_norm": 0.18713389337062836, + "learning_rate": 0.0001, + "loss": 1.6697, + "step": 1526 + }, + { + "epoch": 0.7451203643461288, + "grad_norm": 0.18523730337619781, + "learning_rate": 0.0001, + "loss": 1.613, + "step": 1527 + }, + { + "epoch": 0.745608327911516, + "grad_norm": 0.1839686632156372, + "learning_rate": 0.0001, + "loss": 1.6729, + "step": 1528 + }, + { + "epoch": 0.746096291476903, + "grad_norm": 0.20028969645500183, + "learning_rate": 0.0001, + "loss": 1.6504, + "step": 1529 + }, + { + "epoch": 0.7465842550422902, + "grad_norm": 0.18220870196819305, + "learning_rate": 0.0001, + "loss": 1.6921, + "step": 1530 + }, + { + "epoch": 0.7470722186076773, + "grad_norm": 0.18175910413265228, + "learning_rate": 0.0001, + "loss": 1.6284, + "step": 1531 + }, + { + "epoch": 0.7475601821730644, + "grad_norm": 0.18016168475151062, + "learning_rate": 0.0001, + "loss": 1.5405, + "step": 1532 + }, + { + "epoch": 0.7480481457384516, + "grad_norm": 0.1960187703371048, + "learning_rate": 0.0001, + "loss": 1.6983, + "step": 1533 + }, + { + "epoch": 0.7485361093038386, + "grad_norm": 0.1788274049758911, + "learning_rate": 0.0001, + "loss": 1.6387, + "step": 1534 + }, + { + "epoch": 0.7490240728692258, + "grad_norm": 0.19441407918930054, + "learning_rate": 0.0001, + "loss": 1.6093, + "step": 1535 + }, + { + "epoch": 0.7495120364346128, + "grad_norm": 0.19135618209838867, + "learning_rate": 0.0001, + "loss": 1.6645, + "step": 1536 + }, + { + "epoch": 0.75, + "grad_norm": 0.1894136369228363, + "learning_rate": 0.0001, + "loss": 1.618, + "step": 1537 + }, + { + "epoch": 0.7504879635653872, + "grad_norm": 0.1781785488128662, + "learning_rate": 0.0001, + "loss": 1.6687, + "step": 1538 + }, + { + "epoch": 0.7509759271307742, + "grad_norm": 0.18362712860107422, + "learning_rate": 0.0001, + "loss": 1.7104, + "step": 1539 + }, + { + "epoch": 0.7514638906961614, + "grad_norm": 0.20387201011180878, + "learning_rate": 0.0001, + "loss": 1.7899, + "step": 1540 + }, + { + "epoch": 0.7519518542615484, + "grad_norm": 0.18107970058918, + "learning_rate": 0.0001, + "loss": 1.5952, + "step": 1541 + }, + { + "epoch": 0.7524398178269356, + "grad_norm": 0.18185783922672272, + "learning_rate": 0.0001, + "loss": 1.6294, + "step": 1542 + }, + { + "epoch": 0.7529277813923227, + "grad_norm": 0.17853549122810364, + "learning_rate": 0.0001, + "loss": 1.5997, + "step": 1543 + }, + { + "epoch": 0.7534157449577098, + "grad_norm": 0.17627951502799988, + "learning_rate": 0.0001, + "loss": 1.5845, + "step": 1544 + }, + { + "epoch": 0.753903708523097, + "grad_norm": 0.19762729108333588, + "learning_rate": 0.0001, + "loss": 1.6704, + "step": 1545 + }, + { + "epoch": 0.754391672088484, + "grad_norm": 0.20241263508796692, + "learning_rate": 0.0001, + "loss": 1.794, + "step": 1546 + }, + { + "epoch": 0.7548796356538712, + "grad_norm": 0.1798173040151596, + "learning_rate": 0.0001, + "loss": 1.6637, + "step": 1547 + }, + { + "epoch": 0.7553675992192583, + "grad_norm": 0.1928299218416214, + "learning_rate": 0.0001, + "loss": 1.7751, + "step": 1548 + }, + { + "epoch": 0.7558555627846454, + "grad_norm": 0.18737445771694183, + "learning_rate": 0.0001, + "loss": 1.76, + "step": 1549 + }, + { + "epoch": 0.7563435263500325, + "grad_norm": 0.1899656057357788, + "learning_rate": 0.0001, + "loss": 1.7796, + "step": 1550 + }, + { + "epoch": 0.7568314899154196, + "grad_norm": 0.18091318011283875, + "learning_rate": 0.0001, + "loss": 1.6277, + "step": 1551 + }, + { + "epoch": 0.7573194534808068, + "grad_norm": 0.18001140654087067, + "learning_rate": 0.0001, + "loss": 1.5968, + "step": 1552 + }, + { + "epoch": 0.7578074170461939, + "grad_norm": 0.17739026248455048, + "learning_rate": 0.0001, + "loss": 1.5413, + "step": 1553 + }, + { + "epoch": 0.758295380611581, + "grad_norm": 0.183801531791687, + "learning_rate": 0.0001, + "loss": 1.6924, + "step": 1554 + }, + { + "epoch": 0.7587833441769681, + "grad_norm": 0.19029074907302856, + "learning_rate": 0.0001, + "loss": 1.7264, + "step": 1555 + }, + { + "epoch": 0.7592713077423552, + "grad_norm": 0.18088550865650177, + "learning_rate": 0.0001, + "loss": 1.648, + "step": 1556 + }, + { + "epoch": 0.7597592713077423, + "grad_norm": 0.182144433259964, + "learning_rate": 0.0001, + "loss": 1.6971, + "step": 1557 + }, + { + "epoch": 0.7602472348731295, + "grad_norm": 0.1943800002336502, + "learning_rate": 0.0001, + "loss": 1.8083, + "step": 1558 + }, + { + "epoch": 0.7607351984385166, + "grad_norm": 0.19364763796329498, + "learning_rate": 0.0001, + "loss": 1.8159, + "step": 1559 + }, + { + "epoch": 0.7612231620039037, + "grad_norm": 0.19187410175800323, + "learning_rate": 0.0001, + "loss": 1.6442, + "step": 1560 + }, + { + "epoch": 0.7617111255692909, + "grad_norm": 0.1859988272190094, + "learning_rate": 0.0001, + "loss": 1.643, + "step": 1561 + }, + { + "epoch": 0.7621990891346779, + "grad_norm": 0.18581219017505646, + "learning_rate": 0.0001, + "loss": 1.6164, + "step": 1562 + }, + { + "epoch": 0.7626870527000651, + "grad_norm": 0.1905384510755539, + "learning_rate": 0.0001, + "loss": 1.6672, + "step": 1563 + }, + { + "epoch": 0.7631750162654521, + "grad_norm": 0.19391977787017822, + "learning_rate": 0.0001, + "loss": 1.7094, + "step": 1564 + }, + { + "epoch": 0.7636629798308393, + "grad_norm": 0.18060721457004547, + "learning_rate": 0.0001, + "loss": 1.5947, + "step": 1565 + }, + { + "epoch": 0.7641509433962265, + "grad_norm": 0.19189974665641785, + "learning_rate": 0.0001, + "loss": 1.6929, + "step": 1566 + }, + { + "epoch": 0.7646389069616135, + "grad_norm": 0.1866428405046463, + "learning_rate": 0.0001, + "loss": 1.6223, + "step": 1567 + }, + { + "epoch": 0.7651268705270007, + "grad_norm": 0.1927812546491623, + "learning_rate": 0.0001, + "loss": 1.6787, + "step": 1568 + }, + { + "epoch": 0.7656148340923877, + "grad_norm": 0.1899099200963974, + "learning_rate": 0.0001, + "loss": 1.7085, + "step": 1569 + }, + { + "epoch": 0.7661027976577749, + "grad_norm": 0.19770196080207825, + "learning_rate": 0.0001, + "loss": 1.6935, + "step": 1570 + }, + { + "epoch": 0.766590761223162, + "grad_norm": 0.1968637853860855, + "learning_rate": 0.0001, + "loss": 1.6972, + "step": 1571 + }, + { + "epoch": 0.7670787247885491, + "grad_norm": 0.19224700331687927, + "learning_rate": 0.0001, + "loss": 1.6879, + "step": 1572 + }, + { + "epoch": 0.7675666883539363, + "grad_norm": 0.18638695776462555, + "learning_rate": 0.0001, + "loss": 1.6704, + "step": 1573 + }, + { + "epoch": 0.7680546519193233, + "grad_norm": 0.18514102697372437, + "learning_rate": 0.0001, + "loss": 1.6404, + "step": 1574 + }, + { + "epoch": 0.7685426154847105, + "grad_norm": 0.1898544877767563, + "learning_rate": 0.0001, + "loss": 1.6941, + "step": 1575 + }, + { + "epoch": 0.7690305790500976, + "grad_norm": 0.1941719949245453, + "learning_rate": 0.0001, + "loss": 1.8447, + "step": 1576 + }, + { + "epoch": 0.7695185426154847, + "grad_norm": 0.19126634299755096, + "learning_rate": 0.0001, + "loss": 1.7318, + "step": 1577 + }, + { + "epoch": 0.7700065061808719, + "grad_norm": 0.2004380077123642, + "learning_rate": 0.0001, + "loss": 1.7452, + "step": 1578 + }, + { + "epoch": 0.7704944697462589, + "grad_norm": 0.18317647278308868, + "learning_rate": 0.0001, + "loss": 1.6582, + "step": 1579 + }, + { + "epoch": 0.7709824333116461, + "grad_norm": 0.17985877394676208, + "learning_rate": 0.0001, + "loss": 1.5736, + "step": 1580 + }, + { + "epoch": 0.7714703968770332, + "grad_norm": 0.19441930949687958, + "learning_rate": 0.0001, + "loss": 1.6716, + "step": 1581 + }, + { + "epoch": 0.7719583604424203, + "grad_norm": 0.18969953060150146, + "learning_rate": 0.0001, + "loss": 1.6254, + "step": 1582 + }, + { + "epoch": 0.7724463240078074, + "grad_norm": 0.19708946347236633, + "learning_rate": 0.0001, + "loss": 1.7267, + "step": 1583 + }, + { + "epoch": 0.7729342875731945, + "grad_norm": 0.17867428064346313, + "learning_rate": 0.0001, + "loss": 1.5539, + "step": 1584 + }, + { + "epoch": 0.7734222511385817, + "grad_norm": 0.1882658153772354, + "learning_rate": 0.0001, + "loss": 1.6716, + "step": 1585 + }, + { + "epoch": 0.7739102147039688, + "grad_norm": 0.19716131687164307, + "learning_rate": 0.0001, + "loss": 1.7124, + "step": 1586 + }, + { + "epoch": 0.7743981782693559, + "grad_norm": 0.1901741474866867, + "learning_rate": 0.0001, + "loss": 1.5683, + "step": 1587 + }, + { + "epoch": 0.774886141834743, + "grad_norm": 0.1997768133878708, + "learning_rate": 0.0001, + "loss": 1.7426, + "step": 1588 + }, + { + "epoch": 0.7753741054001301, + "grad_norm": 0.20431944727897644, + "learning_rate": 0.0001, + "loss": 1.6736, + "step": 1589 + }, + { + "epoch": 0.7758620689655172, + "grad_norm": 0.19220104813575745, + "learning_rate": 0.0001, + "loss": 1.613, + "step": 1590 + }, + { + "epoch": 0.7763500325309044, + "grad_norm": 0.21191276609897614, + "learning_rate": 0.0001, + "loss": 1.7514, + "step": 1591 + }, + { + "epoch": 0.7768379960962914, + "grad_norm": 0.1914515644311905, + "learning_rate": 0.0001, + "loss": 1.6661, + "step": 1592 + }, + { + "epoch": 0.7773259596616786, + "grad_norm": 0.20354917645454407, + "learning_rate": 0.0001, + "loss": 1.623, + "step": 1593 + }, + { + "epoch": 0.7778139232270657, + "grad_norm": 0.18612132966518402, + "learning_rate": 0.0001, + "loss": 1.654, + "step": 1594 + }, + { + "epoch": 0.7783018867924528, + "grad_norm": 0.18054398894309998, + "learning_rate": 0.0001, + "loss": 1.5979, + "step": 1595 + }, + { + "epoch": 0.77878985035784, + "grad_norm": 0.18446581065654755, + "learning_rate": 0.0001, + "loss": 1.6939, + "step": 1596 + }, + { + "epoch": 0.779277813923227, + "grad_norm": 0.19891038537025452, + "learning_rate": 0.0001, + "loss": 1.598, + "step": 1597 + }, + { + "epoch": 0.7797657774886142, + "grad_norm": 0.1845475286245346, + "learning_rate": 0.0001, + "loss": 1.6092, + "step": 1598 + }, + { + "epoch": 0.7802537410540012, + "grad_norm": 0.19764572381973267, + "learning_rate": 0.0001, + "loss": 1.7236, + "step": 1599 + }, + { + "epoch": 0.7807417046193884, + "grad_norm": 0.18992526829242706, + "learning_rate": 0.0001, + "loss": 1.659, + "step": 1600 + }, + { + "epoch": 0.7812296681847756, + "grad_norm": 0.19778114557266235, + "learning_rate": 0.0001, + "loss": 1.7526, + "step": 1601 + }, + { + "epoch": 0.7817176317501626, + "grad_norm": 0.18773706257343292, + "learning_rate": 0.0001, + "loss": 1.747, + "step": 1602 + }, + { + "epoch": 0.7822055953155498, + "grad_norm": 0.1825982630252838, + "learning_rate": 0.0001, + "loss": 1.6572, + "step": 1603 + }, + { + "epoch": 0.7826935588809368, + "grad_norm": 0.1806437224149704, + "learning_rate": 0.0001, + "loss": 1.6724, + "step": 1604 + }, + { + "epoch": 0.783181522446324, + "grad_norm": 0.20853163301944733, + "learning_rate": 0.0001, + "loss": 1.7719, + "step": 1605 + }, + { + "epoch": 0.7836694860117112, + "grad_norm": 0.17729510366916656, + "learning_rate": 0.0001, + "loss": 1.5767, + "step": 1606 + }, + { + "epoch": 0.7841574495770982, + "grad_norm": 0.1899496465921402, + "learning_rate": 0.0001, + "loss": 1.6542, + "step": 1607 + }, + { + "epoch": 0.7846454131424854, + "grad_norm": 0.1904434859752655, + "learning_rate": 0.0001, + "loss": 1.6901, + "step": 1608 + }, + { + "epoch": 0.7851333767078725, + "grad_norm": 0.19352108240127563, + "learning_rate": 0.0001, + "loss": 1.6535, + "step": 1609 + }, + { + "epoch": 0.7856213402732596, + "grad_norm": 0.18233756721019745, + "learning_rate": 0.0001, + "loss": 1.6878, + "step": 1610 + }, + { + "epoch": 0.7861093038386467, + "grad_norm": 0.17833392322063446, + "learning_rate": 0.0001, + "loss": 1.5601, + "step": 1611 + }, + { + "epoch": 0.7865972674040338, + "grad_norm": 0.18793350458145142, + "learning_rate": 0.0001, + "loss": 1.6703, + "step": 1612 + }, + { + "epoch": 0.787085230969421, + "grad_norm": 0.2092360109090805, + "learning_rate": 0.0001, + "loss": 1.7353, + "step": 1613 + }, + { + "epoch": 0.7875731945348081, + "grad_norm": 0.18878163397312164, + "learning_rate": 0.0001, + "loss": 1.6139, + "step": 1614 + }, + { + "epoch": 0.7880611581001952, + "grad_norm": 0.17977482080459595, + "learning_rate": 0.0001, + "loss": 1.5614, + "step": 1615 + }, + { + "epoch": 0.7885491216655823, + "grad_norm": 0.1935320347547531, + "learning_rate": 0.0001, + "loss": 1.6885, + "step": 1616 + }, + { + "epoch": 0.7890370852309694, + "grad_norm": 0.19334746897220612, + "learning_rate": 0.0001, + "loss": 1.7766, + "step": 1617 + }, + { + "epoch": 0.7895250487963565, + "grad_norm": 0.2163378894329071, + "learning_rate": 0.0001, + "loss": 1.7319, + "step": 1618 + }, + { + "epoch": 0.7900130123617437, + "grad_norm": 0.1957041621208191, + "learning_rate": 0.0001, + "loss": 1.7022, + "step": 1619 + }, + { + "epoch": 0.7905009759271308, + "grad_norm": 0.18318067491054535, + "learning_rate": 0.0001, + "loss": 1.5488, + "step": 1620 + }, + { + "epoch": 0.7909889394925179, + "grad_norm": 0.2046167552471161, + "learning_rate": 0.0001, + "loss": 1.5745, + "step": 1621 + }, + { + "epoch": 0.791476903057905, + "grad_norm": 0.1877659112215042, + "learning_rate": 0.0001, + "loss": 1.6708, + "step": 1622 + }, + { + "epoch": 0.7919648666232921, + "grad_norm": 0.21634037792682648, + "learning_rate": 0.0001, + "loss": 1.8028, + "step": 1623 + }, + { + "epoch": 0.7924528301886793, + "grad_norm": 0.22510355710983276, + "learning_rate": 0.0001, + "loss": 1.6316, + "step": 1624 + }, + { + "epoch": 0.7929407937540663, + "grad_norm": 0.21921491622924805, + "learning_rate": 0.0001, + "loss": 1.6614, + "step": 1625 + }, + { + "epoch": 0.7934287573194535, + "grad_norm": 0.1849048137664795, + "learning_rate": 0.0001, + "loss": 1.7025, + "step": 1626 + }, + { + "epoch": 0.7939167208848406, + "grad_norm": 0.20738188922405243, + "learning_rate": 0.0001, + "loss": 1.5814, + "step": 1627 + }, + { + "epoch": 0.7944046844502277, + "grad_norm": 0.227822944521904, + "learning_rate": 0.0001, + "loss": 1.6831, + "step": 1628 + }, + { + "epoch": 0.7948926480156149, + "grad_norm": 0.1745869219303131, + "learning_rate": 0.0001, + "loss": 1.5815, + "step": 1629 + }, + { + "epoch": 0.7953806115810019, + "grad_norm": 0.23134981095790863, + "learning_rate": 0.0001, + "loss": 1.6855, + "step": 1630 + }, + { + "epoch": 0.7958685751463891, + "grad_norm": 0.2177252322435379, + "learning_rate": 0.0001, + "loss": 1.6097, + "step": 1631 + }, + { + "epoch": 0.7963565387117761, + "grad_norm": 0.19471696019172668, + "learning_rate": 0.0001, + "loss": 1.7006, + "step": 1632 + }, + { + "epoch": 0.7968445022771633, + "grad_norm": 0.212845116853714, + "learning_rate": 0.0001, + "loss": 1.6917, + "step": 1633 + }, + { + "epoch": 0.7973324658425505, + "grad_norm": 0.22464518249034882, + "learning_rate": 0.0001, + "loss": 1.5826, + "step": 1634 + }, + { + "epoch": 0.7978204294079375, + "grad_norm": 0.185982346534729, + "learning_rate": 0.0001, + "loss": 1.7345, + "step": 1635 + }, + { + "epoch": 0.7983083929733247, + "grad_norm": 0.20566463470458984, + "learning_rate": 0.0001, + "loss": 1.6962, + "step": 1636 + }, + { + "epoch": 0.7987963565387117, + "grad_norm": 0.20042772591114044, + "learning_rate": 0.0001, + "loss": 1.6684, + "step": 1637 + }, + { + "epoch": 0.7992843201040989, + "grad_norm": 0.1849479079246521, + "learning_rate": 0.0001, + "loss": 1.7162, + "step": 1638 + }, + { + "epoch": 0.799772283669486, + "grad_norm": 0.199243426322937, + "learning_rate": 0.0001, + "loss": 1.8008, + "step": 1639 + }, + { + "epoch": 0.8002602472348731, + "grad_norm": 0.18501123785972595, + "learning_rate": 0.0001, + "loss": 1.7044, + "step": 1640 + }, + { + "epoch": 0.8007482108002603, + "grad_norm": 0.18791933357715607, + "learning_rate": 0.0001, + "loss": 1.6895, + "step": 1641 + }, + { + "epoch": 0.8012361743656473, + "grad_norm": 0.20649796724319458, + "learning_rate": 0.0001, + "loss": 1.7186, + "step": 1642 + }, + { + "epoch": 0.8017241379310345, + "grad_norm": 0.18353037536144257, + "learning_rate": 0.0001, + "loss": 1.5924, + "step": 1643 + }, + { + "epoch": 0.8022121014964216, + "grad_norm": 0.19343356788158417, + "learning_rate": 0.0001, + "loss": 1.7828, + "step": 1644 + }, + { + "epoch": 0.8027000650618087, + "grad_norm": 0.18914443254470825, + "learning_rate": 0.0001, + "loss": 1.736, + "step": 1645 + }, + { + "epoch": 0.8031880286271958, + "grad_norm": 0.19788146018981934, + "learning_rate": 0.0001, + "loss": 1.8165, + "step": 1646 + }, + { + "epoch": 0.8036759921925829, + "grad_norm": 0.19266889989376068, + "learning_rate": 0.0001, + "loss": 1.6921, + "step": 1647 + }, + { + "epoch": 0.8041639557579701, + "grad_norm": 0.1988595426082611, + "learning_rate": 0.0001, + "loss": 1.6117, + "step": 1648 + }, + { + "epoch": 0.8046519193233572, + "grad_norm": 0.1922290027141571, + "learning_rate": 0.0001, + "loss": 1.6717, + "step": 1649 + }, + { + "epoch": 0.8051398828887443, + "grad_norm": 0.1773114949464798, + "learning_rate": 0.0001, + "loss": 1.5806, + "step": 1650 + }, + { + "epoch": 0.8056278464541314, + "grad_norm": 0.18884365260601044, + "learning_rate": 0.0001, + "loss": 1.6146, + "step": 1651 + }, + { + "epoch": 0.8061158100195186, + "grad_norm": 0.1919168382883072, + "learning_rate": 0.0001, + "loss": 1.6228, + "step": 1652 + }, + { + "epoch": 0.8066037735849056, + "grad_norm": 0.19377237558364868, + "learning_rate": 0.0001, + "loss": 1.6811, + "step": 1653 + }, + { + "epoch": 0.8070917371502928, + "grad_norm": 0.20503416657447815, + "learning_rate": 0.0001, + "loss": 1.6838, + "step": 1654 + }, + { + "epoch": 0.8075797007156799, + "grad_norm": 0.1844811588525772, + "learning_rate": 0.0001, + "loss": 1.5669, + "step": 1655 + }, + { + "epoch": 0.808067664281067, + "grad_norm": 0.19252845644950867, + "learning_rate": 0.0001, + "loss": 1.7096, + "step": 1656 + }, + { + "epoch": 0.8085556278464542, + "grad_norm": 0.1933499425649643, + "learning_rate": 0.0001, + "loss": 1.6825, + "step": 1657 + }, + { + "epoch": 0.8090435914118412, + "grad_norm": 0.1752086579799652, + "learning_rate": 0.0001, + "loss": 1.6121, + "step": 1658 + }, + { + "epoch": 0.8095315549772284, + "grad_norm": 0.1772938370704651, + "learning_rate": 0.0001, + "loss": 1.7315, + "step": 1659 + }, + { + "epoch": 0.8100195185426154, + "grad_norm": 0.18318156898021698, + "learning_rate": 0.0001, + "loss": 1.7374, + "step": 1660 + }, + { + "epoch": 0.8105074821080026, + "grad_norm": 0.1868072748184204, + "learning_rate": 0.0001, + "loss": 1.6066, + "step": 1661 + }, + { + "epoch": 0.8109954456733898, + "grad_norm": 0.1911938190460205, + "learning_rate": 0.0001, + "loss": 1.74, + "step": 1662 + }, + { + "epoch": 0.8114834092387768, + "grad_norm": 0.19108733534812927, + "learning_rate": 0.0001, + "loss": 1.7353, + "step": 1663 + }, + { + "epoch": 0.811971372804164, + "grad_norm": 0.1893574446439743, + "learning_rate": 0.0001, + "loss": 1.6413, + "step": 1664 + }, + { + "epoch": 0.812459336369551, + "grad_norm": 0.18296094238758087, + "learning_rate": 0.0001, + "loss": 1.679, + "step": 1665 + }, + { + "epoch": 0.8129472999349382, + "grad_norm": 0.18871071934700012, + "learning_rate": 0.0001, + "loss": 1.6738, + "step": 1666 + }, + { + "epoch": 0.8134352635003254, + "grad_norm": 0.192779541015625, + "learning_rate": 0.0001, + "loss": 1.7312, + "step": 1667 + }, + { + "epoch": 0.8139232270657124, + "grad_norm": 0.19144397974014282, + "learning_rate": 0.0001, + "loss": 1.649, + "step": 1668 + }, + { + "epoch": 0.8144111906310996, + "grad_norm": 0.1832989603281021, + "learning_rate": 0.0001, + "loss": 1.6351, + "step": 1669 + }, + { + "epoch": 0.8148991541964866, + "grad_norm": 0.1771543025970459, + "learning_rate": 0.0001, + "loss": 1.547, + "step": 1670 + }, + { + "epoch": 0.8153871177618738, + "grad_norm": 0.2000136524438858, + "learning_rate": 0.0001, + "loss": 1.7604, + "step": 1671 + }, + { + "epoch": 0.8158750813272609, + "grad_norm": 0.17948202788829803, + "learning_rate": 0.0001, + "loss": 1.6072, + "step": 1672 + }, + { + "epoch": 0.816363044892648, + "grad_norm": 0.18728068470954895, + "learning_rate": 0.0001, + "loss": 1.6382, + "step": 1673 + }, + { + "epoch": 0.8168510084580352, + "grad_norm": 0.17753037810325623, + "learning_rate": 0.0001, + "loss": 1.58, + "step": 1674 + }, + { + "epoch": 0.8173389720234222, + "grad_norm": 0.19952118396759033, + "learning_rate": 0.0001, + "loss": 1.7201, + "step": 1675 + }, + { + "epoch": 0.8178269355888094, + "grad_norm": 0.18967780470848083, + "learning_rate": 0.0001, + "loss": 1.7637, + "step": 1676 + }, + { + "epoch": 0.8183148991541965, + "grad_norm": 0.18781159818172455, + "learning_rate": 0.0001, + "loss": 1.6385, + "step": 1677 + }, + { + "epoch": 0.8188028627195836, + "grad_norm": 0.18192321062088013, + "learning_rate": 0.0001, + "loss": 1.6262, + "step": 1678 + }, + { + "epoch": 0.8192908262849707, + "grad_norm": 0.1921571046113968, + "learning_rate": 0.0001, + "loss": 1.7492, + "step": 1679 + }, + { + "epoch": 0.8197787898503578, + "grad_norm": 0.19465389847755432, + "learning_rate": 0.0001, + "loss": 1.6658, + "step": 1680 + }, + { + "epoch": 0.820266753415745, + "grad_norm": 0.19218620657920837, + "learning_rate": 0.0001, + "loss": 1.6639, + "step": 1681 + }, + { + "epoch": 0.8207547169811321, + "grad_norm": 0.18591156601905823, + "learning_rate": 0.0001, + "loss": 1.5869, + "step": 1682 + }, + { + "epoch": 0.8212426805465192, + "grad_norm": 0.2010912001132965, + "learning_rate": 0.0001, + "loss": 1.7304, + "step": 1683 + }, + { + "epoch": 0.8217306441119063, + "grad_norm": 0.18728473782539368, + "learning_rate": 0.0001, + "loss": 1.6596, + "step": 1684 + }, + { + "epoch": 0.8222186076772934, + "grad_norm": 0.1863940954208374, + "learning_rate": 0.0001, + "loss": 1.5735, + "step": 1685 + }, + { + "epoch": 0.8227065712426805, + "grad_norm": 0.18404056131839752, + "learning_rate": 0.0001, + "loss": 1.5921, + "step": 1686 + }, + { + "epoch": 0.8231945348080677, + "grad_norm": 0.19402609765529633, + "learning_rate": 0.0001, + "loss": 1.6873, + "step": 1687 + }, + { + "epoch": 0.8236824983734548, + "grad_norm": 0.1880168318748474, + "learning_rate": 0.0001, + "loss": 1.6991, + "step": 1688 + }, + { + "epoch": 0.8241704619388419, + "grad_norm": 0.18674950301647186, + "learning_rate": 0.0001, + "loss": 1.7284, + "step": 1689 + }, + { + "epoch": 0.824658425504229, + "grad_norm": 0.18363825976848602, + "learning_rate": 0.0001, + "loss": 1.6617, + "step": 1690 + }, + { + "epoch": 0.8251463890696161, + "grad_norm": 0.19676996767520905, + "learning_rate": 0.0001, + "loss": 1.8167, + "step": 1691 + }, + { + "epoch": 0.8256343526350033, + "grad_norm": 0.17890886962413788, + "learning_rate": 0.0001, + "loss": 1.6515, + "step": 1692 + }, + { + "epoch": 0.8261223162003903, + "grad_norm": 0.19220395386219025, + "learning_rate": 0.0001, + "loss": 1.6924, + "step": 1693 + }, + { + "epoch": 0.8266102797657775, + "grad_norm": 0.17939774692058563, + "learning_rate": 0.0001, + "loss": 1.6658, + "step": 1694 + }, + { + "epoch": 0.8270982433311646, + "grad_norm": 0.18299368023872375, + "learning_rate": 0.0001, + "loss": 1.6109, + "step": 1695 + }, + { + "epoch": 0.8275862068965517, + "grad_norm": 0.19559091329574585, + "learning_rate": 0.0001, + "loss": 1.71, + "step": 1696 + }, + { + "epoch": 0.8280741704619389, + "grad_norm": 0.18175216019153595, + "learning_rate": 0.0001, + "loss": 1.579, + "step": 1697 + }, + { + "epoch": 0.8285621340273259, + "grad_norm": 0.17916333675384521, + "learning_rate": 0.0001, + "loss": 1.6226, + "step": 1698 + }, + { + "epoch": 0.8290500975927131, + "grad_norm": 0.19414590299129486, + "learning_rate": 0.0001, + "loss": 1.6539, + "step": 1699 + }, + { + "epoch": 0.8295380611581002, + "grad_norm": 0.190690279006958, + "learning_rate": 0.0001, + "loss": 1.6743, + "step": 1700 + }, + { + "epoch": 0.8300260247234873, + "grad_norm": 0.1775364726781845, + "learning_rate": 0.0001, + "loss": 1.6156, + "step": 1701 + }, + { + "epoch": 0.8305139882888745, + "grad_norm": 0.19147178530693054, + "learning_rate": 0.0001, + "loss": 1.6988, + "step": 1702 + }, + { + "epoch": 0.8310019518542615, + "grad_norm": 0.191775843501091, + "learning_rate": 0.0001, + "loss": 1.6263, + "step": 1703 + }, + { + "epoch": 0.8314899154196487, + "grad_norm": 0.18212559819221497, + "learning_rate": 0.0001, + "loss": 1.6186, + "step": 1704 + }, + { + "epoch": 0.8319778789850358, + "grad_norm": 0.1973716914653778, + "learning_rate": 0.0001, + "loss": 1.6681, + "step": 1705 + }, + { + "epoch": 0.8324658425504229, + "grad_norm": 0.19240307807922363, + "learning_rate": 0.0001, + "loss": 1.6565, + "step": 1706 + }, + { + "epoch": 0.83295380611581, + "grad_norm": 0.18731878697872162, + "learning_rate": 0.0001, + "loss": 1.7008, + "step": 1707 + }, + { + "epoch": 0.8334417696811971, + "grad_norm": 0.18686629831790924, + "learning_rate": 0.0001, + "loss": 1.6936, + "step": 1708 + }, + { + "epoch": 0.8339297332465843, + "grad_norm": 0.19418823719024658, + "learning_rate": 0.0001, + "loss": 1.6749, + "step": 1709 + }, + { + "epoch": 0.8344176968119714, + "grad_norm": 0.19065696001052856, + "learning_rate": 0.0001, + "loss": 1.5825, + "step": 1710 + }, + { + "epoch": 0.8349056603773585, + "grad_norm": 0.19086980819702148, + "learning_rate": 0.0001, + "loss": 1.6586, + "step": 1711 + }, + { + "epoch": 0.8353936239427456, + "grad_norm": 0.18461991846561432, + "learning_rate": 0.0001, + "loss": 1.6877, + "step": 1712 + }, + { + "epoch": 0.8358815875081327, + "grad_norm": 0.18750816583633423, + "learning_rate": 0.0001, + "loss": 1.5869, + "step": 1713 + }, + { + "epoch": 0.8363695510735198, + "grad_norm": 0.2001444548368454, + "learning_rate": 0.0001, + "loss": 1.7659, + "step": 1714 + }, + { + "epoch": 0.836857514638907, + "grad_norm": 0.195534348487854, + "learning_rate": 0.0001, + "loss": 1.6863, + "step": 1715 + }, + { + "epoch": 0.8373454782042941, + "grad_norm": 0.18564340472221375, + "learning_rate": 0.0001, + "loss": 1.6825, + "step": 1716 + }, + { + "epoch": 0.8378334417696812, + "grad_norm": 0.1830897182226181, + "learning_rate": 0.0001, + "loss": 1.5911, + "step": 1717 + }, + { + "epoch": 0.8383214053350683, + "grad_norm": 0.18405020236968994, + "learning_rate": 0.0001, + "loss": 1.6177, + "step": 1718 + }, + { + "epoch": 0.8388093689004554, + "grad_norm": 0.1829666793346405, + "learning_rate": 0.0001, + "loss": 1.5921, + "step": 1719 + }, + { + "epoch": 0.8392973324658426, + "grad_norm": 0.19332802295684814, + "learning_rate": 0.0001, + "loss": 1.6575, + "step": 1720 + }, + { + "epoch": 0.8397852960312296, + "grad_norm": 0.19402368366718292, + "learning_rate": 0.0001, + "loss": 1.6052, + "step": 1721 + }, + { + "epoch": 0.8402732595966168, + "grad_norm": 0.1924324482679367, + "learning_rate": 0.0001, + "loss": 1.7398, + "step": 1722 + }, + { + "epoch": 0.8407612231620039, + "grad_norm": 0.1912868320941925, + "learning_rate": 0.0001, + "loss": 1.6186, + "step": 1723 + }, + { + "epoch": 0.841249186727391, + "grad_norm": 0.19338296353816986, + "learning_rate": 0.0001, + "loss": 1.7595, + "step": 1724 + }, + { + "epoch": 0.8417371502927782, + "grad_norm": 0.19593428075313568, + "learning_rate": 0.0001, + "loss": 1.754, + "step": 1725 + }, + { + "epoch": 0.8422251138581652, + "grad_norm": 0.18348434567451477, + "learning_rate": 0.0001, + "loss": 1.6831, + "step": 1726 + }, + { + "epoch": 0.8427130774235524, + "grad_norm": 0.18937769532203674, + "learning_rate": 0.0001, + "loss": 1.6921, + "step": 1727 + }, + { + "epoch": 0.8432010409889394, + "grad_norm": 0.1873343288898468, + "learning_rate": 0.0001, + "loss": 1.6638, + "step": 1728 + }, + { + "epoch": 0.8436890045543266, + "grad_norm": 0.18602168560028076, + "learning_rate": 0.0001, + "loss": 1.6805, + "step": 1729 + }, + { + "epoch": 0.8441769681197138, + "grad_norm": 0.1878175288438797, + "learning_rate": 0.0001, + "loss": 1.6887, + "step": 1730 + }, + { + "epoch": 0.8446649316851008, + "grad_norm": 0.18829582631587982, + "learning_rate": 0.0001, + "loss": 1.6766, + "step": 1731 + }, + { + "epoch": 0.845152895250488, + "grad_norm": 0.19731226563453674, + "learning_rate": 0.0001, + "loss": 1.7651, + "step": 1732 + }, + { + "epoch": 0.845640858815875, + "grad_norm": 0.1821230798959732, + "learning_rate": 0.0001, + "loss": 1.6899, + "step": 1733 + }, + { + "epoch": 0.8461288223812622, + "grad_norm": 0.19442221522331238, + "learning_rate": 0.0001, + "loss": 1.6838, + "step": 1734 + }, + { + "epoch": 0.8466167859466494, + "grad_norm": 0.18161530792713165, + "learning_rate": 0.0001, + "loss": 1.5891, + "step": 1735 + }, + { + "epoch": 0.8471047495120364, + "grad_norm": 0.18692250549793243, + "learning_rate": 0.0001, + "loss": 1.6704, + "step": 1736 + }, + { + "epoch": 0.8475927130774236, + "grad_norm": 0.1943616271018982, + "learning_rate": 0.0001, + "loss": 1.5322, + "step": 1737 + }, + { + "epoch": 0.8480806766428106, + "grad_norm": 0.1985720843076706, + "learning_rate": 0.0001, + "loss": 1.6677, + "step": 1738 + }, + { + "epoch": 0.8485686402081978, + "grad_norm": 0.18838457763195038, + "learning_rate": 0.0001, + "loss": 1.6649, + "step": 1739 + }, + { + "epoch": 0.8490566037735849, + "grad_norm": 0.19150035083293915, + "learning_rate": 0.0001, + "loss": 1.6901, + "step": 1740 + }, + { + "epoch": 0.849544567338972, + "grad_norm": 0.18909554183483124, + "learning_rate": 0.0001, + "loss": 1.6245, + "step": 1741 + }, + { + "epoch": 0.8500325309043592, + "grad_norm": 0.19308097660541534, + "learning_rate": 0.0001, + "loss": 1.7253, + "step": 1742 + }, + { + "epoch": 0.8505204944697463, + "grad_norm": 0.19270172715187073, + "learning_rate": 0.0001, + "loss": 1.7028, + "step": 1743 + }, + { + "epoch": 0.8510084580351334, + "grad_norm": 0.1880318969488144, + "learning_rate": 0.0001, + "loss": 1.6327, + "step": 1744 + }, + { + "epoch": 0.8514964216005205, + "grad_norm": 0.1749912053346634, + "learning_rate": 0.0001, + "loss": 1.6776, + "step": 1745 + }, + { + "epoch": 0.8519843851659076, + "grad_norm": 0.18535734713077545, + "learning_rate": 0.0001, + "loss": 1.7433, + "step": 1746 + }, + { + "epoch": 0.8524723487312947, + "grad_norm": 0.18681146204471588, + "learning_rate": 0.0001, + "loss": 1.615, + "step": 1747 + }, + { + "epoch": 0.8529603122966819, + "grad_norm": 0.21550102531909943, + "learning_rate": 0.0001, + "loss": 1.6953, + "step": 1748 + }, + { + "epoch": 0.853448275862069, + "grad_norm": 0.1936476081609726, + "learning_rate": 0.0001, + "loss": 1.6796, + "step": 1749 + }, + { + "epoch": 0.8539362394274561, + "grad_norm": 0.18475957214832306, + "learning_rate": 0.0001, + "loss": 1.7073, + "step": 1750 + }, + { + "epoch": 0.8544242029928432, + "grad_norm": 0.1896783858537674, + "learning_rate": 0.0001, + "loss": 1.6556, + "step": 1751 + }, + { + "epoch": 0.8549121665582303, + "grad_norm": 0.1936115324497223, + "learning_rate": 0.0001, + "loss": 1.6614, + "step": 1752 + }, + { + "epoch": 0.8554001301236175, + "grad_norm": 0.18598994612693787, + "learning_rate": 0.0001, + "loss": 1.708, + "step": 1753 + }, + { + "epoch": 0.8558880936890045, + "grad_norm": 0.1906435489654541, + "learning_rate": 0.0001, + "loss": 1.6371, + "step": 1754 + }, + { + "epoch": 0.8563760572543917, + "grad_norm": 0.199356809258461, + "learning_rate": 0.0001, + "loss": 1.55, + "step": 1755 + }, + { + "epoch": 0.8568640208197787, + "grad_norm": 0.1791318953037262, + "learning_rate": 0.0001, + "loss": 1.6396, + "step": 1756 + }, + { + "epoch": 0.8573519843851659, + "grad_norm": 0.2023209035396576, + "learning_rate": 0.0001, + "loss": 1.7851, + "step": 1757 + }, + { + "epoch": 0.8578399479505531, + "grad_norm": 0.18916785717010498, + "learning_rate": 0.0001, + "loss": 1.6051, + "step": 1758 + }, + { + "epoch": 0.8583279115159401, + "grad_norm": 0.19388586282730103, + "learning_rate": 0.0001, + "loss": 1.736, + "step": 1759 + }, + { + "epoch": 0.8588158750813273, + "grad_norm": 0.1888210028409958, + "learning_rate": 0.0001, + "loss": 1.6526, + "step": 1760 + }, + { + "epoch": 0.8593038386467143, + "grad_norm": 0.18460845947265625, + "learning_rate": 0.0001, + "loss": 1.657, + "step": 1761 + }, + { + "epoch": 0.8597918022121015, + "grad_norm": 0.18918482959270477, + "learning_rate": 0.0001, + "loss": 1.6517, + "step": 1762 + }, + { + "epoch": 0.8602797657774887, + "grad_norm": 0.2215425670146942, + "learning_rate": 0.0001, + "loss": 1.8116, + "step": 1763 + }, + { + "epoch": 0.8607677293428757, + "grad_norm": 0.1845925748348236, + "learning_rate": 0.0001, + "loss": 1.5914, + "step": 1764 + }, + { + "epoch": 0.8612556929082629, + "grad_norm": 0.1800438016653061, + "learning_rate": 0.0001, + "loss": 1.6074, + "step": 1765 + }, + { + "epoch": 0.8617436564736499, + "grad_norm": 0.1770068109035492, + "learning_rate": 0.0001, + "loss": 1.6869, + "step": 1766 + }, + { + "epoch": 0.8622316200390371, + "grad_norm": 0.19672898948192596, + "learning_rate": 0.0001, + "loss": 1.5741, + "step": 1767 + }, + { + "epoch": 0.8627195836044242, + "grad_norm": 0.1833876222372055, + "learning_rate": 0.0001, + "loss": 1.6364, + "step": 1768 + }, + { + "epoch": 0.8632075471698113, + "grad_norm": 0.18901808559894562, + "learning_rate": 0.0001, + "loss": 1.7703, + "step": 1769 + }, + { + "epoch": 0.8636955107351985, + "grad_norm": 0.18848566710948944, + "learning_rate": 0.0001, + "loss": 1.7137, + "step": 1770 + }, + { + "epoch": 0.8641834743005855, + "grad_norm": 0.1907043308019638, + "learning_rate": 0.0001, + "loss": 1.6075, + "step": 1771 + }, + { + "epoch": 0.8646714378659727, + "grad_norm": 0.20296499133110046, + "learning_rate": 0.0001, + "loss": 1.7779, + "step": 1772 + }, + { + "epoch": 0.8651594014313598, + "grad_norm": 0.18933746218681335, + "learning_rate": 0.0001, + "loss": 1.7367, + "step": 1773 + }, + { + "epoch": 0.8656473649967469, + "grad_norm": 0.19736367464065552, + "learning_rate": 0.0001, + "loss": 1.7314, + "step": 1774 + }, + { + "epoch": 0.866135328562134, + "grad_norm": 0.18523859977722168, + "learning_rate": 0.0001, + "loss": 1.6182, + "step": 1775 + }, + { + "epoch": 0.8666232921275211, + "grad_norm": 0.200862318277359, + "learning_rate": 0.0001, + "loss": 1.72, + "step": 1776 + }, + { + "epoch": 0.8671112556929083, + "grad_norm": 0.18352118134498596, + "learning_rate": 0.0001, + "loss": 1.5903, + "step": 1777 + }, + { + "epoch": 0.8675992192582954, + "grad_norm": 0.18788601458072662, + "learning_rate": 0.0001, + "loss": 1.6115, + "step": 1778 + }, + { + "epoch": 0.8680871828236825, + "grad_norm": 0.2129855901002884, + "learning_rate": 0.0001, + "loss": 1.6931, + "step": 1779 + }, + { + "epoch": 0.8685751463890696, + "grad_norm": 0.18848247826099396, + "learning_rate": 0.0001, + "loss": 1.7186, + "step": 1780 + }, + { + "epoch": 0.8690631099544567, + "grad_norm": 0.18020255863666534, + "learning_rate": 0.0001, + "loss": 1.5447, + "step": 1781 + }, + { + "epoch": 0.8695510735198438, + "grad_norm": 0.19755859673023224, + "learning_rate": 0.0001, + "loss": 1.6055, + "step": 1782 + }, + { + "epoch": 0.870039037085231, + "grad_norm": 0.19791275262832642, + "learning_rate": 0.0001, + "loss": 1.681, + "step": 1783 + }, + { + "epoch": 0.870527000650618, + "grad_norm": 0.194001704454422, + "learning_rate": 0.0001, + "loss": 1.7187, + "step": 1784 + }, + { + "epoch": 0.8710149642160052, + "grad_norm": 0.19635316729545593, + "learning_rate": 0.0001, + "loss": 1.6656, + "step": 1785 + }, + { + "epoch": 0.8715029277813924, + "grad_norm": 0.18945664167404175, + "learning_rate": 0.0001, + "loss": 1.6149, + "step": 1786 + }, + { + "epoch": 0.8719908913467794, + "grad_norm": 0.19505362212657928, + "learning_rate": 0.0001, + "loss": 1.7541, + "step": 1787 + }, + { + "epoch": 0.8724788549121666, + "grad_norm": 0.2048032581806183, + "learning_rate": 0.0001, + "loss": 1.7539, + "step": 1788 + }, + { + "epoch": 0.8729668184775536, + "grad_norm": 0.1928759068250656, + "learning_rate": 0.0001, + "loss": 1.6836, + "step": 1789 + }, + { + "epoch": 0.8734547820429408, + "grad_norm": 0.1996418982744217, + "learning_rate": 0.0001, + "loss": 1.5969, + "step": 1790 + }, + { + "epoch": 0.873942745608328, + "grad_norm": 0.1895206868648529, + "learning_rate": 0.0001, + "loss": 1.6577, + "step": 1791 + }, + { + "epoch": 0.874430709173715, + "grad_norm": 0.19562388956546783, + "learning_rate": 0.0001, + "loss": 1.6688, + "step": 1792 + }, + { + "epoch": 0.8749186727391022, + "grad_norm": 0.1907225251197815, + "learning_rate": 0.0001, + "loss": 1.6491, + "step": 1793 + }, + { + "epoch": 0.8754066363044892, + "grad_norm": 0.1887628734111786, + "learning_rate": 0.0001, + "loss": 1.657, + "step": 1794 + }, + { + "epoch": 0.8758945998698764, + "grad_norm": 0.1809932142496109, + "learning_rate": 0.0001, + "loss": 1.5484, + "step": 1795 + }, + { + "epoch": 0.8763825634352636, + "grad_norm": 0.18477503955364227, + "learning_rate": 0.0001, + "loss": 1.5783, + "step": 1796 + }, + { + "epoch": 0.8768705270006506, + "grad_norm": 0.2000272572040558, + "learning_rate": 0.0001, + "loss": 1.7027, + "step": 1797 + }, + { + "epoch": 0.8773584905660378, + "grad_norm": 0.18049073219299316, + "learning_rate": 0.0001, + "loss": 1.6178, + "step": 1798 + }, + { + "epoch": 0.8778464541314248, + "grad_norm": 0.19112178683280945, + "learning_rate": 0.0001, + "loss": 1.7403, + "step": 1799 + }, + { + "epoch": 0.878334417696812, + "grad_norm": 0.1985938549041748, + "learning_rate": 0.0001, + "loss": 1.5708, + "step": 1800 + }, + { + "epoch": 0.8788223812621991, + "grad_norm": 0.1869334578514099, + "learning_rate": 0.0001, + "loss": 1.7111, + "step": 1801 + }, + { + "epoch": 0.8793103448275862, + "grad_norm": 0.20291414856910706, + "learning_rate": 0.0001, + "loss": 1.753, + "step": 1802 + }, + { + "epoch": 0.8797983083929733, + "grad_norm": 0.19386352598667145, + "learning_rate": 0.0001, + "loss": 1.6402, + "step": 1803 + }, + { + "epoch": 0.8802862719583604, + "grad_norm": 0.18778564035892487, + "learning_rate": 0.0001, + "loss": 1.6885, + "step": 1804 + }, + { + "epoch": 0.8807742355237476, + "grad_norm": 0.19215920567512512, + "learning_rate": 0.0001, + "loss": 1.6659, + "step": 1805 + }, + { + "epoch": 0.8812621990891347, + "grad_norm": 0.2166108340024948, + "learning_rate": 0.0001, + "loss": 1.6867, + "step": 1806 + }, + { + "epoch": 0.8817501626545218, + "grad_norm": 0.18133436143398285, + "learning_rate": 0.0001, + "loss": 1.5926, + "step": 1807 + }, + { + "epoch": 0.8822381262199089, + "grad_norm": 0.18868204951286316, + "learning_rate": 0.0001, + "loss": 1.6172, + "step": 1808 + }, + { + "epoch": 0.882726089785296, + "grad_norm": 0.20519724488258362, + "learning_rate": 0.0001, + "loss": 1.7143, + "step": 1809 + }, + { + "epoch": 0.8832140533506831, + "grad_norm": 0.190599262714386, + "learning_rate": 0.0001, + "loss": 1.7854, + "step": 1810 + }, + { + "epoch": 0.8837020169160703, + "grad_norm": 0.1950819492340088, + "learning_rate": 0.0001, + "loss": 1.5849, + "step": 1811 + }, + { + "epoch": 0.8841899804814574, + "grad_norm": 0.1758696287870407, + "learning_rate": 0.0001, + "loss": 1.5769, + "step": 1812 + }, + { + "epoch": 0.8846779440468445, + "grad_norm": 0.19683918356895447, + "learning_rate": 0.0001, + "loss": 1.7032, + "step": 1813 + }, + { + "epoch": 0.8851659076122316, + "grad_norm": 0.17837479710578918, + "learning_rate": 0.0001, + "loss": 1.5567, + "step": 1814 + }, + { + "epoch": 0.8856538711776187, + "grad_norm": 0.19034849107265472, + "learning_rate": 0.0001, + "loss": 1.7274, + "step": 1815 + }, + { + "epoch": 0.8861418347430059, + "grad_norm": 0.19284600019454956, + "learning_rate": 0.0001, + "loss": 1.6933, + "step": 1816 + }, + { + "epoch": 0.886629798308393, + "grad_norm": 0.18302425742149353, + "learning_rate": 0.0001, + "loss": 1.6091, + "step": 1817 + }, + { + "epoch": 0.8871177618737801, + "grad_norm": 0.1840258687734604, + "learning_rate": 0.0001, + "loss": 1.6178, + "step": 1818 + }, + { + "epoch": 0.8876057254391672, + "grad_norm": 0.19161662459373474, + "learning_rate": 0.0001, + "loss": 1.7768, + "step": 1819 + }, + { + "epoch": 0.8880936890045543, + "grad_norm": 0.19132988154888153, + "learning_rate": 0.0001, + "loss": 1.6508, + "step": 1820 + }, + { + "epoch": 0.8885816525699415, + "grad_norm": 0.19872814416885376, + "learning_rate": 0.0001, + "loss": 1.756, + "step": 1821 + }, + { + "epoch": 0.8890696161353285, + "grad_norm": 0.19945646822452545, + "learning_rate": 0.0001, + "loss": 1.6164, + "step": 1822 + }, + { + "epoch": 0.8895575797007157, + "grad_norm": 0.1971243917942047, + "learning_rate": 0.0001, + "loss": 1.718, + "step": 1823 + }, + { + "epoch": 0.8900455432661027, + "grad_norm": 0.1854260414838791, + "learning_rate": 0.0001, + "loss": 1.6402, + "step": 1824 + }, + { + "epoch": 0.8905335068314899, + "grad_norm": 0.19041430950164795, + "learning_rate": 0.0001, + "loss": 1.5898, + "step": 1825 + }, + { + "epoch": 0.8910214703968771, + "grad_norm": 0.20725248754024506, + "learning_rate": 0.0001, + "loss": 1.6835, + "step": 1826 + }, + { + "epoch": 0.8915094339622641, + "grad_norm": 0.19040203094482422, + "learning_rate": 0.0001, + "loss": 1.7338, + "step": 1827 + }, + { + "epoch": 0.8919973975276513, + "grad_norm": 0.19236230850219727, + "learning_rate": 0.0001, + "loss": 1.6475, + "step": 1828 + }, + { + "epoch": 0.8924853610930383, + "grad_norm": 0.19737380743026733, + "learning_rate": 0.0001, + "loss": 1.6187, + "step": 1829 + }, + { + "epoch": 0.8929733246584255, + "grad_norm": 0.1852233111858368, + "learning_rate": 0.0001, + "loss": 1.5997, + "step": 1830 + }, + { + "epoch": 0.8934612882238127, + "grad_norm": 0.19599072635173798, + "learning_rate": 0.0001, + "loss": 1.7316, + "step": 1831 + }, + { + "epoch": 0.8939492517891997, + "grad_norm": 0.20338813960552216, + "learning_rate": 0.0001, + "loss": 1.7148, + "step": 1832 + }, + { + "epoch": 0.8944372153545869, + "grad_norm": 0.19061818718910217, + "learning_rate": 0.0001, + "loss": 1.6266, + "step": 1833 + }, + { + "epoch": 0.894925178919974, + "grad_norm": 0.1853945404291153, + "learning_rate": 0.0001, + "loss": 1.6853, + "step": 1834 + }, + { + "epoch": 0.8954131424853611, + "grad_norm": 0.1979200690984726, + "learning_rate": 0.0001, + "loss": 1.6889, + "step": 1835 + }, + { + "epoch": 0.8959011060507482, + "grad_norm": 0.1888495832681656, + "learning_rate": 0.0001, + "loss": 1.6354, + "step": 1836 + }, + { + "epoch": 0.8963890696161353, + "grad_norm": 0.1901407241821289, + "learning_rate": 0.0001, + "loss": 1.642, + "step": 1837 + }, + { + "epoch": 0.8968770331815225, + "grad_norm": 0.20139391720294952, + "learning_rate": 0.0001, + "loss": 1.7404, + "step": 1838 + }, + { + "epoch": 0.8973649967469096, + "grad_norm": 0.18233637511730194, + "learning_rate": 0.0001, + "loss": 1.6359, + "step": 1839 + }, + { + "epoch": 0.8978529603122967, + "grad_norm": 0.18822279572486877, + "learning_rate": 0.0001, + "loss": 1.6719, + "step": 1840 + }, + { + "epoch": 0.8983409238776838, + "grad_norm": 0.18992972373962402, + "learning_rate": 0.0001, + "loss": 1.6534, + "step": 1841 + }, + { + "epoch": 0.8988288874430709, + "grad_norm": 0.18185953795909882, + "learning_rate": 0.0001, + "loss": 1.5577, + "step": 1842 + }, + { + "epoch": 0.899316851008458, + "grad_norm": 0.18909883499145508, + "learning_rate": 0.0001, + "loss": 1.755, + "step": 1843 + }, + { + "epoch": 0.8998048145738452, + "grad_norm": 0.1953660547733307, + "learning_rate": 0.0001, + "loss": 1.6148, + "step": 1844 + }, + { + "epoch": 0.9002927781392323, + "grad_norm": 0.19910429418087006, + "learning_rate": 0.0001, + "loss": 1.6864, + "step": 1845 + }, + { + "epoch": 0.9007807417046194, + "grad_norm": 0.19126242399215698, + "learning_rate": 0.0001, + "loss": 1.6264, + "step": 1846 + }, + { + "epoch": 0.9012687052700065, + "grad_norm": 0.18766577541828156, + "learning_rate": 0.0001, + "loss": 1.7142, + "step": 1847 + }, + { + "epoch": 0.9017566688353936, + "grad_norm": 0.18595442175865173, + "learning_rate": 0.0001, + "loss": 1.6663, + "step": 1848 + }, + { + "epoch": 0.9022446324007808, + "grad_norm": 0.1945996880531311, + "learning_rate": 0.0001, + "loss": 1.7243, + "step": 1849 + }, + { + "epoch": 0.9027325959661678, + "grad_norm": 0.1816486269235611, + "learning_rate": 0.0001, + "loss": 1.632, + "step": 1850 + }, + { + "epoch": 0.903220559531555, + "grad_norm": 0.18732072412967682, + "learning_rate": 0.0001, + "loss": 1.5309, + "step": 1851 + }, + { + "epoch": 0.903708523096942, + "grad_norm": 0.18456104397773743, + "learning_rate": 0.0001, + "loss": 1.6329, + "step": 1852 + }, + { + "epoch": 0.9041964866623292, + "grad_norm": 0.1911781132221222, + "learning_rate": 0.0001, + "loss": 1.5973, + "step": 1853 + }, + { + "epoch": 0.9046844502277164, + "grad_norm": 0.19040006399154663, + "learning_rate": 0.0001, + "loss": 1.6354, + "step": 1854 + }, + { + "epoch": 0.9051724137931034, + "grad_norm": 0.17650040984153748, + "learning_rate": 0.0001, + "loss": 1.5285, + "step": 1855 + }, + { + "epoch": 0.9056603773584906, + "grad_norm": 0.19496093690395355, + "learning_rate": 0.0001, + "loss": 1.7319, + "step": 1856 + }, + { + "epoch": 0.9061483409238776, + "grad_norm": 0.18969886004924774, + "learning_rate": 0.0001, + "loss": 1.591, + "step": 1857 + }, + { + "epoch": 0.9066363044892648, + "grad_norm": 0.1851990967988968, + "learning_rate": 0.0001, + "loss": 1.6195, + "step": 1858 + }, + { + "epoch": 0.907124268054652, + "grad_norm": 0.2054198980331421, + "learning_rate": 0.0001, + "loss": 1.779, + "step": 1859 + }, + { + "epoch": 0.907612231620039, + "grad_norm": 0.19475503265857697, + "learning_rate": 0.0001, + "loss": 1.6826, + "step": 1860 + }, + { + "epoch": 0.9081001951854262, + "grad_norm": 0.18214543163776398, + "learning_rate": 0.0001, + "loss": 1.6816, + "step": 1861 + }, + { + "epoch": 0.9085881587508132, + "grad_norm": 0.18886341154575348, + "learning_rate": 0.0001, + "loss": 1.5819, + "step": 1862 + }, + { + "epoch": 0.9090761223162004, + "grad_norm": 0.17963244020938873, + "learning_rate": 0.0001, + "loss": 1.6782, + "step": 1863 + }, + { + "epoch": 0.9095640858815875, + "grad_norm": 0.18377065658569336, + "learning_rate": 0.0001, + "loss": 1.6856, + "step": 1864 + }, + { + "epoch": 0.9100520494469746, + "grad_norm": 0.2315450757741928, + "learning_rate": 0.0001, + "loss": 1.6973, + "step": 1865 + }, + { + "epoch": 0.9105400130123618, + "grad_norm": 0.19719429314136505, + "learning_rate": 0.0001, + "loss": 1.7254, + "step": 1866 + }, + { + "epoch": 0.9110279765777488, + "grad_norm": 0.1869468241930008, + "learning_rate": 0.0001, + "loss": 1.7119, + "step": 1867 + }, + { + "epoch": 0.911515940143136, + "grad_norm": 0.19327637553215027, + "learning_rate": 0.0001, + "loss": 1.6643, + "step": 1868 + }, + { + "epoch": 0.9120039037085231, + "grad_norm": 0.1934322863817215, + "learning_rate": 0.0001, + "loss": 1.7806, + "step": 1869 + }, + { + "epoch": 0.9124918672739102, + "grad_norm": 0.1793750673532486, + "learning_rate": 0.0001, + "loss": 1.5899, + "step": 1870 + }, + { + "epoch": 0.9129798308392973, + "grad_norm": 0.1910194307565689, + "learning_rate": 0.0001, + "loss": 1.676, + "step": 1871 + }, + { + "epoch": 0.9134677944046844, + "grad_norm": 0.19140726327896118, + "learning_rate": 0.0001, + "loss": 1.609, + "step": 1872 + }, + { + "epoch": 0.9139557579700716, + "grad_norm": 0.18246062099933624, + "learning_rate": 0.0001, + "loss": 1.5884, + "step": 1873 + }, + { + "epoch": 0.9144437215354587, + "grad_norm": 0.19457212090492249, + "learning_rate": 0.0001, + "loss": 1.7357, + "step": 1874 + }, + { + "epoch": 0.9149316851008458, + "grad_norm": 0.1949836015701294, + "learning_rate": 0.0001, + "loss": 1.805, + "step": 1875 + }, + { + "epoch": 0.9154196486662329, + "grad_norm": 0.18788614869117737, + "learning_rate": 0.0001, + "loss": 1.6812, + "step": 1876 + }, + { + "epoch": 0.9159076122316201, + "grad_norm": 0.1801442950963974, + "learning_rate": 0.0001, + "loss": 1.6334, + "step": 1877 + }, + { + "epoch": 0.9163955757970071, + "grad_norm": 0.18876299262046814, + "learning_rate": 0.0001, + "loss": 1.7086, + "step": 1878 + }, + { + "epoch": 0.9168835393623943, + "grad_norm": 0.20244908332824707, + "learning_rate": 0.0001, + "loss": 1.5959, + "step": 1879 + }, + { + "epoch": 0.9173715029277814, + "grad_norm": 0.1789156198501587, + "learning_rate": 0.0001, + "loss": 1.5349, + "step": 1880 + }, + { + "epoch": 0.9178594664931685, + "grad_norm": 0.2068023681640625, + "learning_rate": 0.0001, + "loss": 1.7403, + "step": 1881 + }, + { + "epoch": 0.9183474300585557, + "grad_norm": 0.18543741106987, + "learning_rate": 0.0001, + "loss": 1.6918, + "step": 1882 + }, + { + "epoch": 0.9188353936239427, + "grad_norm": 0.18919646739959717, + "learning_rate": 0.0001, + "loss": 1.7332, + "step": 1883 + }, + { + "epoch": 0.9193233571893299, + "grad_norm": 0.18657784163951874, + "learning_rate": 0.0001, + "loss": 1.7097, + "step": 1884 + }, + { + "epoch": 0.9198113207547169, + "grad_norm": 0.18775001168251038, + "learning_rate": 0.0001, + "loss": 1.6269, + "step": 1885 + }, + { + "epoch": 0.9202992843201041, + "grad_norm": 0.19378246366977692, + "learning_rate": 0.0001, + "loss": 1.6625, + "step": 1886 + }, + { + "epoch": 0.9207872478854913, + "grad_norm": 0.1863091140985489, + "learning_rate": 0.0001, + "loss": 1.7335, + "step": 1887 + }, + { + "epoch": 0.9212752114508783, + "grad_norm": 0.20152948796749115, + "learning_rate": 0.0001, + "loss": 1.7632, + "step": 1888 + }, + { + "epoch": 0.9217631750162655, + "grad_norm": 0.1818552315235138, + "learning_rate": 0.0001, + "loss": 1.5801, + "step": 1889 + }, + { + "epoch": 0.9222511385816525, + "grad_norm": 0.19405800104141235, + "learning_rate": 0.0001, + "loss": 1.6958, + "step": 1890 + }, + { + "epoch": 0.9227391021470397, + "grad_norm": 0.1904546320438385, + "learning_rate": 0.0001, + "loss": 1.7213, + "step": 1891 + }, + { + "epoch": 0.9232270657124269, + "grad_norm": 0.18888206779956818, + "learning_rate": 0.0001, + "loss": 1.6432, + "step": 1892 + }, + { + "epoch": 0.9237150292778139, + "grad_norm": 0.1954280138015747, + "learning_rate": 0.0001, + "loss": 1.6506, + "step": 1893 + }, + { + "epoch": 0.9242029928432011, + "grad_norm": 0.19753329455852509, + "learning_rate": 0.0001, + "loss": 1.5673, + "step": 1894 + }, + { + "epoch": 0.9246909564085881, + "grad_norm": 0.1822923868894577, + "learning_rate": 0.0001, + "loss": 1.6265, + "step": 1895 + }, + { + "epoch": 0.9251789199739753, + "grad_norm": 0.18649877607822418, + "learning_rate": 0.0001, + "loss": 1.6065, + "step": 1896 + }, + { + "epoch": 0.9256668835393624, + "grad_norm": 0.19342879951000214, + "learning_rate": 0.0001, + "loss": 1.6686, + "step": 1897 + }, + { + "epoch": 0.9261548471047495, + "grad_norm": 0.1949494183063507, + "learning_rate": 0.0001, + "loss": 1.7297, + "step": 1898 + }, + { + "epoch": 0.9266428106701367, + "grad_norm": 0.18507172167301178, + "learning_rate": 0.0001, + "loss": 1.7023, + "step": 1899 + }, + { + "epoch": 0.9271307742355237, + "grad_norm": 0.19422446191310883, + "learning_rate": 0.0001, + "loss": 1.7484, + "step": 1900 + }, + { + "epoch": 0.9276187378009109, + "grad_norm": 0.18911758065223694, + "learning_rate": 0.0001, + "loss": 1.6291, + "step": 1901 + }, + { + "epoch": 0.928106701366298, + "grad_norm": 0.18500325083732605, + "learning_rate": 0.0001, + "loss": 1.6286, + "step": 1902 + }, + { + "epoch": 0.9285946649316851, + "grad_norm": 0.1928795874118805, + "learning_rate": 0.0001, + "loss": 1.6231, + "step": 1903 + }, + { + "epoch": 0.9290826284970722, + "grad_norm": 0.187940314412117, + "learning_rate": 0.0001, + "loss": 1.4776, + "step": 1904 + }, + { + "epoch": 0.9295705920624593, + "grad_norm": 0.19200646877288818, + "learning_rate": 0.0001, + "loss": 1.6558, + "step": 1905 + }, + { + "epoch": 0.9300585556278465, + "grad_norm": 0.19187362492084503, + "learning_rate": 0.0001, + "loss": 1.732, + "step": 1906 + }, + { + "epoch": 0.9305465191932336, + "grad_norm": 0.18200846016407013, + "learning_rate": 0.0001, + "loss": 1.6865, + "step": 1907 + }, + { + "epoch": 0.9310344827586207, + "grad_norm": 0.20386971533298492, + "learning_rate": 0.0001, + "loss": 1.6716, + "step": 1908 + }, + { + "epoch": 0.9315224463240078, + "grad_norm": 0.18020214140415192, + "learning_rate": 0.0001, + "loss": 1.5867, + "step": 1909 + }, + { + "epoch": 0.9320104098893949, + "grad_norm": 0.19435757398605347, + "learning_rate": 0.0001, + "loss": 1.7309, + "step": 1910 + }, + { + "epoch": 0.932498373454782, + "grad_norm": 0.19285395741462708, + "learning_rate": 0.0001, + "loss": 1.6832, + "step": 1911 + }, + { + "epoch": 0.9329863370201692, + "grad_norm": 0.19747254252433777, + "learning_rate": 0.0001, + "loss": 1.6816, + "step": 1912 + }, + { + "epoch": 0.9334743005855562, + "grad_norm": 0.18839652836322784, + "learning_rate": 0.0001, + "loss": 1.7361, + "step": 1913 + }, + { + "epoch": 0.9339622641509434, + "grad_norm": 0.18727315962314606, + "learning_rate": 0.0001, + "loss": 1.6448, + "step": 1914 + }, + { + "epoch": 0.9344502277163305, + "grad_norm": 0.17452287673950195, + "learning_rate": 0.0001, + "loss": 1.555, + "step": 1915 + }, + { + "epoch": 0.9349381912817176, + "grad_norm": 0.19202987849712372, + "learning_rate": 0.0001, + "loss": 1.7024, + "step": 1916 + }, + { + "epoch": 0.9354261548471048, + "grad_norm": 0.18424364924430847, + "learning_rate": 0.0001, + "loss": 1.7268, + "step": 1917 + }, + { + "epoch": 0.9359141184124918, + "grad_norm": 0.1959637701511383, + "learning_rate": 0.0001, + "loss": 1.7705, + "step": 1918 + }, + { + "epoch": 0.936402081977879, + "grad_norm": 0.19204477965831757, + "learning_rate": 0.0001, + "loss": 1.6194, + "step": 1919 + }, + { + "epoch": 0.936890045543266, + "grad_norm": 0.22303852438926697, + "learning_rate": 0.0001, + "loss": 1.6484, + "step": 1920 + }, + { + "epoch": 0.9373780091086532, + "grad_norm": 0.1938052773475647, + "learning_rate": 0.0001, + "loss": 1.6094, + "step": 1921 + }, + { + "epoch": 0.9378659726740404, + "grad_norm": 0.20095346868038177, + "learning_rate": 0.0001, + "loss": 1.6189, + "step": 1922 + }, + { + "epoch": 0.9383539362394274, + "grad_norm": 0.20438328385353088, + "learning_rate": 0.0001, + "loss": 1.6793, + "step": 1923 + }, + { + "epoch": 0.9388418998048146, + "grad_norm": 0.1934322714805603, + "learning_rate": 0.0001, + "loss": 1.6434, + "step": 1924 + }, + { + "epoch": 0.9393298633702017, + "grad_norm": 0.1885659545660019, + "learning_rate": 0.0001, + "loss": 1.4937, + "step": 1925 + }, + { + "epoch": 0.9398178269355888, + "grad_norm": 0.18867121636867523, + "learning_rate": 0.0001, + "loss": 1.6838, + "step": 1926 + }, + { + "epoch": 0.940305790500976, + "grad_norm": 0.19884291291236877, + "learning_rate": 0.0001, + "loss": 1.7175, + "step": 1927 + }, + { + "epoch": 0.940793754066363, + "grad_norm": 0.19754834473133087, + "learning_rate": 0.0001, + "loss": 1.6299, + "step": 1928 + }, + { + "epoch": 0.9412817176317502, + "grad_norm": 0.20804598927497864, + "learning_rate": 0.0001, + "loss": 1.7577, + "step": 1929 + }, + { + "epoch": 0.9417696811971373, + "grad_norm": 0.1934799700975418, + "learning_rate": 0.0001, + "loss": 1.6434, + "step": 1930 + }, + { + "epoch": 0.9422576447625244, + "grad_norm": 0.19872422516345978, + "learning_rate": 0.0001, + "loss": 1.5745, + "step": 1931 + }, + { + "epoch": 0.9427456083279115, + "grad_norm": 0.19481562077999115, + "learning_rate": 0.0001, + "loss": 1.683, + "step": 1932 + }, + { + "epoch": 0.9432335718932986, + "grad_norm": 0.18682962656021118, + "learning_rate": 0.0001, + "loss": 1.5869, + "step": 1933 + }, + { + "epoch": 0.9437215354586858, + "grad_norm": 0.19617128372192383, + "learning_rate": 0.0001, + "loss": 1.6752, + "step": 1934 + }, + { + "epoch": 0.9442094990240729, + "grad_norm": 0.18968826532363892, + "learning_rate": 0.0001, + "loss": 1.637, + "step": 1935 + }, + { + "epoch": 0.94469746258946, + "grad_norm": 0.20613940060138702, + "learning_rate": 0.0001, + "loss": 1.7039, + "step": 1936 + }, + { + "epoch": 0.9451854261548471, + "grad_norm": 0.2038343995809555, + "learning_rate": 0.0001, + "loss": 1.7396, + "step": 1937 + }, + { + "epoch": 0.9456733897202342, + "grad_norm": 0.19330835342407227, + "learning_rate": 0.0001, + "loss": 1.6096, + "step": 1938 + }, + { + "epoch": 0.9461613532856213, + "grad_norm": 0.21375514566898346, + "learning_rate": 0.0001, + "loss": 1.7174, + "step": 1939 + }, + { + "epoch": 0.9466493168510085, + "grad_norm": 0.20083387196063995, + "learning_rate": 0.0001, + "loss": 1.5922, + "step": 1940 + }, + { + "epoch": 0.9471372804163956, + "grad_norm": 0.18656747043132782, + "learning_rate": 0.0001, + "loss": 1.6689, + "step": 1941 + }, + { + "epoch": 0.9476252439817827, + "grad_norm": 0.19367806613445282, + "learning_rate": 0.0001, + "loss": 1.6574, + "step": 1942 + }, + { + "epoch": 0.9481132075471698, + "grad_norm": 0.19045981764793396, + "learning_rate": 0.0001, + "loss": 1.6072, + "step": 1943 + }, + { + "epoch": 0.9486011711125569, + "grad_norm": 0.19302068650722504, + "learning_rate": 0.0001, + "loss": 1.6066, + "step": 1944 + }, + { + "epoch": 0.9490891346779441, + "grad_norm": 0.1919882893562317, + "learning_rate": 0.0001, + "loss": 1.6674, + "step": 1945 + }, + { + "epoch": 0.9495770982433311, + "grad_norm": 0.18873807787895203, + "learning_rate": 0.0001, + "loss": 1.5804, + "step": 1946 + }, + { + "epoch": 0.9500650618087183, + "grad_norm": 0.18615473806858063, + "learning_rate": 0.0001, + "loss": 1.4677, + "step": 1947 + }, + { + "epoch": 0.9505530253741054, + "grad_norm": 0.196221262216568, + "learning_rate": 0.0001, + "loss": 1.7643, + "step": 1948 + }, + { + "epoch": 0.9510409889394925, + "grad_norm": 0.19638247787952423, + "learning_rate": 0.0001, + "loss": 1.7174, + "step": 1949 + }, + { + "epoch": 0.9515289525048797, + "grad_norm": 0.18977941572666168, + "learning_rate": 0.0001, + "loss": 1.7463, + "step": 1950 + }, + { + "epoch": 0.9520169160702667, + "grad_norm": 0.18567734956741333, + "learning_rate": 0.0001, + "loss": 1.6243, + "step": 1951 + }, + { + "epoch": 0.9525048796356539, + "grad_norm": 0.1937309354543686, + "learning_rate": 0.0001, + "loss": 1.6272, + "step": 1952 + }, + { + "epoch": 0.9529928432010409, + "grad_norm": 0.18985025584697723, + "learning_rate": 0.0001, + "loss": 1.5859, + "step": 1953 + }, + { + "epoch": 0.9534808067664281, + "grad_norm": 0.19192877411842346, + "learning_rate": 0.0001, + "loss": 1.6615, + "step": 1954 + }, + { + "epoch": 0.9539687703318153, + "grad_norm": 0.18933430314064026, + "learning_rate": 0.0001, + "loss": 1.6925, + "step": 1955 + }, + { + "epoch": 0.9544567338972023, + "grad_norm": 0.18384099006652832, + "learning_rate": 0.0001, + "loss": 1.6519, + "step": 1956 + }, + { + "epoch": 0.9549446974625895, + "grad_norm": 0.187970370054245, + "learning_rate": 0.0001, + "loss": 1.6444, + "step": 1957 + }, + { + "epoch": 0.9554326610279765, + "grad_norm": 0.18869946897029877, + "learning_rate": 0.0001, + "loss": 1.661, + "step": 1958 + }, + { + "epoch": 0.9559206245933637, + "grad_norm": 0.18695271015167236, + "learning_rate": 0.0001, + "loss": 1.711, + "step": 1959 + }, + { + "epoch": 0.9564085881587508, + "grad_norm": 0.18627239763736725, + "learning_rate": 0.0001, + "loss": 1.6396, + "step": 1960 + }, + { + "epoch": 0.9568965517241379, + "grad_norm": 0.1865292340517044, + "learning_rate": 0.0001, + "loss": 1.6333, + "step": 1961 + }, + { + "epoch": 0.9573845152895251, + "grad_norm": 0.19713494181632996, + "learning_rate": 0.0001, + "loss": 1.6997, + "step": 1962 + }, + { + "epoch": 0.9578724788549121, + "grad_norm": 0.19183889031410217, + "learning_rate": 0.0001, + "loss": 1.6611, + "step": 1963 + }, + { + "epoch": 0.9583604424202993, + "grad_norm": 0.1824401468038559, + "learning_rate": 0.0001, + "loss": 1.6543, + "step": 1964 + }, + { + "epoch": 0.9588484059856864, + "grad_norm": 0.19251850247383118, + "learning_rate": 0.0001, + "loss": 1.6621, + "step": 1965 + }, + { + "epoch": 0.9593363695510735, + "grad_norm": 0.188092902302742, + "learning_rate": 0.0001, + "loss": 1.5799, + "step": 1966 + }, + { + "epoch": 0.9598243331164606, + "grad_norm": 0.2004861980676651, + "learning_rate": 0.0001, + "loss": 1.829, + "step": 1967 + }, + { + "epoch": 0.9603122966818478, + "grad_norm": 0.18226411938667297, + "learning_rate": 0.0001, + "loss": 1.6785, + "step": 1968 + }, + { + "epoch": 0.9608002602472349, + "grad_norm": 0.18180397152900696, + "learning_rate": 0.0001, + "loss": 1.6726, + "step": 1969 + }, + { + "epoch": 0.961288223812622, + "grad_norm": 0.18457727134227753, + "learning_rate": 0.0001, + "loss": 1.6192, + "step": 1970 + }, + { + "epoch": 0.9617761873780091, + "grad_norm": 0.18507972359657288, + "learning_rate": 0.0001, + "loss": 1.4573, + "step": 1971 + }, + { + "epoch": 0.9622641509433962, + "grad_norm": 0.17920102179050446, + "learning_rate": 0.0001, + "loss": 1.5538, + "step": 1972 + }, + { + "epoch": 0.9627521145087834, + "grad_norm": 0.18464797735214233, + "learning_rate": 0.0001, + "loss": 1.6241, + "step": 1973 + }, + { + "epoch": 0.9632400780741704, + "grad_norm": 0.18835680186748505, + "learning_rate": 0.0001, + "loss": 1.6425, + "step": 1974 + }, + { + "epoch": 0.9637280416395576, + "grad_norm": 0.19201530516147614, + "learning_rate": 0.0001, + "loss": 1.5636, + "step": 1975 + }, + { + "epoch": 0.9642160052049447, + "grad_norm": 0.1867157518863678, + "learning_rate": 0.0001, + "loss": 1.7102, + "step": 1976 + }, + { + "epoch": 0.9647039687703318, + "grad_norm": 0.18558953702449799, + "learning_rate": 0.0001, + "loss": 1.6129, + "step": 1977 + }, + { + "epoch": 0.965191932335719, + "grad_norm": 0.18594802916049957, + "learning_rate": 0.0001, + "loss": 1.6562, + "step": 1978 + }, + { + "epoch": 0.965679895901106, + "grad_norm": 0.18944233655929565, + "learning_rate": 0.0001, + "loss": 1.6166, + "step": 1979 + }, + { + "epoch": 0.9661678594664932, + "grad_norm": 0.19270744919776917, + "learning_rate": 0.0001, + "loss": 1.7154, + "step": 1980 + }, + { + "epoch": 0.9666558230318802, + "grad_norm": 0.18543404340744019, + "learning_rate": 0.0001, + "loss": 1.6351, + "step": 1981 + }, + { + "epoch": 0.9671437865972674, + "grad_norm": 0.1873311698436737, + "learning_rate": 0.0001, + "loss": 1.6802, + "step": 1982 + }, + { + "epoch": 0.9676317501626546, + "grad_norm": 0.20122510194778442, + "learning_rate": 0.0001, + "loss": 1.6775, + "step": 1983 + }, + { + "epoch": 0.9681197137280416, + "grad_norm": 0.18540111184120178, + "learning_rate": 0.0001, + "loss": 1.71, + "step": 1984 + }, + { + "epoch": 0.9686076772934288, + "grad_norm": 0.19041728973388672, + "learning_rate": 0.0001, + "loss": 1.6158, + "step": 1985 + }, + { + "epoch": 0.9690956408588158, + "grad_norm": 0.18193362653255463, + "learning_rate": 0.0001, + "loss": 1.5784, + "step": 1986 + }, + { + "epoch": 0.969583604424203, + "grad_norm": 0.18553735315799713, + "learning_rate": 0.0001, + "loss": 1.5646, + "step": 1987 + }, + { + "epoch": 0.9700715679895902, + "grad_norm": 0.18669581413269043, + "learning_rate": 0.0001, + "loss": 1.6766, + "step": 1988 + }, + { + "epoch": 0.9705595315549772, + "grad_norm": 0.18920449912548065, + "learning_rate": 0.0001, + "loss": 1.6766, + "step": 1989 + }, + { + "epoch": 0.9710474951203644, + "grad_norm": 0.20458273589611053, + "learning_rate": 0.0001, + "loss": 1.7894, + "step": 1990 + }, + { + "epoch": 0.9715354586857514, + "grad_norm": 0.1898808777332306, + "learning_rate": 0.0001, + "loss": 1.6446, + "step": 1991 + }, + { + "epoch": 0.9720234222511386, + "grad_norm": 0.2061244398355484, + "learning_rate": 0.0001, + "loss": 1.7023, + "step": 1992 + }, + { + "epoch": 0.9725113858165257, + "grad_norm": 0.1939939558506012, + "learning_rate": 0.0001, + "loss": 1.6788, + "step": 1993 + }, + { + "epoch": 0.9729993493819128, + "grad_norm": 0.18010011315345764, + "learning_rate": 0.0001, + "loss": 1.6432, + "step": 1994 + }, + { + "epoch": 0.9734873129473, + "grad_norm": 0.1993561089038849, + "learning_rate": 0.0001, + "loss": 1.703, + "step": 1995 + }, + { + "epoch": 0.973975276512687, + "grad_norm": 0.2100955992937088, + "learning_rate": 0.0001, + "loss": 1.8338, + "step": 1996 + }, + { + "epoch": 0.9744632400780742, + "grad_norm": 0.18871144950389862, + "learning_rate": 0.0001, + "loss": 1.5936, + "step": 1997 + }, + { + "epoch": 0.9749512036434613, + "grad_norm": 0.18759900331497192, + "learning_rate": 0.0001, + "loss": 1.5765, + "step": 1998 + }, + { + "epoch": 0.9754391672088484, + "grad_norm": 0.18255159258842468, + "learning_rate": 0.0001, + "loss": 1.565, + "step": 1999 + }, + { + "epoch": 0.9759271307742355, + "grad_norm": 0.19834022223949432, + "learning_rate": 0.0001, + "loss": 1.7178, + "step": 2000 + }, + { + "epoch": 0.9764150943396226, + "grad_norm": 0.1984955072402954, + "learning_rate": 0.0001, + "loss": 1.6893, + "step": 2001 + }, + { + "epoch": 0.9769030579050098, + "grad_norm": 0.19068273901939392, + "learning_rate": 0.0001, + "loss": 1.5748, + "step": 2002 + }, + { + "epoch": 0.9773910214703969, + "grad_norm": 0.18630477786064148, + "learning_rate": 0.0001, + "loss": 1.524, + "step": 2003 + }, + { + "epoch": 0.977878985035784, + "grad_norm": 0.19503630697727203, + "learning_rate": 0.0001, + "loss": 1.8031, + "step": 2004 + }, + { + "epoch": 0.9783669486011711, + "grad_norm": 0.19002202153205872, + "learning_rate": 0.0001, + "loss": 1.7034, + "step": 2005 + }, + { + "epoch": 0.9788549121665582, + "grad_norm": 0.18310774862766266, + "learning_rate": 0.0001, + "loss": 1.6308, + "step": 2006 + }, + { + "epoch": 0.9793428757319453, + "grad_norm": 0.19140413403511047, + "learning_rate": 0.0001, + "loss": 1.6739, + "step": 2007 + }, + { + "epoch": 0.9798308392973325, + "grad_norm": 0.19118629395961761, + "learning_rate": 0.0001, + "loss": 1.8085, + "step": 2008 + }, + { + "epoch": 0.9803188028627196, + "grad_norm": 0.180838942527771, + "learning_rate": 0.0001, + "loss": 1.6047, + "step": 2009 + }, + { + "epoch": 0.9808067664281067, + "grad_norm": 0.19258812069892883, + "learning_rate": 0.0001, + "loss": 1.6151, + "step": 2010 + }, + { + "epoch": 0.9812947299934938, + "grad_norm": 0.18637025356292725, + "learning_rate": 0.0001, + "loss": 1.6767, + "step": 2011 + }, + { + "epoch": 0.9817826935588809, + "grad_norm": 0.19936662912368774, + "learning_rate": 0.0001, + "loss": 1.6625, + "step": 2012 + }, + { + "epoch": 0.9822706571242681, + "grad_norm": 0.194144606590271, + "learning_rate": 0.0001, + "loss": 1.7133, + "step": 2013 + }, + { + "epoch": 0.9827586206896551, + "grad_norm": 0.20041455328464508, + "learning_rate": 0.0001, + "loss": 1.6546, + "step": 2014 + }, + { + "epoch": 0.9832465842550423, + "grad_norm": 0.19850897789001465, + "learning_rate": 0.0001, + "loss": 1.7541, + "step": 2015 + }, + { + "epoch": 0.9837345478204295, + "grad_norm": 0.18957051634788513, + "learning_rate": 0.0001, + "loss": 1.5824, + "step": 2016 + }, + { + "epoch": 0.9842225113858165, + "grad_norm": 0.19494563341140747, + "learning_rate": 0.0001, + "loss": 1.6266, + "step": 2017 + }, + { + "epoch": 0.9847104749512037, + "grad_norm": 0.1976606249809265, + "learning_rate": 0.0001, + "loss": 1.7235, + "step": 2018 + }, + { + "epoch": 0.9851984385165907, + "grad_norm": 0.1937410980463028, + "learning_rate": 0.0001, + "loss": 1.635, + "step": 2019 + }, + { + "epoch": 0.9856864020819779, + "grad_norm": 0.2040870636701584, + "learning_rate": 0.0001, + "loss": 1.5867, + "step": 2020 + }, + { + "epoch": 0.986174365647365, + "grad_norm": 0.18701307475566864, + "learning_rate": 0.0001, + "loss": 1.7137, + "step": 2021 + }, + { + "epoch": 0.9866623292127521, + "grad_norm": 0.19137197732925415, + "learning_rate": 0.0001, + "loss": 1.7141, + "step": 2022 + }, + { + "epoch": 0.9871502927781393, + "grad_norm": 0.18723750114440918, + "learning_rate": 0.0001, + "loss": 1.5555, + "step": 2023 + }, + { + "epoch": 0.9876382563435263, + "grad_norm": 0.19908587634563446, + "learning_rate": 0.0001, + "loss": 1.7694, + "step": 2024 + }, + { + "epoch": 0.9881262199089135, + "grad_norm": 0.18599991500377655, + "learning_rate": 0.0001, + "loss": 1.6416, + "step": 2025 + }, + { + "epoch": 0.9886141834743006, + "grad_norm": 0.19032098352909088, + "learning_rate": 0.0001, + "loss": 1.725, + "step": 2026 + }, + { + "epoch": 0.9891021470396877, + "grad_norm": 0.19751659035682678, + "learning_rate": 0.0001, + "loss": 1.6269, + "step": 2027 + }, + { + "epoch": 0.9895901106050748, + "grad_norm": 0.19484581053256989, + "learning_rate": 0.0001, + "loss": 1.6075, + "step": 2028 + }, + { + "epoch": 0.9900780741704619, + "grad_norm": 0.18925638496875763, + "learning_rate": 0.0001, + "loss": 1.6035, + "step": 2029 + }, + { + "epoch": 0.9905660377358491, + "grad_norm": 0.1926533430814743, + "learning_rate": 0.0001, + "loss": 1.6842, + "step": 2030 + }, + { + "epoch": 0.9910540013012362, + "grad_norm": 0.18332500755786896, + "learning_rate": 0.0001, + "loss": 1.6314, + "step": 2031 + }, + { + "epoch": 0.9915419648666233, + "grad_norm": 0.19438926875591278, + "learning_rate": 0.0001, + "loss": 1.7524, + "step": 2032 + }, + { + "epoch": 0.9920299284320104, + "grad_norm": 0.18711014091968536, + "learning_rate": 0.0001, + "loss": 1.5707, + "step": 2033 + }, + { + "epoch": 0.9925178919973975, + "grad_norm": 0.1909669041633606, + "learning_rate": 0.0001, + "loss": 1.6926, + "step": 2034 + }, + { + "epoch": 0.9930058555627846, + "grad_norm": 0.187711700797081, + "learning_rate": 0.0001, + "loss": 1.571, + "step": 2035 + }, + { + "epoch": 0.9934938191281718, + "grad_norm": 0.18141263723373413, + "learning_rate": 0.0001, + "loss": 1.6505, + "step": 2036 + }, + { + "epoch": 0.9939817826935589, + "grad_norm": 0.18751592934131622, + "learning_rate": 0.0001, + "loss": 1.5944, + "step": 2037 + }, + { + "epoch": 0.994469746258946, + "grad_norm": 0.18346115946769714, + "learning_rate": 0.0001, + "loss": 1.6837, + "step": 2038 + }, + { + "epoch": 0.9949577098243331, + "grad_norm": 0.1962192952632904, + "learning_rate": 0.0001, + "loss": 1.7056, + "step": 2039 + }, + { + "epoch": 0.9954456733897202, + "grad_norm": 0.19639593362808228, + "learning_rate": 0.0001, + "loss": 1.6419, + "step": 2040 + }, + { + "epoch": 0.9959336369551074, + "grad_norm": 0.19418394565582275, + "learning_rate": 0.0001, + "loss": 1.7652, + "step": 2041 + }, + { + "epoch": 0.9964216005204944, + "grad_norm": 0.19044318795204163, + "learning_rate": 0.0001, + "loss": 1.6707, + "step": 2042 + }, + { + "epoch": 0.9969095640858816, + "grad_norm": 0.1870298981666565, + "learning_rate": 0.0001, + "loss": 1.521, + "step": 2043 + }, + { + "epoch": 0.9973975276512687, + "grad_norm": 0.20216675102710724, + "learning_rate": 0.0001, + "loss": 1.6267, + "step": 2044 + }, + { + "epoch": 0.9978854912166558, + "grad_norm": 0.1920004040002823, + "learning_rate": 0.0001, + "loss": 1.626, + "step": 2045 + }, + { + "epoch": 0.998373454782043, + "grad_norm": 0.20293602347373962, + "learning_rate": 0.0001, + "loss": 1.5053, + "step": 2046 + }, + { + "epoch": 0.99886141834743, + "grad_norm": 0.2009141743183136, + "learning_rate": 0.0001, + "loss": 1.6191, + "step": 2047 + }, + { + "epoch": 0.9993493819128172, + "grad_norm": 0.1968158483505249, + "learning_rate": 0.0001, + "loss": 1.6945, + "step": 2048 + }, + { + "epoch": 0.9998373454782042, + "grad_norm": 0.2032029926776886, + "learning_rate": 0.0001, + "loss": 1.6321, + "step": 2049 + } + ], + "logging_steps": 1, + "max_steps": 2049, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 300, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 9.374213438189863e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}