diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6665 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 1892, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004228329809725159, + "grad_norm": 6.605544090270996, + "learning_rate": 5.263157894736842e-08, + "loss": 1.8237226009368896, + "step": 2 + }, + { + "epoch": 0.008456659619450317, + "grad_norm": 0.9109106659889221, + "learning_rate": 1.5789473684210527e-07, + "loss": 2.176421880722046, + "step": 4 + }, + { + "epoch": 0.012684989429175475, + "grad_norm": 3.9910192489624023, + "learning_rate": 2.6315789473684213e-07, + "loss": 2.1531057357788086, + "step": 6 + }, + { + "epoch": 0.016913319238900635, + "grad_norm": 0.9899866580963135, + "learning_rate": 3.6842105263157896e-07, + "loss": 1.9564805030822754, + "step": 8 + }, + { + "epoch": 0.021141649048625793, + "grad_norm": 2.9574663639068604, + "learning_rate": 4.7368421052631585e-07, + "loss": 2.021973133087158, + "step": 10 + }, + { + "epoch": 0.02536997885835095, + "grad_norm": 2.2470693588256836, + "learning_rate": 5.789473684210526e-07, + "loss": 1.692598581314087, + "step": 12 + }, + { + "epoch": 0.02959830866807611, + "grad_norm": 1.5818345546722412, + "learning_rate": 6.842105263157896e-07, + "loss": 1.6616182327270508, + "step": 14 + }, + { + "epoch": 0.03382663847780127, + "grad_norm": 1.9749239683151245, + "learning_rate": 7.894736842105263e-07, + "loss": 1.8213186264038086, + "step": 16 + }, + { + "epoch": 0.03805496828752643, + "grad_norm": 1.0426429510116577, + "learning_rate": 8.947368421052632e-07, + "loss": 1.8437881469726562, + "step": 18 + }, + { + "epoch": 0.042283298097251586, + "grad_norm": 15.372987747192383, + "learning_rate": 1.0000000000000002e-06, + "loss": 2.0638184547424316, + "step": 20 + }, + { + "epoch": 0.046511627906976744, + "grad_norm": 4.7466301918029785, + "learning_rate": 1.1052631578947369e-06, + "loss": 1.948048710823059, + "step": 22 + }, + { + "epoch": 0.0507399577167019, + "grad_norm": 1.4575281143188477, + "learning_rate": 1.2105263157894738e-06, + "loss": 1.8744062185287476, + "step": 24 + }, + { + "epoch": 0.05496828752642706, + "grad_norm": 3.336325168609619, + "learning_rate": 1.3157894736842106e-06, + "loss": 1.715809941291809, + "step": 26 + }, + { + "epoch": 0.05919661733615222, + "grad_norm": 0.6526132822036743, + "learning_rate": 1.4210526315789475e-06, + "loss": 1.750809907913208, + "step": 28 + }, + { + "epoch": 0.06342494714587738, + "grad_norm": 0.8683815002441406, + "learning_rate": 1.5263157894736844e-06, + "loss": 1.5532337427139282, + "step": 30 + }, + { + "epoch": 0.06765327695560254, + "grad_norm": 4.369999885559082, + "learning_rate": 1.6315789473684212e-06, + "loss": 1.2949138879776, + "step": 32 + }, + { + "epoch": 0.07188160676532769, + "grad_norm": 0.8993115425109863, + "learning_rate": 1.736842105263158e-06, + "loss": 1.3249835968017578, + "step": 34 + }, + { + "epoch": 0.07610993657505286, + "grad_norm": 1.2760223150253296, + "learning_rate": 1.8421052631578948e-06, + "loss": 1.4548490047454834, + "step": 36 + }, + { + "epoch": 0.080338266384778, + "grad_norm": 5.320272445678711, + "learning_rate": 1.9473684210526315e-06, + "loss": 1.3651189804077148, + "step": 38 + }, + { + "epoch": 0.08456659619450317, + "grad_norm": 4.702236175537109, + "learning_rate": 2.0526315789473687e-06, + "loss": 1.316279649734497, + "step": 40 + }, + { + "epoch": 0.08879492600422834, + "grad_norm": 5.524050712585449, + "learning_rate": 2.1578947368421054e-06, + "loss": 1.7164320945739746, + "step": 42 + }, + { + "epoch": 0.09302325581395349, + "grad_norm": 1.7671512365341187, + "learning_rate": 2.2631578947368426e-06, + "loss": 1.695072889328003, + "step": 44 + }, + { + "epoch": 0.09725158562367865, + "grad_norm": 0.7419072985649109, + "learning_rate": 2.368421052631579e-06, + "loss": 1.3463151454925537, + "step": 46 + }, + { + "epoch": 0.1014799154334038, + "grad_norm": 0.44772660732269287, + "learning_rate": 2.473684210526316e-06, + "loss": 1.5603983402252197, + "step": 48 + }, + { + "epoch": 0.10570824524312897, + "grad_norm": 0.9128488302230835, + "learning_rate": 2.578947368421053e-06, + "loss": 1.5553311109542847, + "step": 50 + }, + { + "epoch": 0.10993657505285412, + "grad_norm": 2.591745138168335, + "learning_rate": 2.68421052631579e-06, + "loss": 0.8912559747695923, + "step": 52 + }, + { + "epoch": 0.11416490486257928, + "grad_norm": 0.4358270466327667, + "learning_rate": 2.789473684210526e-06, + "loss": 1.115212321281433, + "step": 54 + }, + { + "epoch": 0.11839323467230443, + "grad_norm": 0.9643327593803406, + "learning_rate": 2.8947368421052634e-06, + "loss": 1.4279577732086182, + "step": 56 + }, + { + "epoch": 0.1226215644820296, + "grad_norm": 0.7719278931617737, + "learning_rate": 3e-06, + "loss": 1.4487831592559814, + "step": 58 + }, + { + "epoch": 0.12684989429175475, + "grad_norm": 0.7415221929550171, + "learning_rate": 3.1052631578947372e-06, + "loss": 1.4636942148208618, + "step": 60 + }, + { + "epoch": 0.13107822410147993, + "grad_norm": 1.0166652202606201, + "learning_rate": 3.210526315789474e-06, + "loss": 0.8027479648590088, + "step": 62 + }, + { + "epoch": 0.13530655391120508, + "grad_norm": 0.8704442381858826, + "learning_rate": 3.3157894736842107e-06, + "loss": 1.042100429534912, + "step": 64 + }, + { + "epoch": 0.13953488372093023, + "grad_norm": 0.5612062811851501, + "learning_rate": 3.421052631578948e-06, + "loss": 1.3720247745513916, + "step": 66 + }, + { + "epoch": 0.14376321353065538, + "grad_norm": 0.9619214534759521, + "learning_rate": 3.5263157894736846e-06, + "loss": 1.4121863842010498, + "step": 68 + }, + { + "epoch": 0.14799154334038056, + "grad_norm": 0.7827504277229309, + "learning_rate": 3.6315789473684217e-06, + "loss": 1.409840703010559, + "step": 70 + }, + { + "epoch": 0.1522198731501057, + "grad_norm": 0.7777084708213806, + "learning_rate": 3.736842105263158e-06, + "loss": 1.3885422945022583, + "step": 72 + }, + { + "epoch": 0.15644820295983086, + "grad_norm": 0.7667552828788757, + "learning_rate": 3.842105263157895e-06, + "loss": 0.9778481125831604, + "step": 74 + }, + { + "epoch": 0.160676532769556, + "grad_norm": 0.7666372060775757, + "learning_rate": 3.947368421052632e-06, + "loss": 0.8655365705490112, + "step": 76 + }, + { + "epoch": 0.1649048625792812, + "grad_norm": 1.127411961555481, + "learning_rate": 4.052631578947368e-06, + "loss": 1.3328880071640015, + "step": 78 + }, + { + "epoch": 0.16913319238900634, + "grad_norm": 1.8701919317245483, + "learning_rate": 4.157894736842106e-06, + "loss": 1.1148747205734253, + "step": 80 + }, + { + "epoch": 0.1733615221987315, + "grad_norm": 0.7047215104103088, + "learning_rate": 4.2631578947368425e-06, + "loss": 1.1087937355041504, + "step": 82 + }, + { + "epoch": 0.17758985200845667, + "grad_norm": 1.534998893737793, + "learning_rate": 4.368421052631579e-06, + "loss": 0.9685766696929932, + "step": 84 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 1.2067896127700806, + "learning_rate": 4.473684210526316e-06, + "loss": 1.7643671035766602, + "step": 86 + }, + { + "epoch": 0.18604651162790697, + "grad_norm": 1.5933588743209839, + "learning_rate": 4.578947368421053e-06, + "loss": 1.3110942840576172, + "step": 88 + }, + { + "epoch": 0.19027484143763213, + "grad_norm": 4.7901225090026855, + "learning_rate": 4.68421052631579e-06, + "loss": 1.0165361166000366, + "step": 90 + }, + { + "epoch": 0.1945031712473573, + "grad_norm": 0.7400741577148438, + "learning_rate": 4.789473684210527e-06, + "loss": 1.3473522663116455, + "step": 92 + }, + { + "epoch": 0.19873150105708245, + "grad_norm": 1.6431527137756348, + "learning_rate": 4.894736842105264e-06, + "loss": 0.9921259880065918, + "step": 94 + }, + { + "epoch": 0.2029598308668076, + "grad_norm": 1.1242965459823608, + "learning_rate": 5e-06, + "loss": 1.3252302408218384, + "step": 96 + }, + { + "epoch": 0.20718816067653276, + "grad_norm": 1.4306470155715942, + "learning_rate": 4.999986246423023e-06, + "loss": 1.4119060039520264, + "step": 98 + }, + { + "epoch": 0.21141649048625794, + "grad_norm": 0.6519229412078857, + "learning_rate": 4.999944985860234e-06, + "loss": 1.3824262619018555, + "step": 100 + }, + { + "epoch": 0.2156448202959831, + "grad_norm": 0.9801005721092224, + "learning_rate": 4.9998762188160604e-06, + "loss": 1.2583341598510742, + "step": 102 + }, + { + "epoch": 0.21987315010570824, + "grad_norm": 0.6960574388504028, + "learning_rate": 4.999779946131206e-06, + "loss": 1.3352376222610474, + "step": 104 + }, + { + "epoch": 0.22410147991543342, + "grad_norm": 0.937359631061554, + "learning_rate": 4.9996561689826455e-06, + "loss": 1.3265419006347656, + "step": 106 + }, + { + "epoch": 0.22832980972515857, + "grad_norm": 0.973751962184906, + "learning_rate": 4.999504888883601e-06, + "loss": 1.0313334465026855, + "step": 108 + }, + { + "epoch": 0.23255813953488372, + "grad_norm": 0.8326078653335571, + "learning_rate": 4.999326107683535e-06, + "loss": 1.3112177848815918, + "step": 110 + }, + { + "epoch": 0.23678646934460887, + "grad_norm": 1.3524088859558105, + "learning_rate": 4.999119827568119e-06, + "loss": 1.379159688949585, + "step": 112 + }, + { + "epoch": 0.24101479915433405, + "grad_norm": 0.8450507521629333, + "learning_rate": 4.9988860510592085e-06, + "loss": 1.4243919849395752, + "step": 114 + }, + { + "epoch": 0.2452431289640592, + "grad_norm": 1.4355442523956299, + "learning_rate": 4.998624781014819e-06, + "loss": 1.3441710472106934, + "step": 116 + }, + { + "epoch": 0.24947145877378435, + "grad_norm": 1.2588502168655396, + "learning_rate": 4.998336020629077e-06, + "loss": 1.3617056608200073, + "step": 118 + }, + { + "epoch": 0.2536997885835095, + "grad_norm": 1.3343617916107178, + "learning_rate": 4.998019773432198e-06, + "loss": 0.8026182055473328, + "step": 120 + }, + { + "epoch": 0.25792811839323465, + "grad_norm": 0.6089492440223694, + "learning_rate": 4.997676043290429e-06, + "loss": 1.320847988128662, + "step": 122 + }, + { + "epoch": 0.26215644820295986, + "grad_norm": 0.6601752042770386, + "learning_rate": 4.997304834406011e-06, + "loss": 1.1560726165771484, + "step": 124 + }, + { + "epoch": 0.266384778012685, + "grad_norm": 0.9852200746536255, + "learning_rate": 4.9969061513171185e-06, + "loss": 1.3645535707473755, + "step": 126 + }, + { + "epoch": 0.27061310782241016, + "grad_norm": 1.04742431640625, + "learning_rate": 4.996479998897815e-06, + "loss": 1.0370509624481201, + "step": 128 + }, + { + "epoch": 0.2748414376321353, + "grad_norm": 0.768661379814148, + "learning_rate": 4.996026382357985e-06, + "loss": 0.976492166519165, + "step": 130 + }, + { + "epoch": 0.27906976744186046, + "grad_norm": 1.244371771812439, + "learning_rate": 4.995545307243273e-06, + "loss": 1.290363073348999, + "step": 132 + }, + { + "epoch": 0.2832980972515856, + "grad_norm": 1.2053449153900146, + "learning_rate": 4.995036779435014e-06, + "loss": 0.8751378655433655, + "step": 134 + }, + { + "epoch": 0.28752642706131076, + "grad_norm": 2.1075494289398193, + "learning_rate": 4.994500805150167e-06, + "loss": 1.123706579208374, + "step": 136 + }, + { + "epoch": 0.2917547568710359, + "grad_norm": 2.0092146396636963, + "learning_rate": 4.993937390941231e-06, + "loss": 1.4683767557144165, + "step": 138 + }, + { + "epoch": 0.2959830866807611, + "grad_norm": 1.5879631042480469, + "learning_rate": 4.9933465436961705e-06, + "loss": 0.9096964597702026, + "step": 140 + }, + { + "epoch": 0.30021141649048627, + "grad_norm": 1.2290595769882202, + "learning_rate": 4.992728270638333e-06, + "loss": 1.5735292434692383, + "step": 142 + }, + { + "epoch": 0.3044397463002114, + "grad_norm": 16.471097946166992, + "learning_rate": 4.992082579326354e-06, + "loss": 1.1104016304016113, + "step": 144 + }, + { + "epoch": 0.3086680761099366, + "grad_norm": 3.488492250442505, + "learning_rate": 4.9914094776540676e-06, + "loss": 0.6830090284347534, + "step": 146 + }, + { + "epoch": 0.3128964059196617, + "grad_norm": 1.9017225503921509, + "learning_rate": 4.990708973850415e-06, + "loss": 1.2578611373901367, + "step": 148 + }, + { + "epoch": 0.3171247357293869, + "grad_norm": 0.8831533789634705, + "learning_rate": 4.989981076479334e-06, + "loss": 1.2861902713775635, + "step": 150 + }, + { + "epoch": 0.321353065539112, + "grad_norm": 1.4398751258850098, + "learning_rate": 4.989225794439665e-06, + "loss": 1.2161321640014648, + "step": 152 + }, + { + "epoch": 0.32558139534883723, + "grad_norm": 2.2701990604400635, + "learning_rate": 4.9884431369650316e-06, + "loss": 0.7495906949043274, + "step": 154 + }, + { + "epoch": 0.3298097251585624, + "grad_norm": 1.7071163654327393, + "learning_rate": 4.987633113623737e-06, + "loss": 1.2624861001968384, + "step": 156 + }, + { + "epoch": 0.33403805496828753, + "grad_norm": 0.7767173647880554, + "learning_rate": 4.986795734318643e-06, + "loss": 1.2871098518371582, + "step": 158 + }, + { + "epoch": 0.3382663847780127, + "grad_norm": 3.3821675777435303, + "learning_rate": 4.985931009287047e-06, + "loss": 1.279846429824829, + "step": 160 + }, + { + "epoch": 0.34249471458773784, + "grad_norm": 1.1429699659347534, + "learning_rate": 4.98503894910056e-06, + "loss": 1.1057888269424438, + "step": 162 + }, + { + "epoch": 0.346723044397463, + "grad_norm": 1.2170449495315552, + "learning_rate": 4.9841195646649764e-06, + "loss": 1.0053894519805908, + "step": 164 + }, + { + "epoch": 0.35095137420718814, + "grad_norm": 0.7444745302200317, + "learning_rate": 4.98317286722014e-06, + "loss": 0.8725204467773438, + "step": 166 + }, + { + "epoch": 0.35517970401691334, + "grad_norm": 1.1420894861221313, + "learning_rate": 4.982198868339808e-06, + "loss": 1.1899808645248413, + "step": 168 + }, + { + "epoch": 0.3594080338266385, + "grad_norm": 0.6692090034484863, + "learning_rate": 4.981197579931507e-06, + "loss": 1.2234405279159546, + "step": 170 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 0.6509742736816406, + "learning_rate": 4.980169014236391e-06, + "loss": 1.050593614578247, + "step": 172 + }, + { + "epoch": 0.3678646934460888, + "grad_norm": 0.659625232219696, + "learning_rate": 4.979113183829088e-06, + "loss": 1.3807719945907593, + "step": 174 + }, + { + "epoch": 0.37209302325581395, + "grad_norm": 0.8343127369880676, + "learning_rate": 4.97803010161755e-06, + "loss": 1.068203330039978, + "step": 176 + }, + { + "epoch": 0.3763213530655391, + "grad_norm": 1.0895072221755981, + "learning_rate": 4.976919780842892e-06, + "loss": 0.8895647525787354, + "step": 178 + }, + { + "epoch": 0.38054968287526425, + "grad_norm": 0.4808787703514099, + "learning_rate": 4.97578223507923e-06, + "loss": 1.3076379299163818, + "step": 180 + }, + { + "epoch": 0.38477801268498946, + "grad_norm": 1.1349347829818726, + "learning_rate": 4.97461747823352e-06, + "loss": 1.547876238822937, + "step": 182 + }, + { + "epoch": 0.3890063424947146, + "grad_norm": 1.6290419101715088, + "learning_rate": 4.973425524545382e-06, + "loss": 1.3527616262435913, + "step": 184 + }, + { + "epoch": 0.39323467230443976, + "grad_norm": 1.678579568862915, + "learning_rate": 4.972206388586927e-06, + "loss": 1.102654218673706, + "step": 186 + }, + { + "epoch": 0.3974630021141649, + "grad_norm": 1.367773175239563, + "learning_rate": 4.970960085262584e-06, + "loss": 0.9921371340751648, + "step": 188 + }, + { + "epoch": 0.40169133192389006, + "grad_norm": 1.1447407007217407, + "learning_rate": 4.969686629808911e-06, + "loss": 1.0145394802093506, + "step": 190 + }, + { + "epoch": 0.4059196617336152, + "grad_norm": 0.8305537104606628, + "learning_rate": 4.9683860377944125e-06, + "loss": 1.3157576322555542, + "step": 192 + }, + { + "epoch": 0.41014799154334036, + "grad_norm": 0.30826497077941895, + "learning_rate": 4.967058325119348e-06, + "loss": 1.023323655128479, + "step": 194 + }, + { + "epoch": 0.4143763213530655, + "grad_norm": 1.3932182788848877, + "learning_rate": 4.965703508015539e-06, + "loss": 1.2941372394561768, + "step": 196 + }, + { + "epoch": 0.4186046511627907, + "grad_norm": 0.7291075587272644, + "learning_rate": 4.964321603046169e-06, + "loss": 1.0717015266418457, + "step": 198 + }, + { + "epoch": 0.42283298097251587, + "grad_norm": 0.5842322707176208, + "learning_rate": 4.962912627105581e-06, + "loss": 1.1873562335968018, + "step": 200 + }, + { + "epoch": 0.427061310782241, + "grad_norm": 0.6558592319488525, + "learning_rate": 4.961476597419072e-06, + "loss": 0.5549638867378235, + "step": 202 + }, + { + "epoch": 0.4312896405919662, + "grad_norm": 0.5496861934661865, + "learning_rate": 4.960013531542681e-06, + "loss": 1.251203179359436, + "step": 204 + }, + { + "epoch": 0.4355179704016913, + "grad_norm": 1.2829310894012451, + "learning_rate": 4.958523447362978e-06, + "loss": 1.3057016134262085, + "step": 206 + }, + { + "epoch": 0.4397463002114165, + "grad_norm": 1.240744709968567, + "learning_rate": 4.95700636309684e-06, + "loss": 0.773059606552124, + "step": 208 + }, + { + "epoch": 0.4439746300211416, + "grad_norm": 0.6856869459152222, + "learning_rate": 4.955462297291231e-06, + "loss": 1.3060951232910156, + "step": 210 + }, + { + "epoch": 0.44820295983086683, + "grad_norm": 1.0836693048477173, + "learning_rate": 4.953891268822977e-06, + "loss": 1.2806111574172974, + "step": 212 + }, + { + "epoch": 0.452431289640592, + "grad_norm": 0.8455312252044678, + "learning_rate": 4.952293296898531e-06, + "loss": 1.4302403926849365, + "step": 214 + }, + { + "epoch": 0.45665961945031713, + "grad_norm": 0.5475296974182129, + "learning_rate": 4.9506684010537425e-06, + "loss": 0.6154606938362122, + "step": 216 + }, + { + "epoch": 0.4608879492600423, + "grad_norm": 2.704235315322876, + "learning_rate": 4.949016601153615e-06, + "loss": 1.0028847455978394, + "step": 218 + }, + { + "epoch": 0.46511627906976744, + "grad_norm": 0.5048341751098633, + "learning_rate": 4.947337917392068e-06, + "loss": 1.2792624235153198, + "step": 220 + }, + { + "epoch": 0.4693446088794926, + "grad_norm": 1.052980899810791, + "learning_rate": 4.9456323702916834e-06, + "loss": 1.2493329048156738, + "step": 222 + }, + { + "epoch": 0.47357293868921774, + "grad_norm": 0.7700513601303101, + "learning_rate": 4.94389998070346e-06, + "loss": 1.3523839712142944, + "step": 224 + }, + { + "epoch": 0.47780126849894294, + "grad_norm": 0.4727860689163208, + "learning_rate": 4.9421407698065546e-06, + "loss": 1.262749433517456, + "step": 226 + }, + { + "epoch": 0.4820295983086681, + "grad_norm": 0.7402790188789368, + "learning_rate": 4.940354759108031e-06, + "loss": 1.2572187185287476, + "step": 228 + }, + { + "epoch": 0.48625792811839325, + "grad_norm": 0.5866014361381531, + "learning_rate": 4.938541970442585e-06, + "loss": 0.9033302068710327, + "step": 230 + }, + { + "epoch": 0.4904862579281184, + "grad_norm": 0.6419193148612976, + "learning_rate": 4.9367024259722866e-06, + "loss": 1.2711232900619507, + "step": 232 + }, + { + "epoch": 0.49471458773784355, + "grad_norm": 1.1681197881698608, + "learning_rate": 4.934836148186306e-06, + "loss": 0.9501933455467224, + "step": 234 + }, + { + "epoch": 0.4989429175475687, + "grad_norm": 7.673317909240723, + "learning_rate": 4.93294315990064e-06, + "loss": 0.8862608075141907, + "step": 236 + }, + { + "epoch": 0.5031712473572939, + "grad_norm": 1.019538402557373, + "learning_rate": 4.93102348425783e-06, + "loss": 0.8333485722541809, + "step": 238 + }, + { + "epoch": 0.507399577167019, + "grad_norm": 0.6142158508300781, + "learning_rate": 4.9290771447266815e-06, + "loss": 0.8960846066474915, + "step": 240 + }, + { + "epoch": 0.5116279069767442, + "grad_norm": 0.829379677772522, + "learning_rate": 4.927104165101979e-06, + "loss": 1.3168963193893433, + "step": 242 + }, + { + "epoch": 0.5158562367864693, + "grad_norm": 1.2920678853988647, + "learning_rate": 4.925104569504188e-06, + "loss": 1.365329623222351, + "step": 244 + }, + { + "epoch": 0.5200845665961945, + "grad_norm": 1.2702393531799316, + "learning_rate": 4.923078382379172e-06, + "loss": 1.2854634523391724, + "step": 246 + }, + { + "epoch": 0.5243128964059197, + "grad_norm": 0.618291974067688, + "learning_rate": 4.921025628497879e-06, + "loss": 1.2556568384170532, + "step": 248 + }, + { + "epoch": 0.5285412262156448, + "grad_norm": 1.7209643125534058, + "learning_rate": 4.918946332956052e-06, + "loss": 1.1718345880508423, + "step": 250 + }, + { + "epoch": 0.53276955602537, + "grad_norm": 2.5865588188171387, + "learning_rate": 4.916840521173914e-06, + "loss": 1.1015582084655762, + "step": 252 + }, + { + "epoch": 0.5369978858350951, + "grad_norm": 1.6121222972869873, + "learning_rate": 4.914708218895861e-06, + "loss": 1.742082118988037, + "step": 254 + }, + { + "epoch": 0.5412262156448203, + "grad_norm": 0.8724984526634216, + "learning_rate": 4.912549452190142e-06, + "loss": 1.2257004976272583, + "step": 256 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 0.8018029928207397, + "learning_rate": 4.9103642474485506e-06, + "loss": 1.404122233390808, + "step": 258 + }, + { + "epoch": 0.5496828752642706, + "grad_norm": 0.7809526920318604, + "learning_rate": 4.908152631386091e-06, + "loss": 1.0011447668075562, + "step": 260 + }, + { + "epoch": 0.5539112050739958, + "grad_norm": 0.903286874294281, + "learning_rate": 4.905914631040658e-06, + "loss": 1.2129504680633545, + "step": 262 + }, + { + "epoch": 0.5581395348837209, + "grad_norm": 0.8726590871810913, + "learning_rate": 4.9036502737727055e-06, + "loss": 1.0686239004135132, + "step": 264 + }, + { + "epoch": 0.5623678646934461, + "grad_norm": 1.0852851867675781, + "learning_rate": 4.901359587264911e-06, + "loss": 1.510224461555481, + "step": 266 + }, + { + "epoch": 0.5665961945031712, + "grad_norm": 1.2106469869613647, + "learning_rate": 4.899042599521836e-06, + "loss": 0.5239309072494507, + "step": 268 + }, + { + "epoch": 0.5708245243128964, + "grad_norm": 1.205407738685608, + "learning_rate": 4.8966993388695886e-06, + "loss": 1.0271662473678589, + "step": 270 + }, + { + "epoch": 0.5750528541226215, + "grad_norm": 2.216895580291748, + "learning_rate": 4.894329833955471e-06, + "loss": 1.2076795101165771, + "step": 272 + }, + { + "epoch": 0.5792811839323467, + "grad_norm": 1.1767100095748901, + "learning_rate": 4.891934113747631e-06, + "loss": 0.9579524993896484, + "step": 274 + }, + { + "epoch": 0.5835095137420718, + "grad_norm": 1.0378605127334595, + "learning_rate": 4.8895122075347135e-06, + "loss": 0.9333509206771851, + "step": 276 + }, + { + "epoch": 0.587737843551797, + "grad_norm": 0.2679741382598877, + "learning_rate": 4.887064144925493e-06, + "loss": 0.8527027368545532, + "step": 278 + }, + { + "epoch": 0.5919661733615222, + "grad_norm": 0.6713143587112427, + "learning_rate": 4.8845899558485185e-06, + "loss": 1.2377649545669556, + "step": 280 + }, + { + "epoch": 0.5961945031712473, + "grad_norm": 0.8605502247810364, + "learning_rate": 4.8820896705517465e-06, + "loss": 1.4566680192947388, + "step": 282 + }, + { + "epoch": 0.6004228329809725, + "grad_norm": 0.1871010959148407, + "learning_rate": 4.879563319602169e-06, + "loss": 0.9204542636871338, + "step": 284 + }, + { + "epoch": 0.6046511627906976, + "grad_norm": 1.0409096479415894, + "learning_rate": 4.87701093388544e-06, + "loss": 1.2875986099243164, + "step": 286 + }, + { + "epoch": 0.6088794926004228, + "grad_norm": 1.0819401741027832, + "learning_rate": 4.874432544605502e-06, + "loss": 0.4104747176170349, + "step": 288 + }, + { + "epoch": 0.6131078224101479, + "grad_norm": 1.2147349119186401, + "learning_rate": 4.871828183284199e-06, + "loss": 0.9401180744171143, + "step": 290 + }, + { + "epoch": 0.6173361522198731, + "grad_norm": 0.9073833227157593, + "learning_rate": 4.869197881760896e-06, + "loss": 0.881571888923645, + "step": 292 + }, + { + "epoch": 0.6215644820295984, + "grad_norm": 1.9964375495910645, + "learning_rate": 4.866541672192082e-06, + "loss": 0.7248478531837463, + "step": 294 + }, + { + "epoch": 0.6257928118393234, + "grad_norm": 0.8532471656799316, + "learning_rate": 4.863859587050991e-06, + "loss": 0.7459216117858887, + "step": 296 + }, + { + "epoch": 0.6300211416490487, + "grad_norm": 1.436072587966919, + "learning_rate": 4.861151659127188e-06, + "loss": 1.300452709197998, + "step": 298 + }, + { + "epoch": 0.6342494714587738, + "grad_norm": 0.7239471673965454, + "learning_rate": 4.85841792152618e-06, + "loss": 1.2527419328689575, + "step": 300 + }, + { + "epoch": 0.638477801268499, + "grad_norm": 0.6468306183815002, + "learning_rate": 4.85565840766901e-06, + "loss": 0.6989489197731018, + "step": 302 + }, + { + "epoch": 0.642706131078224, + "grad_norm": 0.8453686237335205, + "learning_rate": 4.852873151291841e-06, + "loss": 0.8262038230895996, + "step": 304 + }, + { + "epoch": 0.6469344608879493, + "grad_norm": 1.1085318326950073, + "learning_rate": 4.850062186445552e-06, + "loss": 0.9046404361724854, + "step": 306 + }, + { + "epoch": 0.6511627906976745, + "grad_norm": 1.146599531173706, + "learning_rate": 4.847225547495318e-06, + "loss": 1.2455283403396606, + "step": 308 + }, + { + "epoch": 0.6553911205073996, + "grad_norm": 1.3924742937088013, + "learning_rate": 4.84436326912019e-06, + "loss": 1.2206207513809204, + "step": 310 + }, + { + "epoch": 0.6596194503171248, + "grad_norm": 0.65780109167099, + "learning_rate": 4.84147538631267e-06, + "loss": 1.2247376441955566, + "step": 312 + }, + { + "epoch": 0.6638477801268499, + "grad_norm": 1.4019877910614014, + "learning_rate": 4.8385619343782865e-06, + "loss": 1.2421458959579468, + "step": 314 + }, + { + "epoch": 0.6680761099365751, + "grad_norm": 0.5540094375610352, + "learning_rate": 4.835622948935159e-06, + "loss": 1.0704643726348877, + "step": 316 + }, + { + "epoch": 0.6723044397463002, + "grad_norm": 4.01638126373291, + "learning_rate": 4.832658465913566e-06, + "loss": 0.7506370544433594, + "step": 318 + }, + { + "epoch": 0.6765327695560254, + "grad_norm": 0.8524858355522156, + "learning_rate": 4.829668521555503e-06, + "loss": 1.2541189193725586, + "step": 320 + }, + { + "epoch": 0.6807610993657506, + "grad_norm": 0.789856493473053, + "learning_rate": 4.826653152414242e-06, + "loss": 1.31632661819458, + "step": 322 + }, + { + "epoch": 0.6849894291754757, + "grad_norm": 0.6171086430549622, + "learning_rate": 4.823612395353881e-06, + "loss": 1.0809494256973267, + "step": 324 + }, + { + "epoch": 0.6892177589852009, + "grad_norm": 0.7493656873703003, + "learning_rate": 4.820546287548897e-06, + "loss": 1.2742823362350464, + "step": 326 + }, + { + "epoch": 0.693446088794926, + "grad_norm": 1.6876893043518066, + "learning_rate": 4.81745486648369e-06, + "loss": 1.1811712980270386, + "step": 328 + }, + { + "epoch": 0.6976744186046512, + "grad_norm": 3.2828762531280518, + "learning_rate": 4.814338169952125e-06, + "loss": 0.8377833366394043, + "step": 330 + }, + { + "epoch": 0.7019027484143763, + "grad_norm": 1.0052536725997925, + "learning_rate": 4.811196236057068e-06, + "loss": 1.3030086755752563, + "step": 332 + }, + { + "epoch": 0.7061310782241015, + "grad_norm": 1.0873143672943115, + "learning_rate": 4.808029103209925e-06, + "loss": 1.2012561559677124, + "step": 334 + }, + { + "epoch": 0.7103594080338267, + "grad_norm": 0.6253702640533447, + "learning_rate": 4.804836810130165e-06, + "loss": 1.2230525016784668, + "step": 336 + }, + { + "epoch": 0.7145877378435518, + "grad_norm": 0.9542647004127502, + "learning_rate": 4.801619395844855e-06, + "loss": 1.3592028617858887, + "step": 338 + }, + { + "epoch": 0.718816067653277, + "grad_norm": 2.7538552284240723, + "learning_rate": 4.798376899688178e-06, + "loss": 1.2697663307189941, + "step": 340 + }, + { + "epoch": 0.7230443974630021, + "grad_norm": 1.300477147102356, + "learning_rate": 4.79510936130095e-06, + "loss": 1.1157526969909668, + "step": 342 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 1.0564080476760864, + "learning_rate": 4.791816820630143e-06, + "loss": 0.9000387191772461, + "step": 344 + }, + { + "epoch": 0.7315010570824524, + "grad_norm": 6.434342861175537, + "learning_rate": 4.788499317928387e-06, + "loss": 0.8705897927284241, + "step": 346 + }, + { + "epoch": 0.7357293868921776, + "grad_norm": 1.8239296674728394, + "learning_rate": 4.785156893753487e-06, + "loss": 0.9839805364608765, + "step": 348 + }, + { + "epoch": 0.7399577167019028, + "grad_norm": 1.4335103034973145, + "learning_rate": 4.781789588967922e-06, + "loss": 1.3093687295913696, + "step": 350 + }, + { + "epoch": 0.7441860465116279, + "grad_norm": 1.6742173433303833, + "learning_rate": 4.778397444738344e-06, + "loss": 1.1608158349990845, + "step": 352 + }, + { + "epoch": 0.7484143763213531, + "grad_norm": 7.366899013519287, + "learning_rate": 4.774980502535081e-06, + "loss": 0.7054665088653564, + "step": 354 + }, + { + "epoch": 0.7526427061310782, + "grad_norm": 1.3835808038711548, + "learning_rate": 4.771538804131623e-06, + "loss": 1.0112260580062866, + "step": 356 + }, + { + "epoch": 0.7568710359408034, + "grad_norm": 0.7247501015663147, + "learning_rate": 4.7680723916041145e-06, + "loss": 1.21829092502594, + "step": 358 + }, + { + "epoch": 0.7610993657505285, + "grad_norm": 1.7645400762557983, + "learning_rate": 4.764581307330844e-06, + "loss": 0.8012920618057251, + "step": 360 + }, + { + "epoch": 0.7653276955602537, + "grad_norm": 0.9456394910812378, + "learning_rate": 4.761065593991716e-06, + "loss": 1.0871394872665405, + "step": 362 + }, + { + "epoch": 0.7695560253699789, + "grad_norm": 1.7007086277008057, + "learning_rate": 4.757525294567743e-06, + "loss": 1.0711324214935303, + "step": 364 + }, + { + "epoch": 0.773784355179704, + "grad_norm": 1.7648683786392212, + "learning_rate": 4.753960452340503e-06, + "loss": 1.2688275575637817, + "step": 366 + }, + { + "epoch": 0.7780126849894292, + "grad_norm": 2.573831796646118, + "learning_rate": 4.750371110891628e-06, + "loss": 1.2218682765960693, + "step": 368 + }, + { + "epoch": 0.7822410147991543, + "grad_norm": 1.4216328859329224, + "learning_rate": 4.746757314102258e-06, + "loss": 0.882118821144104, + "step": 370 + }, + { + "epoch": 0.7864693446088795, + "grad_norm": 0.9801198840141296, + "learning_rate": 4.74311910615251e-06, + "loss": 1.1977894306182861, + "step": 372 + }, + { + "epoch": 0.7906976744186046, + "grad_norm": 0.9408032894134521, + "learning_rate": 4.739456531520939e-06, + "loss": 1.218635082244873, + "step": 374 + }, + { + "epoch": 0.7949260042283298, + "grad_norm": 0.8495731949806213, + "learning_rate": 4.735769634983991e-06, + "loss": 1.3023980855941772, + "step": 376 + }, + { + "epoch": 0.7991543340380549, + "grad_norm": 0.7672894597053528, + "learning_rate": 4.732058461615457e-06, + "loss": 0.9807602763175964, + "step": 378 + }, + { + "epoch": 0.8033826638477801, + "grad_norm": 1.1932101249694824, + "learning_rate": 4.728323056785922e-06, + "loss": 1.3166661262512207, + "step": 380 + }, + { + "epoch": 0.8076109936575053, + "grad_norm": 0.8716092109680176, + "learning_rate": 4.724563466162212e-06, + "loss": 1.1811023950576782, + "step": 382 + }, + { + "epoch": 0.8118393234672304, + "grad_norm": 0.7233268618583679, + "learning_rate": 4.7207797357068325e-06, + "loss": 0.9482329487800598, + "step": 384 + }, + { + "epoch": 0.8160676532769556, + "grad_norm": 1.9323195219039917, + "learning_rate": 4.716971911677408e-06, + "loss": 0.9550711512565613, + "step": 386 + }, + { + "epoch": 0.8202959830866807, + "grad_norm": 2.0255048274993896, + "learning_rate": 4.713140040626116e-06, + "loss": 1.4793070554733276, + "step": 388 + }, + { + "epoch": 0.8245243128964059, + "grad_norm": 3.492385149002075, + "learning_rate": 4.709284169399122e-06, + "loss": 1.1643321514129639, + "step": 390 + }, + { + "epoch": 0.828752642706131, + "grad_norm": 1.8576074838638306, + "learning_rate": 4.7054043451359995e-06, + "loss": 0.9359977841377258, + "step": 392 + }, + { + "epoch": 0.8329809725158562, + "grad_norm": 2.6958370208740234, + "learning_rate": 4.70150061526916e-06, + "loss": 1.2630811929702759, + "step": 394 + }, + { + "epoch": 0.8372093023255814, + "grad_norm": 0.2705208957195282, + "learning_rate": 4.6975730275232675e-06, + "loss": 0.7544412612915039, + "step": 396 + }, + { + "epoch": 0.8414376321353065, + "grad_norm": 0.574475109577179, + "learning_rate": 4.693621629914662e-06, + "loss": 0.6635357737541199, + "step": 398 + }, + { + "epoch": 0.8456659619450317, + "grad_norm": 3.4476590156555176, + "learning_rate": 4.689646470750765e-06, + "loss": 1.1272733211517334, + "step": 400 + }, + { + "epoch": 0.8498942917547568, + "grad_norm": 1.7214937210083008, + "learning_rate": 4.685647598629496e-06, + "loss": 0.9259978532791138, + "step": 402 + }, + { + "epoch": 0.854122621564482, + "grad_norm": 0.8425617218017578, + "learning_rate": 4.681625062438672e-06, + "loss": 0.8047032952308655, + "step": 404 + }, + { + "epoch": 0.8583509513742071, + "grad_norm": 0.5278515219688416, + "learning_rate": 4.677578911355415e-06, + "loss": 0.9893290996551514, + "step": 406 + }, + { + "epoch": 0.8625792811839323, + "grad_norm": 1.7615666389465332, + "learning_rate": 4.673509194845547e-06, + "loss": 1.0258289575576782, + "step": 408 + }, + { + "epoch": 0.8668076109936576, + "grad_norm": 4.036815643310547, + "learning_rate": 4.669415962662987e-06, + "loss": 0.8945714235305786, + "step": 410 + }, + { + "epoch": 0.8710359408033826, + "grad_norm": 1.1441407203674316, + "learning_rate": 4.665299264849144e-06, + "loss": 1.1385798454284668, + "step": 412 + }, + { + "epoch": 0.8752642706131079, + "grad_norm": 0.9861418604850769, + "learning_rate": 4.661159151732302e-06, + "loss": 1.000221848487854, + "step": 414 + }, + { + "epoch": 0.879492600422833, + "grad_norm": 1.6568901538848877, + "learning_rate": 4.656995673927008e-06, + "loss": 1.1056493520736694, + "step": 416 + }, + { + "epoch": 0.8837209302325582, + "grad_norm": 1.9883769750595093, + "learning_rate": 4.6528088823334485e-06, + "loss": 1.2290613651275635, + "step": 418 + }, + { + "epoch": 0.8879492600422833, + "grad_norm": 2.026019334793091, + "learning_rate": 4.648598828136836e-06, + "loss": 1.2732092142105103, + "step": 420 + }, + { + "epoch": 0.8921775898520085, + "grad_norm": 2.332921266555786, + "learning_rate": 4.644365562806772e-06, + "loss": 0.9564085006713867, + "step": 422 + }, + { + "epoch": 0.8964059196617337, + "grad_norm": 1.1014127731323242, + "learning_rate": 4.6401091380966276e-06, + "loss": 1.2294795513153076, + "step": 424 + }, + { + "epoch": 0.9006342494714588, + "grad_norm": 0.6077444553375244, + "learning_rate": 4.635829606042904e-06, + "loss": 1.0533849000930786, + "step": 426 + }, + { + "epoch": 0.904862579281184, + "grad_norm": 1.495557427406311, + "learning_rate": 4.6315270189645994e-06, + "loss": 0.870442807674408, + "step": 428 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.9370867609977722, + "learning_rate": 4.627201429462571e-06, + "loss": 1.0831764936447144, + "step": 430 + }, + { + "epoch": 0.9133192389006343, + "grad_norm": 1.3776339292526245, + "learning_rate": 4.622852890418887e-06, + "loss": 1.2492940425872803, + "step": 432 + }, + { + "epoch": 0.9175475687103594, + "grad_norm": 1.6412044763565063, + "learning_rate": 4.618481454996184e-06, + "loss": 0.5277518033981323, + "step": 434 + }, + { + "epoch": 0.9217758985200846, + "grad_norm": 1.8212065696716309, + "learning_rate": 4.614087176637018e-06, + "loss": 0.39484813809394836, + "step": 436 + }, + { + "epoch": 0.9260042283298098, + "grad_norm": 0.5595113039016724, + "learning_rate": 4.6096701090632064e-06, + "loss": 0.9221642017364502, + "step": 438 + }, + { + "epoch": 0.9302325581395349, + "grad_norm": 0.5767474174499512, + "learning_rate": 4.605230306275174e-06, + "loss": 1.1318392753601074, + "step": 440 + }, + { + "epoch": 0.9344608879492601, + "grad_norm": 7.54542875289917, + "learning_rate": 4.600767822551295e-06, + "loss": 0.7188118100166321, + "step": 442 + }, + { + "epoch": 0.9386892177589852, + "grad_norm": 1.233697772026062, + "learning_rate": 4.596282712447225e-06, + "loss": 1.243707299232483, + "step": 444 + }, + { + "epoch": 0.9429175475687104, + "grad_norm": 0.9562466144561768, + "learning_rate": 4.591775030795238e-06, + "loss": 1.0868984460830688, + "step": 446 + }, + { + "epoch": 0.9471458773784355, + "grad_norm": 1.5018267631530762, + "learning_rate": 4.587244832703551e-06, + "loss": 1.1005150079727173, + "step": 448 + }, + { + "epoch": 0.9513742071881607, + "grad_norm": 1.9610888957977295, + "learning_rate": 4.582692173555658e-06, + "loss": 0.7627214193344116, + "step": 450 + }, + { + "epoch": 0.9556025369978859, + "grad_norm": 0.661365807056427, + "learning_rate": 4.5781171090096456e-06, + "loss": 1.0607075691223145, + "step": 452 + }, + { + "epoch": 0.959830866807611, + "grad_norm": 1.50721275806427, + "learning_rate": 4.573519694997514e-06, + "loss": 1.3157492876052856, + "step": 454 + }, + { + "epoch": 0.9640591966173362, + "grad_norm": 5.258788585662842, + "learning_rate": 4.568899987724499e-06, + "loss": 0.6505974531173706, + "step": 456 + }, + { + "epoch": 0.9682875264270613, + "grad_norm": 2.9857466220855713, + "learning_rate": 4.564258043668378e-06, + "loss": 0.8859183192253113, + "step": 458 + }, + { + "epoch": 0.9725158562367865, + "grad_norm": 1.149194598197937, + "learning_rate": 4.559593919578779e-06, + "loss": 1.232746958732605, + "step": 460 + }, + { + "epoch": 0.9767441860465116, + "grad_norm": 1.1455705165863037, + "learning_rate": 4.554907672476498e-06, + "loss": 1.2240073680877686, + "step": 462 + }, + { + "epoch": 0.9809725158562368, + "grad_norm": 2.103093385696411, + "learning_rate": 4.550199359652783e-06, + "loss": 0.6853596568107605, + "step": 464 + }, + { + "epoch": 0.985200845665962, + "grad_norm": 2.1419098377227783, + "learning_rate": 4.5454690386686525e-06, + "loss": 1.2064260244369507, + "step": 466 + }, + { + "epoch": 0.9894291754756871, + "grad_norm": 2.1042675971984863, + "learning_rate": 4.540716767354182e-06, + "loss": 0.9678149819374084, + "step": 468 + }, + { + "epoch": 0.9936575052854123, + "grad_norm": 1.194627046585083, + "learning_rate": 4.5359426038077955e-06, + "loss": 1.2596162557601929, + "step": 470 + }, + { + "epoch": 0.9978858350951374, + "grad_norm": 0.7306416034698486, + "learning_rate": 4.531146606395561e-06, + "loss": 1.2588738203048706, + "step": 472 + }, + { + "epoch": 1.0021141649048626, + "grad_norm": 1.3689743280410767, + "learning_rate": 4.5263288337504755e-06, + "loss": 0.9573943614959717, + "step": 474 + }, + { + "epoch": 1.0063424947145878, + "grad_norm": 0.6775557994842529, + "learning_rate": 4.521489344771744e-06, + "loss": 1.2035043239593506, + "step": 476 + }, + { + "epoch": 1.0105708245243128, + "grad_norm": 2.428388833999634, + "learning_rate": 4.516628198624062e-06, + "loss": 0.43922215700149536, + "step": 478 + }, + { + "epoch": 1.014799154334038, + "grad_norm": 1.7859880924224854, + "learning_rate": 4.511745454736895e-06, + "loss": 0.8049924969673157, + "step": 480 + }, + { + "epoch": 1.0190274841437632, + "grad_norm": 0.9620187878608704, + "learning_rate": 4.506841172803751e-06, + "loss": 0.7076442241668701, + "step": 482 + }, + { + "epoch": 1.0232558139534884, + "grad_norm": 0.7065964341163635, + "learning_rate": 4.501915412781443e-06, + "loss": 1.0704156160354614, + "step": 484 + }, + { + "epoch": 1.0274841437632136, + "grad_norm": 1.055808663368225, + "learning_rate": 4.49696823488937e-06, + "loss": 1.1018344163894653, + "step": 486 + }, + { + "epoch": 1.0317124735729386, + "grad_norm": 1.521101474761963, + "learning_rate": 4.491999699608768e-06, + "loss": 0.8694652915000916, + "step": 488 + }, + { + "epoch": 1.0359408033826638, + "grad_norm": 1.305381178855896, + "learning_rate": 4.487009867681976e-06, + "loss": 0.8501845002174377, + "step": 490 + }, + { + "epoch": 1.040169133192389, + "grad_norm": 2.744489908218384, + "learning_rate": 4.4819988001116935e-06, + "loss": 0.7224630117416382, + "step": 492 + }, + { + "epoch": 1.0443974630021142, + "grad_norm": 1.4060019254684448, + "learning_rate": 4.476966558160237e-06, + "loss": 1.1804600954055786, + "step": 494 + }, + { + "epoch": 1.0486257928118394, + "grad_norm": 0.8754310011863708, + "learning_rate": 4.4719132033487845e-06, + "loss": 0.997734010219574, + "step": 496 + }, + { + "epoch": 1.0528541226215644, + "grad_norm": 0.615909993648529, + "learning_rate": 4.46683879745663e-06, + "loss": 0.8599073886871338, + "step": 498 + }, + { + "epoch": 1.0570824524312896, + "grad_norm": 0.9889487028121948, + "learning_rate": 4.461743402520423e-06, + "loss": 0.8792165517807007, + "step": 500 + }, + { + "epoch": 1.0613107822410148, + "grad_norm": 0.7160633206367493, + "learning_rate": 4.456627080833414e-06, + "loss": 1.1756080389022827, + "step": 502 + }, + { + "epoch": 1.06553911205074, + "grad_norm": 0.8279690742492676, + "learning_rate": 4.451489894944691e-06, + "loss": 1.1408627033233643, + "step": 504 + }, + { + "epoch": 1.069767441860465, + "grad_norm": 1.4028804302215576, + "learning_rate": 4.446331907658416e-06, + "loss": 0.8269267678260803, + "step": 506 + }, + { + "epoch": 1.0739957716701902, + "grad_norm": 0.851521909236908, + "learning_rate": 4.441153182033057e-06, + "loss": 0.8282674551010132, + "step": 508 + }, + { + "epoch": 1.0782241014799154, + "grad_norm": 4.568781852722168, + "learning_rate": 4.435953781380613e-06, + "loss": 0.9908189177513123, + "step": 510 + }, + { + "epoch": 1.0824524312896406, + "grad_norm": 1.2083165645599365, + "learning_rate": 4.430733769265846e-06, + "loss": 1.0665321350097656, + "step": 512 + }, + { + "epoch": 1.0866807610993658, + "grad_norm": 0.9603208303451538, + "learning_rate": 4.425493209505503e-06, + "loss": 1.1846468448638916, + "step": 514 + }, + { + "epoch": 1.0909090909090908, + "grad_norm": 2.0771138668060303, + "learning_rate": 4.420232166167531e-06, + "loss": 0.920912504196167, + "step": 516 + }, + { + "epoch": 1.095137420718816, + "grad_norm": 0.8328940868377686, + "learning_rate": 4.414950703570299e-06, + "loss": 0.9249328374862671, + "step": 518 + }, + { + "epoch": 1.0993657505285412, + "grad_norm": 1.3584532737731934, + "learning_rate": 4.40964888628181e-06, + "loss": 0.6281458139419556, + "step": 520 + }, + { + "epoch": 1.1035940803382664, + "grad_norm": 0.7690941095352173, + "learning_rate": 4.404326779118909e-06, + "loss": 1.4201838970184326, + "step": 522 + }, + { + "epoch": 1.1078224101479917, + "grad_norm": 1.4306553602218628, + "learning_rate": 4.398984447146496e-06, + "loss": 0.7664209604263306, + "step": 524 + }, + { + "epoch": 1.1120507399577166, + "grad_norm": 0.9326035976409912, + "learning_rate": 4.393621955676723e-06, + "loss": 1.3979065418243408, + "step": 526 + }, + { + "epoch": 1.1162790697674418, + "grad_norm": 0.5948351621627808, + "learning_rate": 4.3882393702682046e-06, + "loss": 0.8897819519042969, + "step": 528 + }, + { + "epoch": 1.120507399577167, + "grad_norm": 18.87567138671875, + "learning_rate": 4.38283675672521e-06, + "loss": 0.4860161244869232, + "step": 530 + }, + { + "epoch": 1.1247357293868923, + "grad_norm": 0.7055052518844604, + "learning_rate": 4.377414181096859e-06, + "loss": 1.1043274402618408, + "step": 532 + }, + { + "epoch": 1.1289640591966172, + "grad_norm": 0.9923433065414429, + "learning_rate": 4.371971709676319e-06, + "loss": 0.8963814973831177, + "step": 534 + }, + { + "epoch": 1.1331923890063424, + "grad_norm": 0.6690739393234253, + "learning_rate": 4.366509408999988e-06, + "loss": 0.8666636347770691, + "step": 536 + }, + { + "epoch": 1.1374207188160677, + "grad_norm": 1.7911560535430908, + "learning_rate": 4.361027345846687e-06, + "loss": 0.7381163239479065, + "step": 538 + }, + { + "epoch": 1.1416490486257929, + "grad_norm": 2.775190591812134, + "learning_rate": 4.355525587236841e-06, + "loss": 0.803221583366394, + "step": 540 + }, + { + "epoch": 1.145877378435518, + "grad_norm": 0.5926763415336609, + "learning_rate": 4.350004200431658e-06, + "loss": 1.1303699016571045, + "step": 542 + }, + { + "epoch": 1.150105708245243, + "grad_norm": 1.6176501512527466, + "learning_rate": 4.344463252932312e-06, + "loss": 0.7936561107635498, + "step": 544 + }, + { + "epoch": 1.1543340380549683, + "grad_norm": 0.7994486689567566, + "learning_rate": 4.33890281247911e-06, + "loss": 1.1952998638153076, + "step": 546 + }, + { + "epoch": 1.1585623678646935, + "grad_norm": 5.03057861328125, + "learning_rate": 4.333322947050673e-06, + "loss": 1.3116034269332886, + "step": 548 + }, + { + "epoch": 1.1627906976744187, + "grad_norm": 1.1229743957519531, + "learning_rate": 4.3277237248630946e-06, + "loss": 0.8429150581359863, + "step": 550 + }, + { + "epoch": 1.1670190274841437, + "grad_norm": 1.2708096504211426, + "learning_rate": 4.3221052143691185e-06, + "loss": 1.1472980976104736, + "step": 552 + }, + { + "epoch": 1.1712473572938689, + "grad_norm": 1.303392767906189, + "learning_rate": 4.316467484257291e-06, + "loss": 1.1732940673828125, + "step": 554 + }, + { + "epoch": 1.175475687103594, + "grad_norm": 1.490607738494873, + "learning_rate": 4.310810603451128e-06, + "loss": 1.079361915588379, + "step": 556 + }, + { + "epoch": 1.1797040169133193, + "grad_norm": 0.6570778489112854, + "learning_rate": 4.30513464110827e-06, + "loss": 1.1126495599746704, + "step": 558 + }, + { + "epoch": 1.1839323467230445, + "grad_norm": 0.64628005027771, + "learning_rate": 4.299439666619637e-06, + "loss": 1.085148811340332, + "step": 560 + }, + { + "epoch": 1.1881606765327695, + "grad_norm": 0.881676435470581, + "learning_rate": 4.293725749608581e-06, + "loss": 0.8194442987442017, + "step": 562 + }, + { + "epoch": 1.1923890063424947, + "grad_norm": 1.4025084972381592, + "learning_rate": 4.287992959930033e-06, + "loss": 1.130499005317688, + "step": 564 + }, + { + "epoch": 1.1966173361522199, + "grad_norm": 1.591407060623169, + "learning_rate": 4.282241367669648e-06, + "loss": 1.0246634483337402, + "step": 566 + }, + { + "epoch": 1.200845665961945, + "grad_norm": 3.8888139724731445, + "learning_rate": 4.276471043142954e-06, + "loss": 1.2712934017181396, + "step": 568 + }, + { + "epoch": 1.20507399577167, + "grad_norm": 2.388948678970337, + "learning_rate": 4.270682056894487e-06, + "loss": 1.2416294813156128, + "step": 570 + }, + { + "epoch": 1.2093023255813953, + "grad_norm": 2.251979351043701, + "learning_rate": 4.264874479696928e-06, + "loss": 1.0589932203292847, + "step": 572 + }, + { + "epoch": 1.2135306553911205, + "grad_norm": 0.7591463923454285, + "learning_rate": 4.2590483825502425e-06, + "loss": 0.8409648537635803, + "step": 574 + }, + { + "epoch": 1.2177589852008457, + "grad_norm": 1.53620183467865, + "learning_rate": 4.25320383668081e-06, + "loss": 0.8636283278465271, + "step": 576 + }, + { + "epoch": 1.221987315010571, + "grad_norm": 1.1595256328582764, + "learning_rate": 4.247340913540548e-06, + "loss": 0.9905154705047607, + "step": 578 + }, + { + "epoch": 1.226215644820296, + "grad_norm": 1.0831875801086426, + "learning_rate": 4.241459684806052e-06, + "loss": 0.8118501305580139, + "step": 580 + }, + { + "epoch": 1.230443974630021, + "grad_norm": 0.7525449395179749, + "learning_rate": 4.235560222377703e-06, + "loss": 1.1561369895935059, + "step": 582 + }, + { + "epoch": 1.2346723044397463, + "grad_norm": 3.329188585281372, + "learning_rate": 4.2296425983788e-06, + "loss": 1.0410387516021729, + "step": 584 + }, + { + "epoch": 1.2389006342494715, + "grad_norm": 1.3844683170318604, + "learning_rate": 4.223706885154674e-06, + "loss": 0.877763569355011, + "step": 586 + }, + { + "epoch": 1.2431289640591967, + "grad_norm": 1.171336054801941, + "learning_rate": 4.217753155271804e-06, + "loss": 0.9664973020553589, + "step": 588 + }, + { + "epoch": 1.2473572938689217, + "grad_norm": 0.8962653279304504, + "learning_rate": 4.21178148151693e-06, + "loss": 0.6594605445861816, + "step": 590 + }, + { + "epoch": 1.251585623678647, + "grad_norm": 0.4513523578643799, + "learning_rate": 4.2057919368961626e-06, + "loss": 0.9009559154510498, + "step": 592 + }, + { + "epoch": 1.255813953488372, + "grad_norm": 1.2183146476745605, + "learning_rate": 4.199784594634091e-06, + "loss": 1.0488721132278442, + "step": 594 + }, + { + "epoch": 1.2600422832980973, + "grad_norm": 0.703557014465332, + "learning_rate": 4.193759528172889e-06, + "loss": 0.760339617729187, + "step": 596 + }, + { + "epoch": 1.2642706131078225, + "grad_norm": 1.2198814153671265, + "learning_rate": 4.187716811171412e-06, + "loss": 1.1317111253738403, + "step": 598 + }, + { + "epoch": 1.2684989429175475, + "grad_norm": 1.99573814868927, + "learning_rate": 4.181656517504306e-06, + "loss": 1.3582342863082886, + "step": 600 + }, + { + "epoch": 1.2727272727272727, + "grad_norm": 0.549845814704895, + "learning_rate": 4.175578721261093e-06, + "loss": 0.9524427056312561, + "step": 602 + }, + { + "epoch": 1.276955602536998, + "grad_norm": 1.245996117591858, + "learning_rate": 4.169483496745277e-06, + "loss": 1.1659082174301147, + "step": 604 + }, + { + "epoch": 1.2811839323467231, + "grad_norm": 2.977156162261963, + "learning_rate": 4.163370918473426e-06, + "loss": 0.790830135345459, + "step": 606 + }, + { + "epoch": 1.285412262156448, + "grad_norm": 0.9513248801231384, + "learning_rate": 4.157241061174261e-06, + "loss": 1.151841640472412, + "step": 608 + }, + { + "epoch": 1.2896405919661733, + "grad_norm": 0.8377892374992371, + "learning_rate": 4.151093999787755e-06, + "loss": 0.7630675435066223, + "step": 610 + }, + { + "epoch": 1.2938689217758985, + "grad_norm": 0.74871826171875, + "learning_rate": 4.144929809464202e-06, + "loss": 0.6663084626197815, + "step": 612 + }, + { + "epoch": 1.2980972515856237, + "grad_norm": 0.7946862578392029, + "learning_rate": 4.138748565563304e-06, + "loss": 0.9356685876846313, + "step": 614 + }, + { + "epoch": 1.302325581395349, + "grad_norm": 1.464992880821228, + "learning_rate": 4.132550343653251e-06, + "loss": 0.49841123819351196, + "step": 616 + }, + { + "epoch": 1.306553911205074, + "grad_norm": 2.7764222621917725, + "learning_rate": 4.1263352195097975e-06, + "loss": 0.921845018863678, + "step": 618 + }, + { + "epoch": 1.3107822410147991, + "grad_norm": 2.9197912216186523, + "learning_rate": 4.120103269115332e-06, + "loss": 1.1502526998519897, + "step": 620 + }, + { + "epoch": 1.3150105708245243, + "grad_norm": 0.5009030103683472, + "learning_rate": 4.113854568657952e-06, + "loss": 1.1119526624679565, + "step": 622 + }, + { + "epoch": 1.3192389006342495, + "grad_norm": 4.491978168487549, + "learning_rate": 4.107589194530532e-06, + "loss": 0.7493167519569397, + "step": 624 + }, + { + "epoch": 1.3234672304439745, + "grad_norm": 0.510420560836792, + "learning_rate": 4.101307223329786e-06, + "loss": 1.1615945100784302, + "step": 626 + }, + { + "epoch": 1.3276955602536997, + "grad_norm": 1.2960412502288818, + "learning_rate": 4.0950087318553375e-06, + "loss": 1.3132972717285156, + "step": 628 + }, + { + "epoch": 1.331923890063425, + "grad_norm": 0.8799285292625427, + "learning_rate": 4.088693797108774e-06, + "loss": 1.0321528911590576, + "step": 630 + }, + { + "epoch": 1.3361522198731501, + "grad_norm": 1.2005505561828613, + "learning_rate": 4.0823624962927104e-06, + "loss": 0.616770327091217, + "step": 632 + }, + { + "epoch": 1.3403805496828753, + "grad_norm": 0.6413878798484802, + "learning_rate": 4.076014906809842e-06, + "loss": 0.747455358505249, + "step": 634 + }, + { + "epoch": 1.3446088794926006, + "grad_norm": 0.6914223432540894, + "learning_rate": 4.069651106262003e-06, + "loss": 0.8139711022377014, + "step": 636 + }, + { + "epoch": 1.3488372093023255, + "grad_norm": 3.342055082321167, + "learning_rate": 4.063271172449209e-06, + "loss": 1.0335206985473633, + "step": 638 + }, + { + "epoch": 1.3530655391120507, + "grad_norm": 2.376635789871216, + "learning_rate": 4.0568751833687155e-06, + "loss": 0.7637988328933716, + "step": 640 + }, + { + "epoch": 1.357293868921776, + "grad_norm": 0.9393727779388428, + "learning_rate": 4.050463217214058e-06, + "loss": 1.218309760093689, + "step": 642 + }, + { + "epoch": 1.361522198731501, + "grad_norm": 4.736209869384766, + "learning_rate": 4.0440353523741e-06, + "loss": 1.1682794094085693, + "step": 644 + }, + { + "epoch": 1.3657505285412261, + "grad_norm": 1.6625150442123413, + "learning_rate": 4.0375916674320694e-06, + "loss": 0.7112323045730591, + "step": 646 + }, + { + "epoch": 1.3699788583509513, + "grad_norm": 1.635366678237915, + "learning_rate": 4.0311322411646045e-06, + "loss": 0.7137230634689331, + "step": 648 + }, + { + "epoch": 1.3742071881606766, + "grad_norm": 1.2800323963165283, + "learning_rate": 4.0246571525407875e-06, + "loss": 0.7801585793495178, + "step": 650 + }, + { + "epoch": 1.3784355179704018, + "grad_norm": 1.4994843006134033, + "learning_rate": 4.018166480721178e-06, + "loss": 0.7897611856460571, + "step": 652 + }, + { + "epoch": 1.382663847780127, + "grad_norm": 0.7120780348777771, + "learning_rate": 4.011660305056846e-06, + "loss": 1.1767425537109375, + "step": 654 + }, + { + "epoch": 1.386892177589852, + "grad_norm": 0.7388160228729248, + "learning_rate": 4.005138705088401e-06, + "loss": 1.0873156785964966, + "step": 656 + }, + { + "epoch": 1.3911205073995772, + "grad_norm": 1.0489729642868042, + "learning_rate": 3.9986017605450265e-06, + "loss": 0.8503063321113586, + "step": 658 + }, + { + "epoch": 1.3953488372093024, + "grad_norm": 0.8119449019432068, + "learning_rate": 3.992049551343493e-06, + "loss": 0.9161325097084045, + "step": 660 + }, + { + "epoch": 1.3995771670190273, + "grad_norm": 0.5929046869277954, + "learning_rate": 3.985482157587192e-06, + "loss": 1.1369270086288452, + "step": 662 + }, + { + "epoch": 1.4038054968287526, + "grad_norm": 0.3672987222671509, + "learning_rate": 3.97889965956515e-06, + "loss": 1.0164310932159424, + "step": 664 + }, + { + "epoch": 1.4080338266384778, + "grad_norm": 1.0386170148849487, + "learning_rate": 3.972302137751051e-06, + "loss": 1.374223232269287, + "step": 666 + }, + { + "epoch": 1.412262156448203, + "grad_norm": 1.0722689628601074, + "learning_rate": 3.9656896728022476e-06, + "loss": 1.12968111038208, + "step": 668 + }, + { + "epoch": 1.4164904862579282, + "grad_norm": 1.196387529373169, + "learning_rate": 3.959062345558782e-06, + "loss": 0.771783173084259, + "step": 670 + }, + { + "epoch": 1.4207188160676534, + "grad_norm": 1.3007256984710693, + "learning_rate": 3.9524202370423915e-06, + "loss": 1.1213726997375488, + "step": 672 + }, + { + "epoch": 1.4249471458773784, + "grad_norm": 0.9761534929275513, + "learning_rate": 3.945763428455523e-06, + "loss": 0.6354954242706299, + "step": 674 + }, + { + "epoch": 1.4291754756871036, + "grad_norm": 1.2106300592422485, + "learning_rate": 3.939092001180332e-06, + "loss": 0.8169525861740112, + "step": 676 + }, + { + "epoch": 1.4334038054968288, + "grad_norm": 0.6862068176269531, + "learning_rate": 3.932406036777701e-06, + "loss": 1.3615213632583618, + "step": 678 + }, + { + "epoch": 1.437632135306554, + "grad_norm": 1.1061359643936157, + "learning_rate": 3.9257056169862305e-06, + "loss": 1.1848570108413696, + "step": 680 + }, + { + "epoch": 1.441860465116279, + "grad_norm": 1.6158320903778076, + "learning_rate": 3.918990823721243e-06, + "loss": 1.1745814085006714, + "step": 682 + }, + { + "epoch": 1.4460887949260042, + "grad_norm": 0.6842957735061646, + "learning_rate": 3.912261739073785e-06, + "loss": 1.106062650680542, + "step": 684 + }, + { + "epoch": 1.4503171247357294, + "grad_norm": 1.5938684940338135, + "learning_rate": 3.905518445309619e-06, + "loss": 1.4594074487686157, + "step": 686 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 0.7108921408653259, + "learning_rate": 3.8987610248682205e-06, + "loss": 1.0741581916809082, + "step": 688 + }, + { + "epoch": 1.4587737843551798, + "grad_norm": 7.655938148498535, + "learning_rate": 3.89198956036177e-06, + "loss": 0.41335436701774597, + "step": 690 + }, + { + "epoch": 1.463002114164905, + "grad_norm": 0.6948283910751343, + "learning_rate": 3.885204134574141e-06, + "loss": 1.146783709526062, + "step": 692 + }, + { + "epoch": 1.46723044397463, + "grad_norm": 0.6634160876274109, + "learning_rate": 3.878404830459889e-06, + "loss": 0.65525221824646, + "step": 694 + }, + { + "epoch": 1.4714587737843552, + "grad_norm": 0.9858572483062744, + "learning_rate": 3.87159173114324e-06, + "loss": 1.1054624319076538, + "step": 696 + }, + { + "epoch": 1.4756871035940804, + "grad_norm": 1.3330109119415283, + "learning_rate": 3.86476491991707e-06, + "loss": 1.1466596126556396, + "step": 698 + }, + { + "epoch": 1.4799154334038054, + "grad_norm": 3.4319090843200684, + "learning_rate": 3.857924480241888e-06, + "loss": 0.9684445261955261, + "step": 700 + }, + { + "epoch": 1.4841437632135306, + "grad_norm": 0.5792906880378723, + "learning_rate": 3.851070495744819e-06, + "loss": 1.1101263761520386, + "step": 702 + }, + { + "epoch": 1.4883720930232558, + "grad_norm": 0.584158718585968, + "learning_rate": 3.8442030502185745e-06, + "loss": 1.0356827974319458, + "step": 704 + }, + { + "epoch": 1.492600422832981, + "grad_norm": 0.7270916700363159, + "learning_rate": 3.837322227620439e-06, + "loss": 0.8322772979736328, + "step": 706 + }, + { + "epoch": 1.4968287526427062, + "grad_norm": 0.28889569640159607, + "learning_rate": 3.830428112071228e-06, + "loss": 0.2769829332828522, + "step": 708 + }, + { + "epoch": 1.5010570824524314, + "grad_norm": 0.7377986907958984, + "learning_rate": 3.823520787854278e-06, + "loss": 0.6088220477104187, + "step": 710 + }, + { + "epoch": 1.5052854122621564, + "grad_norm": 2.107346296310425, + "learning_rate": 3.816600339414402e-06, + "loss": 0.5735040903091431, + "step": 712 + }, + { + "epoch": 1.5095137420718816, + "grad_norm": 0.6663100719451904, + "learning_rate": 3.8096668513568608e-06, + "loss": 0.9799573421478271, + "step": 714 + }, + { + "epoch": 1.5137420718816068, + "grad_norm": 0.7188597917556763, + "learning_rate": 3.8027204084463334e-06, + "loss": 1.1207448244094849, + "step": 716 + }, + { + "epoch": 1.5179704016913318, + "grad_norm": 0.694125771522522, + "learning_rate": 3.795761095605873e-06, + "loss": 1.0090175867080688, + "step": 718 + }, + { + "epoch": 1.522198731501057, + "grad_norm": 0.3084549903869629, + "learning_rate": 3.7887889979158775e-06, + "loss": 0.9819098711013794, + "step": 720 + }, + { + "epoch": 1.5264270613107822, + "grad_norm": 1.8949941396713257, + "learning_rate": 3.7818042006130405e-06, + "loss": 0.8384270071983337, + "step": 722 + }, + { + "epoch": 1.5306553911205074, + "grad_norm": 1.5150532722473145, + "learning_rate": 3.774806789089316e-06, + "loss": 0.9709129929542542, + "step": 724 + }, + { + "epoch": 1.5348837209302326, + "grad_norm": 1.0952752828598022, + "learning_rate": 3.7677968488908705e-06, + "loss": 0.9372836947441101, + "step": 726 + }, + { + "epoch": 1.5391120507399578, + "grad_norm": 1.564868450164795, + "learning_rate": 3.76077446571704e-06, + "loss": 0.6753690242767334, + "step": 728 + }, + { + "epoch": 1.543340380549683, + "grad_norm": 1.170804500579834, + "learning_rate": 3.75373972541928e-06, + "loss": 0.8191190361976624, + "step": 730 + }, + { + "epoch": 1.547568710359408, + "grad_norm": 0.679467499256134, + "learning_rate": 3.746692714000117e-06, + "loss": 1.086642861366272, + "step": 732 + }, + { + "epoch": 1.5517970401691332, + "grad_norm": 0.2902541756629944, + "learning_rate": 3.7396335176120953e-06, + "loss": 0.25046733021736145, + "step": 734 + }, + { + "epoch": 1.5560253699788582, + "grad_norm": 2.038381576538086, + "learning_rate": 3.7325622225567294e-06, + "loss": 1.009968876838684, + "step": 736 + }, + { + "epoch": 1.5602536997885834, + "grad_norm": 0.2496039867401123, + "learning_rate": 3.725478915283439e-06, + "loss": 0.84336918592453, + "step": 738 + }, + { + "epoch": 1.5644820295983086, + "grad_norm": 0.559074878692627, + "learning_rate": 3.7183836823885045e-06, + "loss": 1.1601533889770508, + "step": 740 + }, + { + "epoch": 1.5687103594080338, + "grad_norm": 1.2242622375488281, + "learning_rate": 3.7112766106139964e-06, + "loss": 0.8150052428245544, + "step": 742 + }, + { + "epoch": 1.572938689217759, + "grad_norm": 1.0551347732543945, + "learning_rate": 3.7041577868467242e-06, + "loss": 1.1540948152542114, + "step": 744 + }, + { + "epoch": 1.5771670190274842, + "grad_norm": 2.7716071605682373, + "learning_rate": 3.697027298117168e-06, + "loss": 1.1788626909255981, + "step": 746 + }, + { + "epoch": 1.5813953488372094, + "grad_norm": 1.1499396562576294, + "learning_rate": 3.6898852315984156e-06, + "loss": 1.057762861251831, + "step": 748 + }, + { + "epoch": 1.5856236786469344, + "grad_norm": 0.3814210295677185, + "learning_rate": 3.6827316746051015e-06, + "loss": 0.04337337985634804, + "step": 750 + }, + { + "epoch": 1.5898520084566596, + "grad_norm": 1.3480174541473389, + "learning_rate": 3.675566714592333e-06, + "loss": 0.9101552367210388, + "step": 752 + }, + { + "epoch": 1.5940803382663846, + "grad_norm": 1.1889445781707764, + "learning_rate": 3.6683904391546255e-06, + "loss": 1.2129230499267578, + "step": 754 + }, + { + "epoch": 1.5983086680761098, + "grad_norm": 0.5748162269592285, + "learning_rate": 3.6612029360248285e-06, + "loss": 1.1286925077438354, + "step": 756 + }, + { + "epoch": 1.602536997885835, + "grad_norm": 0.724022626876831, + "learning_rate": 3.6540042930730556e-06, + "loss": 1.1947628259658813, + "step": 758 + }, + { + "epoch": 1.6067653276955602, + "grad_norm": 0.677099347114563, + "learning_rate": 3.6467945983056104e-06, + "loss": 1.1410974264144897, + "step": 760 + }, + { + "epoch": 1.6109936575052854, + "grad_norm": 0.6079980731010437, + "learning_rate": 3.6395739398639057e-06, + "loss": 1.1570736169815063, + "step": 762 + }, + { + "epoch": 1.6152219873150107, + "grad_norm": 0.9599738121032715, + "learning_rate": 3.6323424060233936e-06, + "loss": 1.035282015800476, + "step": 764 + }, + { + "epoch": 1.6194503171247359, + "grad_norm": 1.0322376489639282, + "learning_rate": 3.6251000851924806e-06, + "loss": 0.8392003774642944, + "step": 766 + }, + { + "epoch": 1.6236786469344608, + "grad_norm": 0.708662211894989, + "learning_rate": 3.617847065911447e-06, + "loss": 1.1536966562271118, + "step": 768 + }, + { + "epoch": 1.627906976744186, + "grad_norm": 1.8593244552612305, + "learning_rate": 3.610583436851369e-06, + "loss": 1.0729390382766724, + "step": 770 + }, + { + "epoch": 1.6321353065539113, + "grad_norm": 0.5333645343780518, + "learning_rate": 3.603309286813029e-06, + "loss": 1.1488738059997559, + "step": 772 + }, + { + "epoch": 1.6363636363636362, + "grad_norm": 1.6851012706756592, + "learning_rate": 3.596024704725835e-06, + "loss": 0.9281710386276245, + "step": 774 + }, + { + "epoch": 1.6405919661733614, + "grad_norm": 1.7228329181671143, + "learning_rate": 3.588729779646728e-06, + "loss": 1.158841609954834, + "step": 776 + }, + { + "epoch": 1.6448202959830867, + "grad_norm": 0.9394569396972656, + "learning_rate": 3.581424600759099e-06, + "loss": 0.7341264486312866, + "step": 778 + }, + { + "epoch": 1.6490486257928119, + "grad_norm": 0.6430965065956116, + "learning_rate": 3.5741092573716952e-06, + "loss": 1.096555233001709, + "step": 780 + }, + { + "epoch": 1.653276955602537, + "grad_norm": 1.5148671865463257, + "learning_rate": 3.5667838389175276e-06, + "loss": 0.8284240961074829, + "step": 782 + }, + { + "epoch": 1.6575052854122623, + "grad_norm": 0.6028370261192322, + "learning_rate": 3.55944843495278e-06, + "loss": 1.1990805864334106, + "step": 784 + }, + { + "epoch": 1.6617336152219875, + "grad_norm": 2.5651183128356934, + "learning_rate": 3.5521031351557116e-06, + "loss": 0.4815433621406555, + "step": 786 + }, + { + "epoch": 1.6659619450317125, + "grad_norm": 0.8050721287727356, + "learning_rate": 3.5447480293255666e-06, + "loss": 1.1529608964920044, + "step": 788 + }, + { + "epoch": 1.6701902748414377, + "grad_norm": 0.9118593335151672, + "learning_rate": 3.5373832073814668e-06, + "loss": 0.7648034691810608, + "step": 790 + }, + { + "epoch": 1.6744186046511627, + "grad_norm": 0.8314517736434937, + "learning_rate": 3.5300087593613186e-06, + "loss": 0.5136529207229614, + "step": 792 + }, + { + "epoch": 1.6786469344608879, + "grad_norm": 0.7918019890785217, + "learning_rate": 3.5226247754207138e-06, + "loss": 1.1230441331863403, + "step": 794 + }, + { + "epoch": 1.682875264270613, + "grad_norm": 0.4042631685733795, + "learning_rate": 3.5152313458318206e-06, + "loss": 0.6846147775650024, + "step": 796 + }, + { + "epoch": 1.6871035940803383, + "grad_norm": 1.0725696086883545, + "learning_rate": 3.5078285609822875e-06, + "loss": 1.2035937309265137, + "step": 798 + }, + { + "epoch": 1.6913319238900635, + "grad_norm": 0.5610724687576294, + "learning_rate": 3.5004165113741334e-06, + "loss": 1.1461760997772217, + "step": 800 + }, + { + "epoch": 1.6955602536997887, + "grad_norm": 1.0127768516540527, + "learning_rate": 3.4929952876226414e-06, + "loss": 0.6147741675376892, + "step": 802 + }, + { + "epoch": 1.699788583509514, + "grad_norm": 0.6945735216140747, + "learning_rate": 3.485564980455255e-06, + "loss": 1.1363788843154907, + "step": 804 + }, + { + "epoch": 1.7040169133192389, + "grad_norm": 1.351635217666626, + "learning_rate": 3.478125680710463e-06, + "loss": 0.8326917886734009, + "step": 806 + }, + { + "epoch": 1.708245243128964, + "grad_norm": 1.1634646654129028, + "learning_rate": 3.470677479336695e-06, + "loss": 0.7223104238510132, + "step": 808 + }, + { + "epoch": 1.712473572938689, + "grad_norm": 0.9786092042922974, + "learning_rate": 3.4632204673912034e-06, + "loss": 1.1191296577453613, + "step": 810 + }, + { + "epoch": 1.7167019027484143, + "grad_norm": 2.46586275100708, + "learning_rate": 3.4557547360389577e-06, + "loss": 1.3536570072174072, + "step": 812 + }, + { + "epoch": 1.7209302325581395, + "grad_norm": 0.8146648406982422, + "learning_rate": 3.4482803765515206e-06, + "loss": 1.100825309753418, + "step": 814 + }, + { + "epoch": 1.7251585623678647, + "grad_norm": 0.8478085994720459, + "learning_rate": 3.4407974803059406e-06, + "loss": 1.1602932214736938, + "step": 816 + }, + { + "epoch": 1.72938689217759, + "grad_norm": 0.9965582489967346, + "learning_rate": 3.4333061387836307e-06, + "loss": 0.9386340379714966, + "step": 818 + }, + { + "epoch": 1.733615221987315, + "grad_norm": 2.556925058364868, + "learning_rate": 3.4258064435692507e-06, + "loss": 1.0207256078720093, + "step": 820 + }, + { + "epoch": 1.7378435517970403, + "grad_norm": 1.3679172992706299, + "learning_rate": 3.4182984863495876e-06, + "loss": 0.6849140524864197, + "step": 822 + }, + { + "epoch": 1.7420718816067653, + "grad_norm": 0.4469180405139923, + "learning_rate": 3.410782358912435e-06, + "loss": 0.8242835998535156, + "step": 824 + }, + { + "epoch": 1.7463002114164905, + "grad_norm": 1.4416385889053345, + "learning_rate": 3.403258153145471e-06, + "loss": 0.9483500719070435, + "step": 826 + }, + { + "epoch": 1.7505285412262155, + "grad_norm": 0.6498605608940125, + "learning_rate": 3.3957259610351324e-06, + "loss": 0.9845226407051086, + "step": 828 + }, + { + "epoch": 1.7547568710359407, + "grad_norm": 2.385218620300293, + "learning_rate": 3.388185874665495e-06, + "loss": 0.8091049790382385, + "step": 830 + }, + { + "epoch": 1.758985200845666, + "grad_norm": 0.9289647936820984, + "learning_rate": 3.3806379862171448e-06, + "loss": 1.1820333003997803, + "step": 832 + }, + { + "epoch": 1.763213530655391, + "grad_norm": 1.0489366054534912, + "learning_rate": 3.373082387966048e-06, + "loss": 0.833751916885376, + "step": 834 + }, + { + "epoch": 1.7674418604651163, + "grad_norm": 0.571071982383728, + "learning_rate": 3.365519172282431e-06, + "loss": 0.8406846523284912, + "step": 836 + }, + { + "epoch": 1.7716701902748415, + "grad_norm": 0.5149204730987549, + "learning_rate": 3.357948431629643e-06, + "loss": 1.1610711812973022, + "step": 838 + }, + { + "epoch": 1.7758985200845667, + "grad_norm": 0.5685960054397583, + "learning_rate": 3.3503702585630305e-06, + "loss": 0.8929948806762695, + "step": 840 + }, + { + "epoch": 1.7801268498942917, + "grad_norm": 2.4608681201934814, + "learning_rate": 3.342784745728804e-06, + "loss": 0.4209887683391571, + "step": 842 + }, + { + "epoch": 1.784355179704017, + "grad_norm": 0.6386370062828064, + "learning_rate": 3.3351919858629045e-06, + "loss": 0.7464441061019897, + "step": 844 + }, + { + "epoch": 1.7885835095137421, + "grad_norm": 0.8585013747215271, + "learning_rate": 3.327592071789873e-06, + "loss": 0.9707925319671631, + "step": 846 + }, + { + "epoch": 1.792811839323467, + "grad_norm": 1.5596438646316528, + "learning_rate": 3.3199850964217116e-06, + "loss": 0.7446164488792419, + "step": 848 + }, + { + "epoch": 1.7970401691331923, + "grad_norm": 2.6806228160858154, + "learning_rate": 3.312371152756751e-06, + "loss": 0.7679558396339417, + "step": 850 + }, + { + "epoch": 1.8012684989429175, + "grad_norm": 0.7470750212669373, + "learning_rate": 3.304750333878511e-06, + "loss": 1.0020787715911865, + "step": 852 + }, + { + "epoch": 1.8054968287526427, + "grad_norm": 0.5787929892539978, + "learning_rate": 3.2971227329545634e-06, + "loss": 0.8919803500175476, + "step": 854 + }, + { + "epoch": 1.809725158562368, + "grad_norm": 0.50643390417099, + "learning_rate": 3.2894884432353957e-06, + "loss": 1.0985815525054932, + "step": 856 + }, + { + "epoch": 1.8139534883720931, + "grad_norm": 1.5751054286956787, + "learning_rate": 3.281847558053265e-06, + "loss": 0.6541829109191895, + "step": 858 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.6766167283058167, + "learning_rate": 3.274200170821064e-06, + "loss": 1.0379619598388672, + "step": 860 + }, + { + "epoch": 1.8224101479915433, + "grad_norm": 1.3516942262649536, + "learning_rate": 3.2665463750311727e-06, + "loss": 1.1044809818267822, + "step": 862 + }, + { + "epoch": 1.8266384778012685, + "grad_norm": 1.573708415031433, + "learning_rate": 3.2588862642543208e-06, + "loss": 0.5707927942276001, + "step": 864 + }, + { + "epoch": 1.8308668076109935, + "grad_norm": 0.4704338312149048, + "learning_rate": 3.2512199321384393e-06, + "loss": 0.7981724143028259, + "step": 866 + }, + { + "epoch": 1.8350951374207187, + "grad_norm": 0.5769053101539612, + "learning_rate": 3.243547472407518e-06, + "loss": 1.1399530172348022, + "step": 868 + }, + { + "epoch": 1.839323467230444, + "grad_norm": 0.8416613340377808, + "learning_rate": 3.23586897886046e-06, + "loss": 1.1359026432037354, + "step": 870 + }, + { + "epoch": 1.8435517970401691, + "grad_norm": 1.3577508926391602, + "learning_rate": 3.2281845453699345e-06, + "loss": 0.8569067716598511, + "step": 872 + }, + { + "epoch": 1.8477801268498943, + "grad_norm": 2.056459665298462, + "learning_rate": 3.220494265881227e-06, + "loss": 1.1351348161697388, + "step": 874 + }, + { + "epoch": 1.8520084566596196, + "grad_norm": 2.2590129375457764, + "learning_rate": 3.212798234411095e-06, + "loss": 0.9369499087333679, + "step": 876 + }, + { + "epoch": 1.8562367864693448, + "grad_norm": 1.6008362770080566, + "learning_rate": 3.2050965450466136e-06, + "loss": 0.2906026244163513, + "step": 878 + }, + { + "epoch": 1.8604651162790697, + "grad_norm": 1.1616226434707642, + "learning_rate": 3.197389291944032e-06, + "loss": 0.7301267385482788, + "step": 880 + }, + { + "epoch": 1.864693446088795, + "grad_norm": 0.7197896242141724, + "learning_rate": 3.1896765693276135e-06, + "loss": 1.1232812404632568, + "step": 882 + }, + { + "epoch": 1.86892177589852, + "grad_norm": 0.6383930444717407, + "learning_rate": 3.1819584714884903e-06, + "loss": 0.7655252814292908, + "step": 884 + }, + { + "epoch": 1.8731501057082451, + "grad_norm": 1.0251339673995972, + "learning_rate": 3.1742350927835125e-06, + "loss": 1.121950387954712, + "step": 886 + }, + { + "epoch": 1.8773784355179703, + "grad_norm": 0.6124594807624817, + "learning_rate": 3.1665065276340844e-06, + "loss": 1.1401907205581665, + "step": 888 + }, + { + "epoch": 1.8816067653276956, + "grad_norm": 1.397496223449707, + "learning_rate": 3.158772870525022e-06, + "loss": 0.6092522144317627, + "step": 890 + }, + { + "epoch": 1.8858350951374208, + "grad_norm": 2.1092441082000732, + "learning_rate": 3.1510342160033903e-06, + "loss": 0.8399344086647034, + "step": 892 + }, + { + "epoch": 1.890063424947146, + "grad_norm": 1.1821517944335938, + "learning_rate": 3.1432906586773488e-06, + "loss": 1.114659070968628, + "step": 894 + }, + { + "epoch": 1.8942917547568712, + "grad_norm": 1.5643501281738281, + "learning_rate": 3.135542293214997e-06, + "loss": 1.410881519317627, + "step": 896 + }, + { + "epoch": 1.8985200845665962, + "grad_norm": 0.7187080383300781, + "learning_rate": 3.1277892143432165e-06, + "loss": 1.065239429473877, + "step": 898 + }, + { + "epoch": 1.9027484143763214, + "grad_norm": 1.0004364252090454, + "learning_rate": 3.1200315168465113e-06, + "loss": 0.5023792386054993, + "step": 900 + }, + { + "epoch": 1.9069767441860463, + "grad_norm": 0.5592870116233826, + "learning_rate": 3.1122692955658497e-06, + "loss": 1.107616901397705, + "step": 902 + }, + { + "epoch": 1.9112050739957716, + "grad_norm": 1.719496726989746, + "learning_rate": 3.1045026453975048e-06, + "loss": 0.5772966146469116, + "step": 904 + }, + { + "epoch": 1.9154334038054968, + "grad_norm": 1.4579967260360718, + "learning_rate": 3.096731661291896e-06, + "loss": 0.8818938136100769, + "step": 906 + }, + { + "epoch": 1.919661733615222, + "grad_norm": 0.8083340525627136, + "learning_rate": 3.0889564382524257e-06, + "loss": 1.2467647790908813, + "step": 908 + }, + { + "epoch": 1.9238900634249472, + "grad_norm": 0.5722190737724304, + "learning_rate": 3.08117707133432e-06, + "loss": 0.812706708908081, + "step": 910 + }, + { + "epoch": 1.9281183932346724, + "grad_norm": 0.6764684319496155, + "learning_rate": 3.0733936556434634e-06, + "loss": 1.1728202104568481, + "step": 912 + }, + { + "epoch": 1.9323467230443976, + "grad_norm": 0.9146943688392639, + "learning_rate": 3.0656062863352413e-06, + "loss": 0.7626368999481201, + "step": 914 + }, + { + "epoch": 1.9365750528541226, + "grad_norm": 0.5261140465736389, + "learning_rate": 3.0578150586133704e-06, + "loss": 1.1478456258773804, + "step": 916 + }, + { + "epoch": 1.9408033826638478, + "grad_norm": 0.6775986552238464, + "learning_rate": 3.0500200677287428e-06, + "loss": 0.6973150968551636, + "step": 918 + }, + { + "epoch": 1.945031712473573, + "grad_norm": 1.3343801498413086, + "learning_rate": 3.042221408978251e-06, + "loss": 0.9482506513595581, + "step": 920 + }, + { + "epoch": 1.949260042283298, + "grad_norm": 1.5522212982177734, + "learning_rate": 3.0344191777036312e-06, + "loss": 0.9986613392829895, + "step": 922 + }, + { + "epoch": 1.9534883720930232, + "grad_norm": 0.4814535677433014, + "learning_rate": 3.026613469290298e-06, + "loss": 1.0413583517074585, + "step": 924 + }, + { + "epoch": 1.9577167019027484, + "grad_norm": 0.3166026473045349, + "learning_rate": 3.01880437916617e-06, + "loss": 0.9614431262016296, + "step": 926 + }, + { + "epoch": 1.9619450317124736, + "grad_norm": 1.094480037689209, + "learning_rate": 3.0109920028005135e-06, + "loss": 1.2636445760726929, + "step": 928 + }, + { + "epoch": 1.9661733615221988, + "grad_norm": 0.7118551135063171, + "learning_rate": 3.003176435702767e-06, + "loss": 0.9028820395469666, + "step": 930 + }, + { + "epoch": 1.970401691331924, + "grad_norm": 0.5945522785186768, + "learning_rate": 2.9953577734213775e-06, + "loss": 1.2327357530593872, + "step": 932 + }, + { + "epoch": 1.9746300211416492, + "grad_norm": 0.7517629265785217, + "learning_rate": 2.9875361115426347e-06, + "loss": 0.8936224579811096, + "step": 934 + }, + { + "epoch": 1.9788583509513742, + "grad_norm": 0.9688192009925842, + "learning_rate": 2.979711545689496e-06, + "loss": 0.7812487483024597, + "step": 936 + }, + { + "epoch": 1.9830866807610994, + "grad_norm": 8.303119659423828, + "learning_rate": 2.9718841715204227e-06, + "loss": 0.873395562171936, + "step": 938 + }, + { + "epoch": 1.9873150105708244, + "grad_norm": 0.6607802510261536, + "learning_rate": 2.9640540847282095e-06, + "loss": 1.0979681015014648, + "step": 940 + }, + { + "epoch": 1.9915433403805496, + "grad_norm": 0.6019576787948608, + "learning_rate": 2.956221381038812e-06, + "loss": 1.1199960708618164, + "step": 942 + }, + { + "epoch": 1.9957716701902748, + "grad_norm": 0.7929393649101257, + "learning_rate": 2.94838615621018e-06, + "loss": 1.1161065101623535, + "step": 944 + }, + { + "epoch": 2.0, + "grad_norm": 0.6814373135566711, + "learning_rate": 2.9405485060310857e-06, + "loss": 0.48783794045448303, + "step": 946 + }, + { + "epoch": 2.004228329809725, + "grad_norm": 1.4736313819885254, + "learning_rate": 2.9327085263199507e-06, + "loss": 0.7957913279533386, + "step": 948 + }, + { + "epoch": 2.0084566596194504, + "grad_norm": 0.4455970823764801, + "learning_rate": 2.924866312923677e-06, + "loss": 1.0547270774841309, + "step": 950 + }, + { + "epoch": 2.0126849894291756, + "grad_norm": 0.773058295249939, + "learning_rate": 2.9170219617164735e-06, + "loss": 1.0442657470703125, + "step": 952 + }, + { + "epoch": 2.016913319238901, + "grad_norm": 0.9597894549369812, + "learning_rate": 2.9091755685986866e-06, + "loss": 1.1685289144515991, + "step": 954 + }, + { + "epoch": 2.0211416490486256, + "grad_norm": 0.6969325542449951, + "learning_rate": 2.9013272294956223e-06, + "loss": 1.1930384635925293, + "step": 956 + }, + { + "epoch": 2.025369978858351, + "grad_norm": 0.8082700967788696, + "learning_rate": 2.8934770403563815e-06, + "loss": 0.776046872138977, + "step": 958 + }, + { + "epoch": 2.029598308668076, + "grad_norm": 0.7422521710395813, + "learning_rate": 2.8856250971526788e-06, + "loss": 1.0249298810958862, + "step": 960 + }, + { + "epoch": 2.033826638477801, + "grad_norm": 1.6249040365219116, + "learning_rate": 2.877771495877676e-06, + "loss": 0.9289775490760803, + "step": 962 + }, + { + "epoch": 2.0380549682875264, + "grad_norm": 3.067833185195923, + "learning_rate": 2.869916332544802e-06, + "loss": 0.8100100159645081, + "step": 964 + }, + { + "epoch": 2.0422832980972516, + "grad_norm": 0.724915087223053, + "learning_rate": 2.8620597031865854e-06, + "loss": 0.7401767373085022, + "step": 966 + }, + { + "epoch": 2.046511627906977, + "grad_norm": 2.0869836807250977, + "learning_rate": 2.854201703853477e-06, + "loss": 0.8137513399124146, + "step": 968 + }, + { + "epoch": 2.050739957716702, + "grad_norm": 0.6877044439315796, + "learning_rate": 2.8463424306126743e-06, + "loss": 1.10543692111969, + "step": 970 + }, + { + "epoch": 2.0549682875264272, + "grad_norm": 1.3014296293258667, + "learning_rate": 2.838481979546952e-06, + "loss": 0.5617172122001648, + "step": 972 + }, + { + "epoch": 2.059196617336152, + "grad_norm": 0.9769271016120911, + "learning_rate": 2.83062044675348e-06, + "loss": 1.060500144958496, + "step": 974 + }, + { + "epoch": 2.063424947145877, + "grad_norm": 2.4497523307800293, + "learning_rate": 2.822757928342658e-06, + "loss": 1.075200080871582, + "step": 976 + }, + { + "epoch": 2.0676532769556024, + "grad_norm": 0.8020917177200317, + "learning_rate": 2.814894520436931e-06, + "loss": 1.0989971160888672, + "step": 978 + }, + { + "epoch": 2.0718816067653276, + "grad_norm": 1.6352614164352417, + "learning_rate": 2.807030319169619e-06, + "loss": 0.699384868144989, + "step": 980 + }, + { + "epoch": 2.076109936575053, + "grad_norm": 6.557322978973389, + "learning_rate": 2.7991654206837434e-06, + "loss": 0.7373824119567871, + "step": 982 + }, + { + "epoch": 2.080338266384778, + "grad_norm": 0.7050887942314148, + "learning_rate": 2.7912999211308466e-06, + "loss": 0.8136764168739319, + "step": 984 + }, + { + "epoch": 2.0845665961945032, + "grad_norm": 1.3208609819412231, + "learning_rate": 2.783433916669822e-06, + "loss": 0.9552209973335266, + "step": 986 + }, + { + "epoch": 2.0887949260042284, + "grad_norm": 0.6587861180305481, + "learning_rate": 2.7755675034657336e-06, + "loss": 1.0741578340530396, + "step": 988 + }, + { + "epoch": 2.0930232558139537, + "grad_norm": 1.1716125011444092, + "learning_rate": 2.7677007776886437e-06, + "loss": 1.0747499465942383, + "step": 990 + }, + { + "epoch": 2.097251585623679, + "grad_norm": 1.5702075958251953, + "learning_rate": 2.759833835512435e-06, + "loss": 0.670864999294281, + "step": 992 + }, + { + "epoch": 2.1014799154334036, + "grad_norm": 22.38727569580078, + "learning_rate": 2.7519667731136364e-06, + "loss": 0.7279332280158997, + "step": 994 + }, + { + "epoch": 2.105708245243129, + "grad_norm": 0.8443185091018677, + "learning_rate": 2.7440996866702458e-06, + "loss": 0.8103309869766235, + "step": 996 + }, + { + "epoch": 2.109936575052854, + "grad_norm": 0.8229217529296875, + "learning_rate": 2.7362326723605566e-06, + "loss": 1.036565899848938, + "step": 998 + }, + { + "epoch": 2.1141649048625792, + "grad_norm": 0.7176088094711304, + "learning_rate": 2.7283658263619794e-06, + "loss": 1.0687159299850464, + "step": 1000 + }, + { + "epoch": 2.1183932346723044, + "grad_norm": 0.6158708333969116, + "learning_rate": 2.7204992448498657e-06, + "loss": 0.24933312833309174, + "step": 1002 + }, + { + "epoch": 2.1226215644820297, + "grad_norm": 1.7368133068084717, + "learning_rate": 2.712633023996336e-06, + "loss": 0.7682783007621765, + "step": 1004 + }, + { + "epoch": 2.126849894291755, + "grad_norm": 0.6421481966972351, + "learning_rate": 2.7047672599691e-06, + "loss": 1.0240600109100342, + "step": 1006 + }, + { + "epoch": 2.13107822410148, + "grad_norm": 1.3722180128097534, + "learning_rate": 2.696902048930284e-06, + "loss": 0.9667700529098511, + "step": 1008 + }, + { + "epoch": 2.1353065539112053, + "grad_norm": 0.6478885412216187, + "learning_rate": 2.6890374870352532e-06, + "loss": 0.8398556113243103, + "step": 1010 + }, + { + "epoch": 2.13953488372093, + "grad_norm": 1.5403518676757812, + "learning_rate": 2.6811736704314344e-06, + "loss": 0.57329922914505, + "step": 1012 + }, + { + "epoch": 2.1437632135306552, + "grad_norm": 0.8799501061439514, + "learning_rate": 2.6733106952571467e-06, + "loss": 0.6521193981170654, + "step": 1014 + }, + { + "epoch": 2.1479915433403804, + "grad_norm": 0.9985294938087463, + "learning_rate": 2.6654486576404197e-06, + "loss": 0.8588607311248779, + "step": 1016 + }, + { + "epoch": 2.1522198731501057, + "grad_norm": 0.9864040017127991, + "learning_rate": 2.657587653697822e-06, + "loss": 1.0104336738586426, + "step": 1018 + }, + { + "epoch": 2.156448202959831, + "grad_norm": 6.648144245147705, + "learning_rate": 2.6497277795332855e-06, + "loss": 0.8407163619995117, + "step": 1020 + }, + { + "epoch": 2.160676532769556, + "grad_norm": 0.837396502494812, + "learning_rate": 2.6418691312369295e-06, + "loss": 0.7050214409828186, + "step": 1022 + }, + { + "epoch": 2.1649048625792813, + "grad_norm": 0.6134306192398071, + "learning_rate": 2.634011804883886e-06, + "loss": 1.0578330755233765, + "step": 1024 + }, + { + "epoch": 2.1691331923890065, + "grad_norm": 0.7147375345230103, + "learning_rate": 2.6261558965331272e-06, + "loss": 1.0594534873962402, + "step": 1026 + }, + { + "epoch": 2.1733615221987317, + "grad_norm": 0.6058641672134399, + "learning_rate": 2.6183015022262892e-06, + "loss": 1.0534790754318237, + "step": 1028 + }, + { + "epoch": 2.177589852008457, + "grad_norm": 0.6654782891273499, + "learning_rate": 2.610448717986496e-06, + "loss": 1.067839503288269, + "step": 1030 + }, + { + "epoch": 2.1818181818181817, + "grad_norm": 1.3797681331634521, + "learning_rate": 2.6025976398171927e-06, + "loss": 1.0668026208877563, + "step": 1032 + }, + { + "epoch": 2.186046511627907, + "grad_norm": 0.6694332361221313, + "learning_rate": 2.5947483637009622e-06, + "loss": 1.1499404907226562, + "step": 1034 + }, + { + "epoch": 2.190274841437632, + "grad_norm": 0.46979257464408875, + "learning_rate": 2.586900985598358e-06, + "loss": 0.8229663372039795, + "step": 1036 + }, + { + "epoch": 2.1945031712473573, + "grad_norm": 2.6087801456451416, + "learning_rate": 2.579055601446732e-06, + "loss": 0.4731891453266144, + "step": 1038 + }, + { + "epoch": 2.1987315010570825, + "grad_norm": 1.0109667778015137, + "learning_rate": 2.571212307159056e-06, + "loss": 1.0908327102661133, + "step": 1040 + }, + { + "epoch": 2.2029598308668077, + "grad_norm": 2.8863284587860107, + "learning_rate": 2.563371198622755e-06, + "loss": 0.36066552996635437, + "step": 1042 + }, + { + "epoch": 2.207188160676533, + "grad_norm": 1.16829514503479, + "learning_rate": 2.5555323716985304e-06, + "loss": 1.053403615951538, + "step": 1044 + }, + { + "epoch": 2.211416490486258, + "grad_norm": 0.7351399064064026, + "learning_rate": 2.54769592221919e-06, + "loss": 0.6402167677879333, + "step": 1046 + }, + { + "epoch": 2.2156448202959833, + "grad_norm": 3.0547754764556885, + "learning_rate": 2.539861945988478e-06, + "loss": 0.8964632749557495, + "step": 1048 + }, + { + "epoch": 2.219873150105708, + "grad_norm": 1.3434550762176514, + "learning_rate": 2.5320305387799014e-06, + "loss": 0.6596440076828003, + "step": 1050 + }, + { + "epoch": 2.2241014799154333, + "grad_norm": 0.6450607776641846, + "learning_rate": 2.524201796335558e-06, + "loss": 0.9056267142295837, + "step": 1052 + }, + { + "epoch": 2.2283298097251585, + "grad_norm": 0.6330105066299438, + "learning_rate": 2.5163758143649716e-06, + "loss": 1.0713391304016113, + "step": 1054 + }, + { + "epoch": 2.2325581395348837, + "grad_norm": 0.6766862273216248, + "learning_rate": 2.5085526885439145e-06, + "loss": 1.0640653371810913, + "step": 1056 + }, + { + "epoch": 2.236786469344609, + "grad_norm": 0.3488926291465759, + "learning_rate": 2.5007325145132427e-06, + "loss": 0.8341073393821716, + "step": 1058 + }, + { + "epoch": 2.241014799154334, + "grad_norm": 2.001237154006958, + "learning_rate": 2.4929153878777268e-06, + "loss": 0.9115666747093201, + "step": 1060 + }, + { + "epoch": 2.2452431289640593, + "grad_norm": 0.7693464756011963, + "learning_rate": 2.48510140420488e-06, + "loss": 1.0226731300354004, + "step": 1062 + }, + { + "epoch": 2.2494714587737845, + "grad_norm": 1.4121301174163818, + "learning_rate": 2.477290659023791e-06, + "loss": 1.0118439197540283, + "step": 1064 + }, + { + "epoch": 2.2536997885835097, + "grad_norm": 2.2806310653686523, + "learning_rate": 2.469483247823959e-06, + "loss": 0.632957398891449, + "step": 1066 + }, + { + "epoch": 2.2579281183932345, + "grad_norm": 0.8324834704399109, + "learning_rate": 2.461679266054122e-06, + "loss": 0.8787606954574585, + "step": 1068 + }, + { + "epoch": 2.2621564482029597, + "grad_norm": 1.5810331106185913, + "learning_rate": 2.453878809121093e-06, + "loss": 0.8886688351631165, + "step": 1070 + }, + { + "epoch": 2.266384778012685, + "grad_norm": 0.6590220332145691, + "learning_rate": 2.4460819723885903e-06, + "loss": 1.0459415912628174, + "step": 1072 + }, + { + "epoch": 2.27061310782241, + "grad_norm": 0.26749613881111145, + "learning_rate": 2.4382888511760773e-06, + "loss": 0.7614855170249939, + "step": 1074 + }, + { + "epoch": 2.2748414376321353, + "grad_norm": 1.3493986129760742, + "learning_rate": 2.4304995407575917e-06, + "loss": 0.900128185749054, + "step": 1076 + }, + { + "epoch": 2.2790697674418605, + "grad_norm": 8.0263090133667, + "learning_rate": 2.4227141363605804e-06, + "loss": 0.22701826691627502, + "step": 1078 + }, + { + "epoch": 2.2832980972515857, + "grad_norm": 0.5107969641685486, + "learning_rate": 2.4149327331647432e-06, + "loss": 0.16721072793006897, + "step": 1080 + }, + { + "epoch": 2.287526427061311, + "grad_norm": 0.9236059188842773, + "learning_rate": 2.4071554263008584e-06, + "loss": 0.5462712645530701, + "step": 1082 + }, + { + "epoch": 2.291754756871036, + "grad_norm": 1.4398772716522217, + "learning_rate": 2.3993823108496272e-06, + "loss": 0.43305540084838867, + "step": 1084 + }, + { + "epoch": 2.295983086680761, + "grad_norm": 0.5344212055206299, + "learning_rate": 2.391613481840509e-06, + "loss": 0.25760167837142944, + "step": 1086 + }, + { + "epoch": 2.300211416490486, + "grad_norm": 5.494821071624756, + "learning_rate": 2.38384903425056e-06, + "loss": 0.7133547067642212, + "step": 1088 + }, + { + "epoch": 2.3044397463002113, + "grad_norm": 0.9530798196792603, + "learning_rate": 2.376089063003272e-06, + "loss": 0.9048901200294495, + "step": 1090 + }, + { + "epoch": 2.3086680761099365, + "grad_norm": 0.7235156893730164, + "learning_rate": 2.3683336629674096e-06, + "loss": 0.6983910202980042, + "step": 1092 + }, + { + "epoch": 2.3128964059196617, + "grad_norm": 0.6613774299621582, + "learning_rate": 2.3605829289558545e-06, + "loss": 1.0634891986846924, + "step": 1094 + }, + { + "epoch": 2.317124735729387, + "grad_norm": 0.7909154891967773, + "learning_rate": 2.3528369557244453e-06, + "loss": 1.035917043685913, + "step": 1096 + }, + { + "epoch": 2.321353065539112, + "grad_norm": 0.8521804213523865, + "learning_rate": 2.3450958379708156e-06, + "loss": 1.009893774986267, + "step": 1098 + }, + { + "epoch": 2.3255813953488373, + "grad_norm": 2.444586753845215, + "learning_rate": 2.3373596703332383e-06, + "loss": 0.6026294827461243, + "step": 1100 + }, + { + "epoch": 2.3298097251585626, + "grad_norm": 0.8242626786231995, + "learning_rate": 2.3296285473894746e-06, + "loss": 0.7475822567939758, + "step": 1102 + }, + { + "epoch": 2.3340380549682873, + "grad_norm": 0.684226930141449, + "learning_rate": 2.321902563655606e-06, + "loss": 1.0707495212554932, + "step": 1104 + }, + { + "epoch": 2.3382663847780125, + "grad_norm": 0.8783945441246033, + "learning_rate": 2.314181813584887e-06, + "loss": 1.013008952140808, + "step": 1106 + }, + { + "epoch": 2.3424947145877377, + "grad_norm": 0.9921977519989014, + "learning_rate": 2.306466391566591e-06, + "loss": 0.9479020833969116, + "step": 1108 + }, + { + "epoch": 2.346723044397463, + "grad_norm": 0.7830618619918823, + "learning_rate": 2.2987563919248518e-06, + "loss": 1.1364282369613647, + "step": 1110 + }, + { + "epoch": 2.350951374207188, + "grad_norm": 0.26116877794265747, + "learning_rate": 2.2910519089175103e-06, + "loss": 0.6622422933578491, + "step": 1112 + }, + { + "epoch": 2.3551797040169133, + "grad_norm": 4.712930202484131, + "learning_rate": 2.283353036734969e-06, + "loss": 0.94716477394104, + "step": 1114 + }, + { + "epoch": 2.3594080338266386, + "grad_norm": 0.9706722497940063, + "learning_rate": 2.2756598694990334e-06, + "loss": 0.6431679725646973, + "step": 1116 + }, + { + "epoch": 2.3636363636363638, + "grad_norm": 1.9938366413116455, + "learning_rate": 2.267972501261762e-06, + "loss": 1.308355450630188, + "step": 1118 + }, + { + "epoch": 2.367864693446089, + "grad_norm": 0.7777484059333801, + "learning_rate": 2.2602910260043208e-06, + "loss": 1.0695171356201172, + "step": 1120 + }, + { + "epoch": 2.3720930232558137, + "grad_norm": 0.7761583924293518, + "learning_rate": 2.252615537635831e-06, + "loss": 0.9347115755081177, + "step": 1122 + }, + { + "epoch": 2.376321353065539, + "grad_norm": 0.7822389006614685, + "learning_rate": 2.244946129992223e-06, + "loss": 0.7232018113136292, + "step": 1124 + }, + { + "epoch": 2.380549682875264, + "grad_norm": 2.1133530139923096, + "learning_rate": 2.2372828968350834e-06, + "loss": 1.0389723777770996, + "step": 1126 + }, + { + "epoch": 2.3847780126849893, + "grad_norm": 1.3042513132095337, + "learning_rate": 2.229625931850519e-06, + "loss": 0.7246500849723816, + "step": 1128 + }, + { + "epoch": 2.3890063424947146, + "grad_norm": 0.8496916890144348, + "learning_rate": 2.221975328648002e-06, + "loss": 0.8411369323730469, + "step": 1130 + }, + { + "epoch": 2.3932346723044398, + "grad_norm": 1.2774096727371216, + "learning_rate": 2.2143311807592292e-06, + "loss": 0.7468405961990356, + "step": 1132 + }, + { + "epoch": 2.397463002114165, + "grad_norm": 0.6452171206474304, + "learning_rate": 2.206693581636982e-06, + "loss": 1.111289620399475, + "step": 1134 + }, + { + "epoch": 2.40169133192389, + "grad_norm": 5.754592418670654, + "learning_rate": 2.1990626246539753e-06, + "loss": 0.6915456056594849, + "step": 1136 + }, + { + "epoch": 2.4059196617336154, + "grad_norm": 1.6072407960891724, + "learning_rate": 2.1914384031017265e-06, + "loss": 0.8382232189178467, + "step": 1138 + }, + { + "epoch": 2.41014799154334, + "grad_norm": 0.4873308837413788, + "learning_rate": 2.1838210101894062e-06, + "loss": 1.0329222679138184, + "step": 1140 + }, + { + "epoch": 2.4143763213530653, + "grad_norm": 0.7448446154594421, + "learning_rate": 2.1762105390427026e-06, + "loss": 1.19656503200531, + "step": 1142 + }, + { + "epoch": 2.4186046511627906, + "grad_norm": 2.470224618911743, + "learning_rate": 2.168607082702684e-06, + "loss": 0.6114988923072815, + "step": 1144 + }, + { + "epoch": 2.4228329809725158, + "grad_norm": 4.100384712219238, + "learning_rate": 2.161010734124658e-06, + "loss": 0.7755101323127747, + "step": 1146 + }, + { + "epoch": 2.427061310782241, + "grad_norm": 0.8485273122787476, + "learning_rate": 2.153421586177038e-06, + "loss": 0.8298628926277161, + "step": 1148 + }, + { + "epoch": 2.431289640591966, + "grad_norm": 1.0596591234207153, + "learning_rate": 2.145839731640208e-06, + "loss": 0.5695077180862427, + "step": 1150 + }, + { + "epoch": 2.4355179704016914, + "grad_norm": 0.32878732681274414, + "learning_rate": 2.138265263205384e-06, + "loss": 0.6108872890472412, + "step": 1152 + }, + { + "epoch": 2.4397463002114166, + "grad_norm": 0.47924017906188965, + "learning_rate": 2.130698273473486e-06, + "loss": 0.575315535068512, + "step": 1154 + }, + { + "epoch": 2.443974630021142, + "grad_norm": 0.5258365273475647, + "learning_rate": 2.1231388549540045e-06, + "loss": 0.9532243609428406, + "step": 1156 + }, + { + "epoch": 2.448202959830867, + "grad_norm": 4.6877546310424805, + "learning_rate": 2.115587100063868e-06, + "loss": 0.5808656811714172, + "step": 1158 + }, + { + "epoch": 2.452431289640592, + "grad_norm": 0.8416226506233215, + "learning_rate": 2.108043101126312e-06, + "loss": 1.0306192636489868, + "step": 1160 + }, + { + "epoch": 2.456659619450317, + "grad_norm": 3.2165985107421875, + "learning_rate": 2.1005069503697566e-06, + "loss": 1.0111299753189087, + "step": 1162 + }, + { + "epoch": 2.460887949260042, + "grad_norm": 0.6864579916000366, + "learning_rate": 2.092978739926672e-06, + "loss": 0.8028541207313538, + "step": 1164 + }, + { + "epoch": 2.4651162790697674, + "grad_norm": 0.9489989280700684, + "learning_rate": 2.0854585618324548e-06, + "loss": 1.2172460556030273, + "step": 1166 + }, + { + "epoch": 2.4693446088794926, + "grad_norm": 1.215120553970337, + "learning_rate": 2.0779465080243037e-06, + "loss": 1.3246065378189087, + "step": 1168 + }, + { + "epoch": 2.473572938689218, + "grad_norm": 0.6394163370132446, + "learning_rate": 2.0704426703400944e-06, + "loss": 0.7735956311225891, + "step": 1170 + }, + { + "epoch": 2.477801268498943, + "grad_norm": 1.1398952007293701, + "learning_rate": 2.0629471405172585e-06, + "loss": 0.8254691362380981, + "step": 1172 + }, + { + "epoch": 2.482029598308668, + "grad_norm": 0.5559751987457275, + "learning_rate": 2.055460010191658e-06, + "loss": 0.7504424452781677, + "step": 1174 + }, + { + "epoch": 2.4862579281183934, + "grad_norm": 0.8105632066726685, + "learning_rate": 2.0479813708964693e-06, + "loss": 0.7769438028335571, + "step": 1176 + }, + { + "epoch": 2.4904862579281186, + "grad_norm": 1.449171781539917, + "learning_rate": 2.0405113140610634e-06, + "loss": 0.8921318650245667, + "step": 1178 + }, + { + "epoch": 2.4947145877378434, + "grad_norm": 1.4208768606185913, + "learning_rate": 2.033049931009885e-06, + "loss": 0.6842445135116577, + "step": 1180 + }, + { + "epoch": 2.4989429175475686, + "grad_norm": 0.4888696074485779, + "learning_rate": 2.0255973129613406e-06, + "loss": 0.567357063293457, + "step": 1182 + }, + { + "epoch": 2.503171247357294, + "grad_norm": 0.8814659118652344, + "learning_rate": 2.0181535510266796e-06, + "loss": 0.1589071899652481, + "step": 1184 + }, + { + "epoch": 2.507399577167019, + "grad_norm": 1.7633031606674194, + "learning_rate": 2.0107187362088816e-06, + "loss": 0.9725368618965149, + "step": 1186 + }, + { + "epoch": 2.511627906976744, + "grad_norm": 2.5048136711120605, + "learning_rate": 2.0032929594015456e-06, + "loss": 0.9178006649017334, + "step": 1188 + }, + { + "epoch": 2.5158562367864694, + "grad_norm": 1.5520225763320923, + "learning_rate": 1.9958763113877755e-06, + "loss": 0.7678893804550171, + "step": 1190 + }, + { + "epoch": 2.5200845665961946, + "grad_norm": 0.5215038061141968, + "learning_rate": 1.988468882839075e-06, + "loss": 1.001523733139038, + "step": 1192 + }, + { + "epoch": 2.52431289640592, + "grad_norm": 0.6024693846702576, + "learning_rate": 1.9810707643142325e-06, + "loss": 0.6263225674629211, + "step": 1194 + }, + { + "epoch": 2.528541226215645, + "grad_norm": 1.617968201637268, + "learning_rate": 1.9736820462582186e-06, + "loss": 1.0076720714569092, + "step": 1196 + }, + { + "epoch": 2.53276955602537, + "grad_norm": 0.7982508540153503, + "learning_rate": 1.9663028190010815e-06, + "loss": 1.0421154499053955, + "step": 1198 + }, + { + "epoch": 2.536997885835095, + "grad_norm": 1.1996971368789673, + "learning_rate": 1.9589331727568384e-06, + "loss": 0.7256770133972168, + "step": 1200 + }, + { + "epoch": 2.54122621564482, + "grad_norm": 0.744490921497345, + "learning_rate": 1.9515731976223746e-06, + "loss": 1.0210518836975098, + "step": 1202 + }, + { + "epoch": 2.5454545454545454, + "grad_norm": 1.66182541847229, + "learning_rate": 1.9442229835763454e-06, + "loss": 0.44427788257598877, + "step": 1204 + }, + { + "epoch": 2.5496828752642706, + "grad_norm": 0.6226742267608643, + "learning_rate": 1.936882620478069e-06, + "loss": 1.068085789680481, + "step": 1206 + }, + { + "epoch": 2.553911205073996, + "grad_norm": 1.4527463912963867, + "learning_rate": 1.9295521980664317e-06, + "loss": 1.060996174812317, + "step": 1208 + }, + { + "epoch": 2.558139534883721, + "grad_norm": 0.6856507062911987, + "learning_rate": 1.922231805958795e-06, + "loss": 1.039587140083313, + "step": 1210 + }, + { + "epoch": 2.5623678646934462, + "grad_norm": 1.3432971239089966, + "learning_rate": 1.914921533649894e-06, + "loss": 0.7191824316978455, + "step": 1212 + }, + { + "epoch": 2.5665961945031714, + "grad_norm": 0.7632008194923401, + "learning_rate": 1.9076214705107417e-06, + "loss": 1.0393006801605225, + "step": 1214 + }, + { + "epoch": 2.570824524312896, + "grad_norm": 1.0369495153427124, + "learning_rate": 1.9003317057875443e-06, + "loss": 0.6147840023040771, + "step": 1216 + }, + { + "epoch": 2.5750528541226214, + "grad_norm": 1.336530089378357, + "learning_rate": 1.8930523286006052e-06, + "loss": 0.6377484202384949, + "step": 1218 + }, + { + "epoch": 2.5792811839323466, + "grad_norm": 2.0891432762145996, + "learning_rate": 1.8857834279432336e-06, + "loss": 0.509937584400177, + "step": 1220 + }, + { + "epoch": 2.583509513742072, + "grad_norm": 3.55784010887146, + "learning_rate": 1.8785250926806613e-06, + "loss": 0.5913651585578918, + "step": 1222 + }, + { + "epoch": 2.587737843551797, + "grad_norm": 4.819112777709961, + "learning_rate": 1.8712774115489524e-06, + "loss": 0.8116767406463623, + "step": 1224 + }, + { + "epoch": 2.5919661733615222, + "grad_norm": 0.43531814217567444, + "learning_rate": 1.8640404731539218e-06, + "loss": 0.47326603531837463, + "step": 1226 + }, + { + "epoch": 2.5961945031712474, + "grad_norm": 0.8789650201797485, + "learning_rate": 1.8568143659700472e-06, + "loss": 0.7499734163284302, + "step": 1228 + }, + { + "epoch": 2.6004228329809727, + "grad_norm": 1.4755181074142456, + "learning_rate": 1.8495991783393924e-06, + "loss": 0.8303921222686768, + "step": 1230 + }, + { + "epoch": 2.604651162790698, + "grad_norm": 3.1309523582458496, + "learning_rate": 1.8423949984705257e-06, + "loss": 0.7273667454719543, + "step": 1232 + }, + { + "epoch": 2.6088794926004226, + "grad_norm": 1.6632975339889526, + "learning_rate": 1.8352019144374406e-06, + "loss": 0.8571827411651611, + "step": 1234 + }, + { + "epoch": 2.613107822410148, + "grad_norm": 0.7448071241378784, + "learning_rate": 1.8280200141784771e-06, + "loss": 0.8664517998695374, + "step": 1236 + }, + { + "epoch": 2.617336152219873, + "grad_norm": 0.8705071210861206, + "learning_rate": 1.8208493854952535e-06, + "loss": 0.9958084225654602, + "step": 1238 + }, + { + "epoch": 2.6215644820295982, + "grad_norm": 0.7441583275794983, + "learning_rate": 1.8136901160515869e-06, + "loss": 0.7479358315467834, + "step": 1240 + }, + { + "epoch": 2.6257928118393234, + "grad_norm": 0.6350056529045105, + "learning_rate": 1.8065422933724192e-06, + "loss": 0.8547337651252747, + "step": 1242 + }, + { + "epoch": 2.6300211416490487, + "grad_norm": 1.2663848400115967, + "learning_rate": 1.799406004842757e-06, + "loss": 1.0284228324890137, + "step": 1244 + }, + { + "epoch": 2.634249471458774, + "grad_norm": 2.9096410274505615, + "learning_rate": 1.7922813377065946e-06, + "loss": 0.6996232867240906, + "step": 1246 + }, + { + "epoch": 2.638477801268499, + "grad_norm": 2.602738857269287, + "learning_rate": 1.7851683790658492e-06, + "loss": 0.5642688274383545, + "step": 1248 + }, + { + "epoch": 2.6427061310782243, + "grad_norm": 0.6103304028511047, + "learning_rate": 1.7780672158792979e-06, + "loss": 1.0508077144622803, + "step": 1250 + }, + { + "epoch": 2.646934460887949, + "grad_norm": 3.92741322517395, + "learning_rate": 1.7709779349615152e-06, + "loss": 0.5398973822593689, + "step": 1252 + }, + { + "epoch": 2.6511627906976747, + "grad_norm": 3.5654373168945312, + "learning_rate": 1.763900622981805e-06, + "loss": 0.7100467681884766, + "step": 1254 + }, + { + "epoch": 2.6553911205073994, + "grad_norm": 0.8442944288253784, + "learning_rate": 1.7568353664631528e-06, + "loss": 1.0310944318771362, + "step": 1256 + }, + { + "epoch": 2.6596194503171247, + "grad_norm": 0.7679892778396606, + "learning_rate": 1.7497822517811576e-06, + "loss": 0.3732684850692749, + "step": 1258 + }, + { + "epoch": 2.66384778012685, + "grad_norm": 2.0411362648010254, + "learning_rate": 1.7427413651629787e-06, + "loss": 0.5446974635124207, + "step": 1260 + }, + { + "epoch": 2.668076109936575, + "grad_norm": 7.280208587646484, + "learning_rate": 1.735712792686285e-06, + "loss": 0.7429797649383545, + "step": 1262 + }, + { + "epoch": 2.6723044397463003, + "grad_norm": 1.769396185874939, + "learning_rate": 1.7286966202781983e-06, + "loss": 0.7472846508026123, + "step": 1264 + }, + { + "epoch": 2.6765327695560255, + "grad_norm": 7.442590236663818, + "learning_rate": 1.7216929337142447e-06, + "loss": 0.4331527650356293, + "step": 1266 + }, + { + "epoch": 2.6807610993657507, + "grad_norm": 2.3331828117370605, + "learning_rate": 1.714701818617307e-06, + "loss": 0.7867488861083984, + "step": 1268 + }, + { + "epoch": 2.6849894291754755, + "grad_norm": 0.9535698890686035, + "learning_rate": 1.7077233604565758e-06, + "loss": 1.0159664154052734, + "step": 1270 + }, + { + "epoch": 2.689217758985201, + "grad_norm": 1.7152396440505981, + "learning_rate": 1.7007576445465054e-06, + "loss": 0.8122742176055908, + "step": 1272 + }, + { + "epoch": 2.693446088794926, + "grad_norm": 0.8219133019447327, + "learning_rate": 1.6938047560457716e-06, + "loss": 0.49331924319267273, + "step": 1274 + }, + { + "epoch": 2.697674418604651, + "grad_norm": 1.0974379777908325, + "learning_rate": 1.6868647799562296e-06, + "loss": 0.5021317601203918, + "step": 1276 + }, + { + "epoch": 2.7019027484143763, + "grad_norm": 1.0276836156845093, + "learning_rate": 1.6799378011218753e-06, + "loss": 1.0597912073135376, + "step": 1278 + }, + { + "epoch": 2.7061310782241015, + "grad_norm": 1.305923581123352, + "learning_rate": 1.6730239042278078e-06, + "loss": 0.7645857334136963, + "step": 1280 + }, + { + "epoch": 2.7103594080338267, + "grad_norm": 0.5968372225761414, + "learning_rate": 1.666123173799195e-06, + "loss": 1.1030560731887817, + "step": 1282 + }, + { + "epoch": 2.714587737843552, + "grad_norm": 1.4355765581130981, + "learning_rate": 1.659235694200238e-06, + "loss": 0.8181160092353821, + "step": 1284 + }, + { + "epoch": 2.718816067653277, + "grad_norm": 0.18032093346118927, + "learning_rate": 1.6523615496331417e-06, + "loss": 1.1456607580184937, + "step": 1286 + }, + { + "epoch": 2.723044397463002, + "grad_norm": 1.4734760522842407, + "learning_rate": 1.6455008241370874e-06, + "loss": 0.5729717016220093, + "step": 1288 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 0.7612594962120056, + "learning_rate": 1.6386536015871976e-06, + "loss": 1.0644044876098633, + "step": 1290 + }, + { + "epoch": 2.7315010570824523, + "grad_norm": 0.8396774530410767, + "learning_rate": 1.6318199656935195e-06, + "loss": 1.0980335474014282, + "step": 1292 + }, + { + "epoch": 2.7357293868921775, + "grad_norm": 0.7830753922462463, + "learning_rate": 1.6250000000000007e-06, + "loss": 1.054038405418396, + "step": 1294 + }, + { + "epoch": 2.7399577167019027, + "grad_norm": 0.3286767899990082, + "learning_rate": 1.618193787883458e-06, + "loss": 0.3908301591873169, + "step": 1296 + }, + { + "epoch": 2.744186046511628, + "grad_norm": 0.6592679619789124, + "learning_rate": 1.611401412552569e-06, + "loss": 0.5546691417694092, + "step": 1298 + }, + { + "epoch": 2.748414376321353, + "grad_norm": 0.5871724486351013, + "learning_rate": 1.604622957046854e-06, + "loss": 0.6974177360534668, + "step": 1300 + }, + { + "epoch": 2.7526427061310783, + "grad_norm": 3.476322650909424, + "learning_rate": 1.5978585042356526e-06, + "loss": 0.9717587828636169, + "step": 1302 + }, + { + "epoch": 2.7568710359408035, + "grad_norm": 1.426347017288208, + "learning_rate": 1.5911081368171174e-06, + "loss": 0.696022093296051, + "step": 1304 + }, + { + "epoch": 2.7610993657505283, + "grad_norm": 1.42485511302948, + "learning_rate": 1.5843719373172043e-06, + "loss": 0.8914967775344849, + "step": 1306 + }, + { + "epoch": 2.765327695560254, + "grad_norm": 0.9887190461158752, + "learning_rate": 1.5776499880886583e-06, + "loss": 0.8718952536582947, + "step": 1308 + }, + { + "epoch": 2.7695560253699787, + "grad_norm": 0.5860939621925354, + "learning_rate": 1.5709423713100066e-06, + "loss": 1.0336132049560547, + "step": 1310 + }, + { + "epoch": 2.773784355179704, + "grad_norm": 0.6642679572105408, + "learning_rate": 1.5642491689845623e-06, + "loss": 0.9066874980926514, + "step": 1312 + }, + { + "epoch": 2.778012684989429, + "grad_norm": 0.6993511319160461, + "learning_rate": 1.5575704629394118e-06, + "loss": 0.5353021025657654, + "step": 1314 + }, + { + "epoch": 2.7822410147991543, + "grad_norm": 0.8484950065612793, + "learning_rate": 1.550906334824419e-06, + "loss": 0.979564905166626, + "step": 1316 + }, + { + "epoch": 2.7864693446088795, + "grad_norm": 0.3303600251674652, + "learning_rate": 1.5442568661112273e-06, + "loss": 0.6826730966567993, + "step": 1318 + }, + { + "epoch": 2.7906976744186047, + "grad_norm": 3.2293996810913086, + "learning_rate": 1.5376221380922645e-06, + "loss": 0.9952559471130371, + "step": 1320 + }, + { + "epoch": 2.79492600422833, + "grad_norm": 2.9947149753570557, + "learning_rate": 1.5310022318797468e-06, + "loss": 0.5234836339950562, + "step": 1322 + }, + { + "epoch": 2.7991543340380547, + "grad_norm": 0.8693253397941589, + "learning_rate": 1.5243972284046843e-06, + "loss": 1.0908644199371338, + "step": 1324 + }, + { + "epoch": 2.8033826638477803, + "grad_norm": 1.8999295234680176, + "learning_rate": 1.5178072084159006e-06, + "loss": 0.30439692735671997, + "step": 1326 + }, + { + "epoch": 2.807610993657505, + "grad_norm": 1.2835586071014404, + "learning_rate": 1.5112322524790373e-06, + "loss": 0.3868151009082794, + "step": 1328 + }, + { + "epoch": 2.8118393234672303, + "grad_norm": 0.6148664355278015, + "learning_rate": 1.5046724409755708e-06, + "loss": 0.655669093132019, + "step": 1330 + }, + { + "epoch": 2.8160676532769555, + "grad_norm": 0.9131718277931213, + "learning_rate": 1.4981278541018338e-06, + "loss": 1.027086615562439, + "step": 1332 + }, + { + "epoch": 2.8202959830866807, + "grad_norm": 0.9928625226020813, + "learning_rate": 1.4915985718680303e-06, + "loss": 0.6656888723373413, + "step": 1334 + }, + { + "epoch": 2.824524312896406, + "grad_norm": 0.759575366973877, + "learning_rate": 1.4850846740972566e-06, + "loss": 1.0963438749313354, + "step": 1336 + }, + { + "epoch": 2.828752642706131, + "grad_norm": 1.3372092247009277, + "learning_rate": 1.478586240424532e-06, + "loss": 1.0407531261444092, + "step": 1338 + }, + { + "epoch": 2.8329809725158563, + "grad_norm": 1.6881779432296753, + "learning_rate": 1.4721033502958188e-06, + "loss": 0.8279685974121094, + "step": 1340 + }, + { + "epoch": 2.8372093023255816, + "grad_norm": 0.5533450245857239, + "learning_rate": 1.4656360829670524e-06, + "loss": 1.0067516565322876, + "step": 1342 + }, + { + "epoch": 2.8414376321353068, + "grad_norm": 0.5380539298057556, + "learning_rate": 1.4591845175031755e-06, + "loss": 0.7166640162467957, + "step": 1344 + }, + { + "epoch": 2.8456659619450315, + "grad_norm": 1.2064310312271118, + "learning_rate": 1.4527487327771667e-06, + "loss": 0.6947576403617859, + "step": 1346 + }, + { + "epoch": 2.8498942917547567, + "grad_norm": 0.6393163800239563, + "learning_rate": 1.44632880746908e-06, + "loss": 0.9154616594314575, + "step": 1348 + }, + { + "epoch": 2.854122621564482, + "grad_norm": 2.2646851539611816, + "learning_rate": 1.4399248200650822e-06, + "loss": 0.5946722626686096, + "step": 1350 + }, + { + "epoch": 2.858350951374207, + "grad_norm": 6.771807670593262, + "learning_rate": 1.4335368488564921e-06, + "loss": 0.9889756441116333, + "step": 1352 + }, + { + "epoch": 2.8625792811839323, + "grad_norm": 0.6024413108825684, + "learning_rate": 1.4271649719388235e-06, + "loss": 1.0145889520645142, + "step": 1354 + }, + { + "epoch": 2.8668076109936576, + "grad_norm": 6.5942912101745605, + "learning_rate": 1.420809267210832e-06, + "loss": 0.4889359176158905, + "step": 1356 + }, + { + "epoch": 2.8710359408033828, + "grad_norm": 1.2514537572860718, + "learning_rate": 1.4144698123735614e-06, + "loss": 1.060815453529358, + "step": 1358 + }, + { + "epoch": 2.875264270613108, + "grad_norm": 3.7441012859344482, + "learning_rate": 1.408146684929394e-06, + "loss": 0.795141875743866, + "step": 1360 + }, + { + "epoch": 2.879492600422833, + "grad_norm": 1.1793290376663208, + "learning_rate": 1.401839962181103e-06, + "loss": 0.7162335515022278, + "step": 1362 + }, + { + "epoch": 2.883720930232558, + "grad_norm": 1.296712875366211, + "learning_rate": 1.3955497212309082e-06, + "loss": 1.0849847793579102, + "step": 1364 + }, + { + "epoch": 2.887949260042283, + "grad_norm": 3.475389003753662, + "learning_rate": 1.389276038979532e-06, + "loss": 0.875495970249176, + "step": 1366 + }, + { + "epoch": 2.8921775898520083, + "grad_norm": 0.5793375372886658, + "learning_rate": 1.3830189921252605e-06, + "loss": 1.020584225654602, + "step": 1368 + }, + { + "epoch": 2.8964059196617336, + "grad_norm": 0.7720953226089478, + "learning_rate": 1.3767786571630054e-06, + "loss": 1.035544753074646, + "step": 1370 + }, + { + "epoch": 2.9006342494714588, + "grad_norm": 0.6335976123809814, + "learning_rate": 1.3705551103833687e-06, + "loss": 1.0688656568527222, + "step": 1372 + }, + { + "epoch": 2.904862579281184, + "grad_norm": 0.5152540802955627, + "learning_rate": 1.364348427871709e-06, + "loss": 0.8726412057876587, + "step": 1374 + }, + { + "epoch": 2.909090909090909, + "grad_norm": 0.8800987601280212, + "learning_rate": 1.3581586855072162e-06, + "loss": 1.01813542842865, + "step": 1376 + }, + { + "epoch": 2.9133192389006344, + "grad_norm": 3.5038645267486572, + "learning_rate": 1.3519859589619756e-06, + "loss": 0.7246266603469849, + "step": 1378 + }, + { + "epoch": 2.9175475687103596, + "grad_norm": 0.6655816435813904, + "learning_rate": 1.3458303237000483e-06, + "loss": 0.7696600556373596, + "step": 1380 + }, + { + "epoch": 2.9217758985200843, + "grad_norm": 0.9207125902175903, + "learning_rate": 1.3396918549765514e-06, + "loss": 1.0463422536849976, + "step": 1382 + }, + { + "epoch": 2.92600422832981, + "grad_norm": 1.4580425024032593, + "learning_rate": 1.3335706278367289e-06, + "loss": 0.9288692474365234, + "step": 1384 + }, + { + "epoch": 2.9302325581395348, + "grad_norm": 0.6088332533836365, + "learning_rate": 1.3274667171150422e-06, + "loss": 0.7489819526672363, + "step": 1386 + }, + { + "epoch": 2.93446088794926, + "grad_norm": 1.341879963874817, + "learning_rate": 1.3213801974342516e-06, + "loss": 1.0183134078979492, + "step": 1388 + }, + { + "epoch": 2.938689217758985, + "grad_norm": 0.258132666349411, + "learning_rate": 1.3153111432045079e-06, + "loss": 0.8709487318992615, + "step": 1390 + }, + { + "epoch": 2.9429175475687104, + "grad_norm": 0.5611593127250671, + "learning_rate": 1.309259628622435e-06, + "loss": 1.007150411605835, + "step": 1392 + }, + { + "epoch": 2.9471458773784356, + "grad_norm": 0.7201411724090576, + "learning_rate": 1.3032257276702296e-06, + "loss": 0.32561811804771423, + "step": 1394 + }, + { + "epoch": 2.951374207188161, + "grad_norm": 0.5295475721359253, + "learning_rate": 1.2972095141147578e-06, + "loss": 0.529960572719574, + "step": 1396 + }, + { + "epoch": 2.955602536997886, + "grad_norm": 2.0177695751190186, + "learning_rate": 1.2912110615066447e-06, + "loss": 0.9622781276702881, + "step": 1398 + }, + { + "epoch": 2.9598308668076108, + "grad_norm": 0.5386593341827393, + "learning_rate": 1.2852304431793838e-06, + "loss": 1.2505404949188232, + "step": 1400 + }, + { + "epoch": 2.9640591966173364, + "grad_norm": 4.687948226928711, + "learning_rate": 1.2792677322484386e-06, + "loss": 0.8016545176506042, + "step": 1402 + }, + { + "epoch": 2.968287526427061, + "grad_norm": 1.594322681427002, + "learning_rate": 1.2733230016103436e-06, + "loss": 0.5189470052719116, + "step": 1404 + }, + { + "epoch": 2.9725158562367864, + "grad_norm": 0.9102961421012878, + "learning_rate": 1.26739632394182e-06, + "loss": 0.9059958457946777, + "step": 1406 + }, + { + "epoch": 2.9767441860465116, + "grad_norm": 0.8692654371261597, + "learning_rate": 1.2614877716988845e-06, + "loss": 0.8937259316444397, + "step": 1408 + }, + { + "epoch": 2.980972515856237, + "grad_norm": 2.1760952472686768, + "learning_rate": 1.255597417115961e-06, + "loss": 0.833085834980011, + "step": 1410 + }, + { + "epoch": 2.985200845665962, + "grad_norm": 1.076922059059143, + "learning_rate": 1.249725332205e-06, + "loss": 1.064079999923706, + "step": 1412 + }, + { + "epoch": 2.989429175475687, + "grad_norm": 0.4375395178794861, + "learning_rate": 1.2438715887546002e-06, + "loss": 0.8243948221206665, + "step": 1414 + }, + { + "epoch": 2.9936575052854124, + "grad_norm": 2.233292579650879, + "learning_rate": 1.2380362583291272e-06, + "loss": 0.8824648261070251, + "step": 1416 + }, + { + "epoch": 2.997885835095137, + "grad_norm": 0.4582400321960449, + "learning_rate": 1.2322194122678375e-06, + "loss": 0.5593487620353699, + "step": 1418 + }, + { + "epoch": 3.0021141649048624, + "grad_norm": 0.6782700419425964, + "learning_rate": 1.226421121684014e-06, + "loss": 1.03118097782135, + "step": 1420 + }, + { + "epoch": 3.0063424947145876, + "grad_norm": 1.0694071054458618, + "learning_rate": 1.2206414574640868e-06, + "loss": 0.6397127509117126, + "step": 1422 + }, + { + "epoch": 3.010570824524313, + "grad_norm": 1.2534350156784058, + "learning_rate": 1.2148804902667736e-06, + "loss": 1.1392219066619873, + "step": 1424 + }, + { + "epoch": 3.014799154334038, + "grad_norm": 1.9186782836914062, + "learning_rate": 1.2091382905222132e-06, + "loss": 0.520480215549469, + "step": 1426 + }, + { + "epoch": 3.019027484143763, + "grad_norm": 0.8564389944076538, + "learning_rate": 1.2034149284311041e-06, + "loss": 0.6777791976928711, + "step": 1428 + }, + { + "epoch": 3.0232558139534884, + "grad_norm": 0.7034538388252258, + "learning_rate": 1.197710473963847e-06, + "loss": 0.8563777804374695, + "step": 1430 + }, + { + "epoch": 3.0274841437632136, + "grad_norm": 0.7909392714500427, + "learning_rate": 1.1920249968596902e-06, + "loss": 1.0257045030593872, + "step": 1432 + }, + { + "epoch": 3.031712473572939, + "grad_norm": 0.47468486428260803, + "learning_rate": 1.1863585666258748e-06, + "loss": 0.9145489931106567, + "step": 1434 + }, + { + "epoch": 3.035940803382664, + "grad_norm": 1.9823009967803955, + "learning_rate": 1.1807112525367876e-06, + "loss": 0.6615996360778809, + "step": 1436 + }, + { + "epoch": 3.040169133192389, + "grad_norm": 0.8342152833938599, + "learning_rate": 1.1750831236331117e-06, + "loss": 0.2739180326461792, + "step": 1438 + }, + { + "epoch": 3.044397463002114, + "grad_norm": 1.0090795755386353, + "learning_rate": 1.1694742487209842e-06, + "loss": 1.0122308731079102, + "step": 1440 + }, + { + "epoch": 3.048625792811839, + "grad_norm": 0.5868484973907471, + "learning_rate": 1.1638846963711545e-06, + "loss": 0.7627484798431396, + "step": 1442 + }, + { + "epoch": 3.0528541226215644, + "grad_norm": 2.0811753273010254, + "learning_rate": 1.1583145349181456e-06, + "loss": 0.21038176119327545, + "step": 1444 + }, + { + "epoch": 3.0570824524312896, + "grad_norm": 2.1059317588806152, + "learning_rate": 1.152763832459419e-06, + "loss": 0.6727972030639648, + "step": 1446 + }, + { + "epoch": 3.061310782241015, + "grad_norm": 2.8190877437591553, + "learning_rate": 1.1472326568545424e-06, + "loss": 0.7937036156654358, + "step": 1448 + }, + { + "epoch": 3.06553911205074, + "grad_norm": 0.6504483819007874, + "learning_rate": 1.1417210757243603e-06, + "loss": 0.7131494879722595, + "step": 1450 + }, + { + "epoch": 3.0697674418604652, + "grad_norm": 2.133265972137451, + "learning_rate": 1.136229156450165e-06, + "loss": 0.7005563378334045, + "step": 1452 + }, + { + "epoch": 3.0739957716701904, + "grad_norm": 0.7323704957962036, + "learning_rate": 1.1307569661728775e-06, + "loss": 0.9205468893051147, + "step": 1454 + }, + { + "epoch": 3.0782241014799157, + "grad_norm": 0.2374901920557022, + "learning_rate": 1.1253045717922215e-06, + "loss": 0.3031374216079712, + "step": 1456 + }, + { + "epoch": 3.0824524312896404, + "grad_norm": 1.803215742111206, + "learning_rate": 1.119872039965909e-06, + "loss": 0.7160661220550537, + "step": 1458 + }, + { + "epoch": 3.0866807610993656, + "grad_norm": 1.3725308179855347, + "learning_rate": 1.1144594371088245e-06, + "loss": 1.020361065864563, + "step": 1460 + }, + { + "epoch": 3.090909090909091, + "grad_norm": 0.6326039433479309, + "learning_rate": 1.1090668293922122e-06, + "loss": 0.971651554107666, + "step": 1462 + }, + { + "epoch": 3.095137420718816, + "grad_norm": 0.6070262789726257, + "learning_rate": 1.103694282742868e-06, + "loss": 0.6768549680709839, + "step": 1464 + }, + { + "epoch": 3.0993657505285412, + "grad_norm": 0.5303124785423279, + "learning_rate": 1.098341862842333e-06, + "loss": 0.7792209982872009, + "step": 1466 + }, + { + "epoch": 3.1035940803382664, + "grad_norm": 0.7507800459861755, + "learning_rate": 1.0930096351260913e-06, + "loss": 0.9888483881950378, + "step": 1468 + }, + { + "epoch": 3.1078224101479917, + "grad_norm": 0.6695652008056641, + "learning_rate": 1.0876976647827677e-06, + "loss": 0.9820244312286377, + "step": 1470 + }, + { + "epoch": 3.112050739957717, + "grad_norm": 4.5699615478515625, + "learning_rate": 1.0824060167533365e-06, + "loss": 0.6230260133743286, + "step": 1472 + }, + { + "epoch": 3.116279069767442, + "grad_norm": 1.4406520128250122, + "learning_rate": 1.0771347557303184e-06, + "loss": 1.0396496057510376, + "step": 1474 + }, + { + "epoch": 3.120507399577167, + "grad_norm": 0.8460061550140381, + "learning_rate": 1.0718839461569972e-06, + "loss": 0.9403010606765747, + "step": 1476 + }, + { + "epoch": 3.124735729386892, + "grad_norm": 2.2458934783935547, + "learning_rate": 1.0666536522266314e-06, + "loss": 0.4271532893180847, + "step": 1478 + }, + { + "epoch": 3.1289640591966172, + "grad_norm": 0.7539458870887756, + "learning_rate": 1.0614439378816634e-06, + "loss": 0.9892304539680481, + "step": 1480 + }, + { + "epoch": 3.1331923890063424, + "grad_norm": 0.8904014825820923, + "learning_rate": 1.0562548668129449e-06, + "loss": 0.9543983340263367, + "step": 1482 + }, + { + "epoch": 3.1374207188160677, + "grad_norm": 0.8391467928886414, + "learning_rate": 1.0510865024589558e-06, + "loss": 0.33414945006370544, + "step": 1484 + }, + { + "epoch": 3.141649048625793, + "grad_norm": 2.2866015434265137, + "learning_rate": 1.045938908005025e-06, + "loss": 1.0479934215545654, + "step": 1486 + }, + { + "epoch": 3.145877378435518, + "grad_norm": 0.8269973397254944, + "learning_rate": 1.0408121463825627e-06, + "loss": 1.0214964151382446, + "step": 1488 + }, + { + "epoch": 3.1501057082452433, + "grad_norm": 2.0901854038238525, + "learning_rate": 1.0357062802682905e-06, + "loss": 0.7124687433242798, + "step": 1490 + }, + { + "epoch": 3.1543340380549685, + "grad_norm": 2.464489459991455, + "learning_rate": 1.0306213720834738e-06, + "loss": 0.7923527956008911, + "step": 1492 + }, + { + "epoch": 3.1585623678646932, + "grad_norm": 0.5960375666618347, + "learning_rate": 1.0255574839931555e-06, + "loss": 0.5037514567375183, + "step": 1494 + }, + { + "epoch": 3.1627906976744184, + "grad_norm": 0.2680164575576782, + "learning_rate": 1.0205146779054037e-06, + "loss": 0.8170030117034912, + "step": 1496 + }, + { + "epoch": 3.1670190274841437, + "grad_norm": 0.6705971360206604, + "learning_rate": 1.0154930154705493e-06, + "loss": 0.9746053814888, + "step": 1498 + }, + { + "epoch": 3.171247357293869, + "grad_norm": 1.046158790588379, + "learning_rate": 1.0104925580804307e-06, + "loss": 1.0264575481414795, + "step": 1500 + }, + { + "epoch": 3.175475687103594, + "grad_norm": 2.6368725299835205, + "learning_rate": 1.0055133668676505e-06, + "loss": 0.46951693296432495, + "step": 1502 + }, + { + "epoch": 3.1797040169133193, + "grad_norm": 0.954997181892395, + "learning_rate": 1.0005555027048216e-06, + "loss": 0.5769892930984497, + "step": 1504 + }, + { + "epoch": 3.1839323467230445, + "grad_norm": 0.8056331276893616, + "learning_rate": 9.956190262038252e-07, + "loss": 0.7956379055976868, + "step": 1506 + }, + { + "epoch": 3.1881606765327697, + "grad_norm": 1.2383183240890503, + "learning_rate": 9.90703997715068e-07, + "loss": 0.4002586901187897, + "step": 1508 + }, + { + "epoch": 3.192389006342495, + "grad_norm": 3.1095306873321533, + "learning_rate": 9.8581047732675e-07, + "loss": 0.3678751289844513, + "step": 1510 + }, + { + "epoch": 3.1966173361522197, + "grad_norm": 0.2970428764820099, + "learning_rate": 9.809385248641244e-07, + "loss": 0.10512058436870575, + "step": 1512 + }, + { + "epoch": 3.200845665961945, + "grad_norm": 2.6694912910461426, + "learning_rate": 9.760881998887647e-07, + "loss": 0.7792633771896362, + "step": 1514 + }, + { + "epoch": 3.20507399577167, + "grad_norm": 1.5287692546844482, + "learning_rate": 9.712595616978445e-07, + "loss": 1.0101102590560913, + "step": 1516 + }, + { + "epoch": 3.2093023255813953, + "grad_norm": 0.7071142792701721, + "learning_rate": 9.66452669323406e-07, + "loss": 0.6074497699737549, + "step": 1518 + }, + { + "epoch": 3.2135306553911205, + "grad_norm": 1.0035736560821533, + "learning_rate": 9.616675815316373e-07, + "loss": 0.8396947383880615, + "step": 1520 + }, + { + "epoch": 3.2177589852008457, + "grad_norm": 0.7858723998069763, + "learning_rate": 9.569043568221613e-07, + "loss": 0.9395447969436646, + "step": 1522 + }, + { + "epoch": 3.221987315010571, + "grad_norm": 4.942752361297607, + "learning_rate": 9.52163053427313e-07, + "loss": 1.0000540018081665, + "step": 1524 + }, + { + "epoch": 3.226215644820296, + "grad_norm": 0.7960143685340881, + "learning_rate": 9.474437293114311e-07, + "loss": 0.948387086391449, + "step": 1526 + }, + { + "epoch": 3.2304439746300213, + "grad_norm": 2.0574419498443604, + "learning_rate": 9.427464421701493e-07, + "loss": 0.2774934768676758, + "step": 1528 + }, + { + "epoch": 3.234672304439746, + "grad_norm": 1.1152596473693848, + "learning_rate": 9.380712494296898e-07, + "loss": 0.823591411113739, + "step": 1530 + }, + { + "epoch": 3.2389006342494713, + "grad_norm": 2.095369338989258, + "learning_rate": 9.334182082461624e-07, + "loss": 0.8626236319541931, + "step": 1532 + }, + { + "epoch": 3.2431289640591965, + "grad_norm": 0.8906185626983643, + "learning_rate": 9.287873755048647e-07, + "loss": 0.9925634264945984, + "step": 1534 + }, + { + "epoch": 3.2473572938689217, + "grad_norm": 0.876634955406189, + "learning_rate": 9.241788078195874e-07, + "loss": 0.8858959078788757, + "step": 1536 + }, + { + "epoch": 3.251585623678647, + "grad_norm": 0.8159791231155396, + "learning_rate": 9.195925615319221e-07, + "loss": 0.7304887175559998, + "step": 1538 + }, + { + "epoch": 3.255813953488372, + "grad_norm": 0.8356714248657227, + "learning_rate": 9.150286927105726e-07, + "loss": 0.6133416891098022, + "step": 1540 + }, + { + "epoch": 3.2600422832980973, + "grad_norm": 1.4572813510894775, + "learning_rate": 9.104872571506682e-07, + "loss": 1.211620807647705, + "step": 1542 + }, + { + "epoch": 3.2642706131078225, + "grad_norm": 0.5943049788475037, + "learning_rate": 9.059683103730835e-07, + "loss": 0.9767951369285583, + "step": 1544 + }, + { + "epoch": 3.2684989429175477, + "grad_norm": 1.6723552942276, + "learning_rate": 9.014719076237579e-07, + "loss": 0.9184189438819885, + "step": 1546 + }, + { + "epoch": 3.2727272727272725, + "grad_norm": 0.5673151016235352, + "learning_rate": 8.969981038730224e-07, + "loss": 0.3618415892124176, + "step": 1548 + }, + { + "epoch": 3.276955602536998, + "grad_norm": 1.0060195922851562, + "learning_rate": 8.925469538149245e-07, + "loss": 0.9330455660820007, + "step": 1550 + }, + { + "epoch": 3.281183932346723, + "grad_norm": 0.9557608366012573, + "learning_rate": 8.881185118665616e-07, + "loss": 1.0155820846557617, + "step": 1552 + }, + { + "epoch": 3.285412262156448, + "grad_norm": 0.13276489078998566, + "learning_rate": 8.837128321674174e-07, + "loss": 0.1570519506931305, + "step": 1554 + }, + { + "epoch": 3.2896405919661733, + "grad_norm": 0.714574933052063, + "learning_rate": 8.793299685786944e-07, + "loss": 0.942793607711792, + "step": 1556 + }, + { + "epoch": 3.2938689217758985, + "grad_norm": 0.9168136715888977, + "learning_rate": 8.749699746826612e-07, + "loss": 0.5292172431945801, + "step": 1558 + }, + { + "epoch": 3.2980972515856237, + "grad_norm": 1.3022035360336304, + "learning_rate": 8.706329037819961e-07, + "loss": 1.1990944147109985, + "step": 1560 + }, + { + "epoch": 3.302325581395349, + "grad_norm": 1.6504409313201904, + "learning_rate": 8.663188088991317e-07, + "loss": 0.7757396697998047, + "step": 1562 + }, + { + "epoch": 3.306553911205074, + "grad_norm": 1.3289718627929688, + "learning_rate": 8.620277427756112e-07, + "loss": 0.5169369578361511, + "step": 1564 + }, + { + "epoch": 3.3107822410147993, + "grad_norm": 0.875095546245575, + "learning_rate": 8.577597578714439e-07, + "loss": 0.7265094518661499, + "step": 1566 + }, + { + "epoch": 3.3150105708245245, + "grad_norm": 0.33962443470954895, + "learning_rate": 8.53514906364458e-07, + "loss": 0.12319551408290863, + "step": 1568 + }, + { + "epoch": 3.3192389006342493, + "grad_norm": 1.1597821712493896, + "learning_rate": 8.492932401496683e-07, + "loss": 0.5623422861099243, + "step": 1570 + }, + { + "epoch": 3.3234672304439745, + "grad_norm": 3.1550745964050293, + "learning_rate": 8.45094810838642e-07, + "loss": 0.7283601760864258, + "step": 1572 + }, + { + "epoch": 3.3276955602536997, + "grad_norm": 1.145011305809021, + "learning_rate": 8.40919669758864e-07, + "loss": 0.26868027448654175, + "step": 1574 + }, + { + "epoch": 3.331923890063425, + "grad_norm": 3.8039538860321045, + "learning_rate": 8.3676786795311e-07, + "loss": 0.4419690668582916, + "step": 1576 + }, + { + "epoch": 3.33615221987315, + "grad_norm": 0.6252729892730713, + "learning_rate": 8.326394561788257e-07, + "loss": 0.5640559196472168, + "step": 1578 + }, + { + "epoch": 3.3403805496828753, + "grad_norm": 0.55072021484375, + "learning_rate": 8.285344849075047e-07, + "loss": 0.6380379796028137, + "step": 1580 + }, + { + "epoch": 3.3446088794926006, + "grad_norm": 1.0008291006088257, + "learning_rate": 8.244530043240687e-07, + "loss": 0.98517906665802, + "step": 1582 + }, + { + "epoch": 3.3488372093023258, + "grad_norm": 1.933143138885498, + "learning_rate": 8.203950643262576e-07, + "loss": 0.717485785484314, + "step": 1584 + }, + { + "epoch": 3.353065539112051, + "grad_norm": 0.8978578448295593, + "learning_rate": 8.163607145240191e-07, + "loss": 0.6533565521240234, + "step": 1586 + }, + { + "epoch": 3.3572938689217757, + "grad_norm": 1.672323226928711, + "learning_rate": 8.123500042389003e-07, + "loss": 1.1361911296844482, + "step": 1588 + }, + { + "epoch": 3.361522198731501, + "grad_norm": 0.7658936381340027, + "learning_rate": 8.083629825034443e-07, + "loss": 0.6171827912330627, + "step": 1590 + }, + { + "epoch": 3.365750528541226, + "grad_norm": 1.5416232347488403, + "learning_rate": 8.043996980605952e-07, + "loss": 0.8929522633552551, + "step": 1592 + }, + { + "epoch": 3.3699788583509513, + "grad_norm": 0.6201046109199524, + "learning_rate": 8.004601993630979e-07, + "loss": 0.4101506471633911, + "step": 1594 + }, + { + "epoch": 3.3742071881606766, + "grad_norm": 0.8717901706695557, + "learning_rate": 7.965445345729045e-07, + "loss": 0.9818314909934998, + "step": 1596 + }, + { + "epoch": 3.3784355179704018, + "grad_norm": 2.7879254817962646, + "learning_rate": 7.926527515605922e-07, + "loss": 0.644636332988739, + "step": 1598 + }, + { + "epoch": 3.382663847780127, + "grad_norm": 1.3160312175750732, + "learning_rate": 7.88784897904772e-07, + "loss": 0.41142430901527405, + "step": 1600 + }, + { + "epoch": 3.386892177589852, + "grad_norm": 1.4033868312835693, + "learning_rate": 7.849410208915069e-07, + "loss": 0.5842673778533936, + "step": 1602 + }, + { + "epoch": 3.3911205073995774, + "grad_norm": 1.1946991682052612, + "learning_rate": 7.811211675137392e-07, + "loss": 1.0320261716842651, + "step": 1604 + }, + { + "epoch": 3.395348837209302, + "grad_norm": 0.639847993850708, + "learning_rate": 7.773253844707108e-07, + "loss": 1.0384889841079712, + "step": 1606 + }, + { + "epoch": 3.3995771670190273, + "grad_norm": 4.001772403717041, + "learning_rate": 7.735537181673947e-07, + "loss": 0.6584277749061584, + "step": 1608 + }, + { + "epoch": 3.4038054968287526, + "grad_norm": 0.6378755569458008, + "learning_rate": 7.69806214713926e-07, + "loss": 1.018156886100769, + "step": 1610 + }, + { + "epoch": 3.4080338266384778, + "grad_norm": 0.7776346802711487, + "learning_rate": 7.660829199250404e-07, + "loss": 0.8746322393417358, + "step": 1612 + }, + { + "epoch": 3.412262156448203, + "grad_norm": 1.3227170705795288, + "learning_rate": 7.623838793195128e-07, + "loss": 0.8452064990997314, + "step": 1614 + }, + { + "epoch": 3.416490486257928, + "grad_norm": 1.253333330154419, + "learning_rate": 7.587091381196004e-07, + "loss": 0.9873075485229492, + "step": 1616 + }, + { + "epoch": 3.4207188160676534, + "grad_norm": 0.5858563184738159, + "learning_rate": 7.550587412504907e-07, + "loss": 0.9376651644706726, + "step": 1618 + }, + { + "epoch": 3.4249471458773786, + "grad_norm": 1.6567012071609497, + "learning_rate": 7.514327333397521e-07, + "loss": 0.9783826470375061, + "step": 1620 + }, + { + "epoch": 3.429175475687104, + "grad_norm": 7.168039321899414, + "learning_rate": 7.47831158716788e-07, + "loss": 0.6209827661514282, + "step": 1622 + }, + { + "epoch": 3.4334038054968286, + "grad_norm": 0.9959341883659363, + "learning_rate": 7.442540614122954e-07, + "loss": 0.9962281584739685, + "step": 1624 + }, + { + "epoch": 3.4376321353065538, + "grad_norm": 0.6434539556503296, + "learning_rate": 7.407014851577257e-07, + "loss": 0.7141914367675781, + "step": 1626 + }, + { + "epoch": 3.441860465116279, + "grad_norm": 1.174318552017212, + "learning_rate": 7.371734733847509e-07, + "loss": 0.9825333952903748, + "step": 1628 + }, + { + "epoch": 3.446088794926004, + "grad_norm": 3.244459867477417, + "learning_rate": 7.336700692247326e-07, + "loss": 0.598316490650177, + "step": 1630 + }, + { + "epoch": 3.4503171247357294, + "grad_norm": 0.6823888421058655, + "learning_rate": 7.301913155081937e-07, + "loss": 0.9444507360458374, + "step": 1632 + }, + { + "epoch": 3.4545454545454546, + "grad_norm": 3.044529676437378, + "learning_rate": 7.267372547642965e-07, + "loss": 0.6880492568016052, + "step": 1634 + }, + { + "epoch": 3.45877378435518, + "grad_norm": 0.7098934650421143, + "learning_rate": 7.23307929220321e-07, + "loss": 0.8510515689849854, + "step": 1636 + }, + { + "epoch": 3.463002114164905, + "grad_norm": 2.740060806274414, + "learning_rate": 7.199033808011497e-07, + "loss": 0.4582882225513458, + "step": 1638 + }, + { + "epoch": 3.46723044397463, + "grad_norm": 0.6570109724998474, + "learning_rate": 7.16523651128755e-07, + "loss": 0.5597135424613953, + "step": 1640 + }, + { + "epoch": 3.471458773784355, + "grad_norm": 0.6645305156707764, + "learning_rate": 7.131687815216901e-07, + "loss": 0.22359013557434082, + "step": 1642 + }, + { + "epoch": 3.47568710359408, + "grad_norm": 0.8287932872772217, + "learning_rate": 7.098388129945833e-07, + "loss": 0.9671212434768677, + "step": 1644 + }, + { + "epoch": 3.4799154334038054, + "grad_norm": 1.3744875192642212, + "learning_rate": 7.065337862576381e-07, + "loss": 0.9185785055160522, + "step": 1646 + }, + { + "epoch": 3.4841437632135306, + "grad_norm": 0.5585587024688721, + "learning_rate": 7.032537417161339e-07, + "loss": 0.5719754695892334, + "step": 1648 + }, + { + "epoch": 3.488372093023256, + "grad_norm": 0.6185411810874939, + "learning_rate": 6.999987194699334e-07, + "loss": 0.5411649942398071, + "step": 1650 + }, + { + "epoch": 3.492600422832981, + "grad_norm": 1.760116457939148, + "learning_rate": 6.967687593129909e-07, + "loss": 0.6113811731338501, + "step": 1652 + }, + { + "epoch": 3.496828752642706, + "grad_norm": 1.6597703695297241, + "learning_rate": 6.935639007328666e-07, + "loss": 0.9229161143302917, + "step": 1654 + }, + { + "epoch": 3.5010570824524314, + "grad_norm": 0.8579858541488647, + "learning_rate": 6.903841829102457e-07, + "loss": 0.9809255003929138, + "step": 1656 + }, + { + "epoch": 3.5052854122621566, + "grad_norm": 0.428204208612442, + "learning_rate": 6.872296447184546e-07, + "loss": 0.843367338180542, + "step": 1658 + }, + { + "epoch": 3.5095137420718814, + "grad_norm": 0.5644919276237488, + "learning_rate": 6.841003247229903e-07, + "loss": 0.6564947962760925, + "step": 1660 + }, + { + "epoch": 3.513742071881607, + "grad_norm": 4.3857879638671875, + "learning_rate": 6.80996261181048e-07, + "loss": 0.6874603629112244, + "step": 1662 + }, + { + "epoch": 3.517970401691332, + "grad_norm": 2.124926805496216, + "learning_rate": 6.779174920410505e-07, + "loss": 0.9908625483512878, + "step": 1664 + }, + { + "epoch": 3.522198731501057, + "grad_norm": 1.0560848712921143, + "learning_rate": 6.748640549421873e-07, + "loss": 1.0359817743301392, + "step": 1666 + }, + { + "epoch": 3.526427061310782, + "grad_norm": 0.21484586596488953, + "learning_rate": 6.71835987213955e-07, + "loss": 0.2849699854850769, + "step": 1668 + }, + { + "epoch": 3.5306553911205074, + "grad_norm": 0.4259510040283203, + "learning_rate": 6.688333258756966e-07, + "loss": 0.8330371975898743, + "step": 1670 + }, + { + "epoch": 3.5348837209302326, + "grad_norm": 0.9718641042709351, + "learning_rate": 6.658561076361539e-07, + "loss": 0.6728772521018982, + "step": 1672 + }, + { + "epoch": 3.539112050739958, + "grad_norm": 1.3043420314788818, + "learning_rate": 6.629043688930161e-07, + "loss": 1.06952702999115, + "step": 1674 + }, + { + "epoch": 3.543340380549683, + "grad_norm": 2.601339340209961, + "learning_rate": 6.599781457324759e-07, + "loss": 0.7122786641120911, + "step": 1676 + }, + { + "epoch": 3.547568710359408, + "grad_norm": 0.5953323841094971, + "learning_rate": 6.570774739287855e-07, + "loss": 0.9681164026260376, + "step": 1678 + }, + { + "epoch": 3.5517970401691334, + "grad_norm": 0.6733061075210571, + "learning_rate": 6.542023889438244e-07, + "loss": 0.660723090171814, + "step": 1680 + }, + { + "epoch": 3.556025369978858, + "grad_norm": 0.5774943828582764, + "learning_rate": 6.513529259266614e-07, + "loss": 0.6790302991867065, + "step": 1682 + }, + { + "epoch": 3.5602536997885834, + "grad_norm": 1.2416257858276367, + "learning_rate": 6.485291197131258e-07, + "loss": 0.6007125377655029, + "step": 1684 + }, + { + "epoch": 3.5644820295983086, + "grad_norm": 1.9640257358551025, + "learning_rate": 6.45731004825384e-07, + "loss": 0.29832443594932556, + "step": 1686 + }, + { + "epoch": 3.568710359408034, + "grad_norm": 1.3899872303009033, + "learning_rate": 6.429586154715143e-07, + "loss": 0.7014768719673157, + "step": 1688 + }, + { + "epoch": 3.572938689217759, + "grad_norm": 0.20190729200839996, + "learning_rate": 6.402119855450905e-07, + "loss": 0.33684778213500977, + "step": 1690 + }, + { + "epoch": 3.5771670190274842, + "grad_norm": 1.3706597089767456, + "learning_rate": 6.374911486247666e-07, + "loss": 0.4806325137615204, + "step": 1692 + }, + { + "epoch": 3.5813953488372094, + "grad_norm": 1.9231373071670532, + "learning_rate": 6.347961379738678e-07, + "loss": 0.6597048044204712, + "step": 1694 + }, + { + "epoch": 3.585623678646934, + "grad_norm": 3.228268623352051, + "learning_rate": 6.321269865399811e-07, + "loss": 0.44895780086517334, + "step": 1696 + }, + { + "epoch": 3.58985200845666, + "grad_norm": 0.6945326328277588, + "learning_rate": 6.294837269545557e-07, + "loss": 0.9701504111289978, + "step": 1698 + }, + { + "epoch": 3.5940803382663846, + "grad_norm": 0.8373157978057861, + "learning_rate": 6.268663915325021e-07, + "loss": 1.074630856513977, + "step": 1700 + }, + { + "epoch": 3.59830866807611, + "grad_norm": 1.8083549737930298, + "learning_rate": 6.24275012271797e-07, + "loss": 1.0470349788665771, + "step": 1702 + }, + { + "epoch": 3.602536997885835, + "grad_norm": 3.190058469772339, + "learning_rate": 6.217096208530931e-07, + "loss": 0.4534735679626465, + "step": 1704 + }, + { + "epoch": 3.6067653276955602, + "grad_norm": 0.6707348823547363, + "learning_rate": 6.191702486393313e-07, + "loss": 0.5571319460868835, + "step": 1706 + }, + { + "epoch": 3.6109936575052854, + "grad_norm": 2.686514377593994, + "learning_rate": 6.166569266753569e-07, + "loss": 0.8430109620094299, + "step": 1708 + }, + { + "epoch": 3.6152219873150107, + "grad_norm": 1.0745434761047363, + "learning_rate": 6.141696856875408e-07, + "loss": 0.8707183599472046, + "step": 1710 + }, + { + "epoch": 3.619450317124736, + "grad_norm": 1.010704517364502, + "learning_rate": 6.117085560834034e-07, + "loss": 0.5877060890197754, + "step": 1712 + }, + { + "epoch": 3.6236786469344606, + "grad_norm": 0.396779328584671, + "learning_rate": 6.092735679512427e-07, + "loss": 0.49770990014076233, + "step": 1714 + }, + { + "epoch": 3.6279069767441863, + "grad_norm": 3.3722569942474365, + "learning_rate": 6.068647510597671e-07, + "loss": 0.7864755988121033, + "step": 1716 + }, + { + "epoch": 3.632135306553911, + "grad_norm": 0.5849418044090271, + "learning_rate": 6.044821348577306e-07, + "loss": 0.588261604309082, + "step": 1718 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 1.558585286140442, + "learning_rate": 6.021257484735737e-07, + "loss": 0.7706260681152344, + "step": 1720 + }, + { + "epoch": 3.6405919661733614, + "grad_norm": 0.7773210406303406, + "learning_rate": 5.997956207150664e-07, + "loss": 0.8451033234596252, + "step": 1722 + }, + { + "epoch": 3.6448202959830867, + "grad_norm": 1.4222577810287476, + "learning_rate": 5.974917800689572e-07, + "loss": 0.7600279450416565, + "step": 1724 + }, + { + "epoch": 3.649048625792812, + "grad_norm": 0.6138176918029785, + "learning_rate": 5.952142547006232e-07, + "loss": 1.0202842950820923, + "step": 1726 + }, + { + "epoch": 3.653276955602537, + "grad_norm": 0.6351314783096313, + "learning_rate": 5.92963072453727e-07, + "loss": 0.9493424296379089, + "step": 1728 + }, + { + "epoch": 3.6575052854122623, + "grad_norm": 1.68190598487854, + "learning_rate": 5.907382608498761e-07, + "loss": 0.8003555536270142, + "step": 1730 + }, + { + "epoch": 3.6617336152219875, + "grad_norm": 0.9876241683959961, + "learning_rate": 5.885398470882863e-07, + "loss": 0.9022297263145447, + "step": 1732 + }, + { + "epoch": 3.6659619450317127, + "grad_norm": 1.069425106048584, + "learning_rate": 5.863678580454489e-07, + "loss": 0.9579256772994995, + "step": 1734 + }, + { + "epoch": 3.6701902748414374, + "grad_norm": 0.5899412035942078, + "learning_rate": 5.842223202748026e-07, + "loss": 1.0141502618789673, + "step": 1736 + }, + { + "epoch": 3.6744186046511627, + "grad_norm": 2.7078421115875244, + "learning_rate": 5.821032600064089e-07, + "loss": 0.31864723563194275, + "step": 1738 + }, + { + "epoch": 3.678646934460888, + "grad_norm": 1.3227430582046509, + "learning_rate": 5.800107031466306e-07, + "loss": 0.52090984582901, + "step": 1740 + }, + { + "epoch": 3.682875264270613, + "grad_norm": 1.3572659492492676, + "learning_rate": 5.779446752778158e-07, + "loss": 0.40007254481315613, + "step": 1742 + }, + { + "epoch": 3.6871035940803383, + "grad_norm": 0.9358891248703003, + "learning_rate": 5.759052016579858e-07, + "loss": 0.9531795382499695, + "step": 1744 + }, + { + "epoch": 3.6913319238900635, + "grad_norm": 0.22946986556053162, + "learning_rate": 5.738923072205247e-07, + "loss": 0.6118672490119934, + "step": 1746 + }, + { + "epoch": 3.6955602536997887, + "grad_norm": 1.7882148027420044, + "learning_rate": 5.719060165738753e-07, + "loss": 0.5476849675178528, + "step": 1748 + }, + { + "epoch": 3.699788583509514, + "grad_norm": 0.6446103453636169, + "learning_rate": 5.699463540012398e-07, + "loss": 1.0358470678329468, + "step": 1750 + }, + { + "epoch": 3.704016913319239, + "grad_norm": 1.7342896461486816, + "learning_rate": 5.680133434602796e-07, + "loss": 0.43426331877708435, + "step": 1752 + }, + { + "epoch": 3.708245243128964, + "grad_norm": 3.078566789627075, + "learning_rate": 5.661070085828253e-07, + "loss": 0.5601077079772949, + "step": 1754 + }, + { + "epoch": 3.712473572938689, + "grad_norm": 0.696753978729248, + "learning_rate": 5.642273726745867e-07, + "loss": 0.8815577030181885, + "step": 1756 + }, + { + "epoch": 3.7167019027484143, + "grad_norm": 3.4322452545166016, + "learning_rate": 5.623744587148686e-07, + "loss": 0.55597984790802, + "step": 1758 + }, + { + "epoch": 3.7209302325581395, + "grad_norm": 0.21880486607551575, + "learning_rate": 5.605482893562872e-07, + "loss": 0.49099811911582947, + "step": 1760 + }, + { + "epoch": 3.7251585623678647, + "grad_norm": 0.806815505027771, + "learning_rate": 5.587488869244977e-07, + "loss": 0.9834616184234619, + "step": 1762 + }, + { + "epoch": 3.72938689217759, + "grad_norm": 0.905757486820221, + "learning_rate": 5.569762734179175e-07, + "loss": 0.5867785215377808, + "step": 1764 + }, + { + "epoch": 3.733615221987315, + "grad_norm": 1.050884485244751, + "learning_rate": 5.552304705074587e-07, + "loss": 0.8268157243728638, + "step": 1766 + }, + { + "epoch": 3.7378435517970403, + "grad_norm": 3.77276611328125, + "learning_rate": 5.535114995362631e-07, + "loss": 0.9136216044425964, + "step": 1768 + }, + { + "epoch": 3.7420718816067655, + "grad_norm": 0.35950765013694763, + "learning_rate": 5.518193815194421e-07, + "loss": 0.3232070505619049, + "step": 1770 + }, + { + "epoch": 3.7463002114164903, + "grad_norm": 1.1717166900634766, + "learning_rate": 5.50154137143818e-07, + "loss": 0.586397111415863, + "step": 1772 + }, + { + "epoch": 3.7505285412262155, + "grad_norm": 1.6452980041503906, + "learning_rate": 5.485157867676717e-07, + "loss": 1.2943792343139648, + "step": 1774 + }, + { + "epoch": 3.7547568710359407, + "grad_norm": 1.183484673500061, + "learning_rate": 5.469043504204954e-07, + "loss": 1.0138071775436401, + "step": 1776 + }, + { + "epoch": 3.758985200845666, + "grad_norm": 0.6358800530433655, + "learning_rate": 5.453198478027459e-07, + "loss": 1.0095187425613403, + "step": 1778 + }, + { + "epoch": 3.763213530655391, + "grad_norm": 1.2916791439056396, + "learning_rate": 5.437622982856039e-07, + "loss": 1.0655014514923096, + "step": 1780 + }, + { + "epoch": 3.7674418604651163, + "grad_norm": 1.0883994102478027, + "learning_rate": 5.422317209107381e-07, + "loss": 0.856255829334259, + "step": 1782 + }, + { + "epoch": 3.7716701902748415, + "grad_norm": 5.774519443511963, + "learning_rate": 5.407281343900724e-07, + "loss": 0.20018130540847778, + "step": 1784 + }, + { + "epoch": 3.7758985200845667, + "grad_norm": 1.4228441715240479, + "learning_rate": 5.392515571055551e-07, + "loss": 0.7519955039024353, + "step": 1786 + }, + { + "epoch": 3.780126849894292, + "grad_norm": 2.516164779663086, + "learning_rate": 5.378020071089375e-07, + "loss": 0.6696423292160034, + "step": 1788 + }, + { + "epoch": 3.7843551797040167, + "grad_norm": 1.3914198875427246, + "learning_rate": 5.363795021215504e-07, + "loss": 0.354766309261322, + "step": 1790 + }, + { + "epoch": 3.7885835095137423, + "grad_norm": 0.269010454416275, + "learning_rate": 5.349840595340888e-07, + "loss": 0.953768253326416, + "step": 1792 + }, + { + "epoch": 3.792811839323467, + "grad_norm": 0.7044401168823242, + "learning_rate": 5.33615696406399e-07, + "loss": 0.9254974722862244, + "step": 1794 + }, + { + "epoch": 3.7970401691331923, + "grad_norm": 2.0106935501098633, + "learning_rate": 5.322744294672698e-07, + "loss": 0.5682697296142578, + "step": 1796 + }, + { + "epoch": 3.8012684989429175, + "grad_norm": 2.6919407844543457, + "learning_rate": 5.309602751142287e-07, + "loss": 0.9588193297386169, + "step": 1798 + }, + { + "epoch": 3.8054968287526427, + "grad_norm": 1.6973198652267456, + "learning_rate": 5.296732494133406e-07, + "loss": 1.0144344568252563, + "step": 1800 + }, + { + "epoch": 3.809725158562368, + "grad_norm": 1.7578473091125488, + "learning_rate": 5.284133680990113e-07, + "loss": 0.7145028114318848, + "step": 1802 + }, + { + "epoch": 3.813953488372093, + "grad_norm": 0.8779058456420898, + "learning_rate": 5.271806465737967e-07, + "loss": 0.9277461767196655, + "step": 1804 + }, + { + "epoch": 3.8181818181818183, + "grad_norm": 0.7843415141105652, + "learning_rate": 5.259750999082123e-07, + "loss": 1.0387165546417236, + "step": 1806 + }, + { + "epoch": 3.822410147991543, + "grad_norm": 1.58511483669281, + "learning_rate": 5.247967428405505e-07, + "loss": 0.1425338089466095, + "step": 1808 + }, + { + "epoch": 3.8266384778012688, + "grad_norm": 1.0016520023345947, + "learning_rate": 5.236455897766998e-07, + "loss": 0.9441636204719543, + "step": 1810 + }, + { + "epoch": 3.8308668076109935, + "grad_norm": 0.822861909866333, + "learning_rate": 5.22521654789969e-07, + "loss": 1.029585838317871, + "step": 1812 + }, + { + "epoch": 3.8350951374207187, + "grad_norm": 0.6523001194000244, + "learning_rate": 5.214249516209148e-07, + "loss": 0.7822322249412537, + "step": 1814 + }, + { + "epoch": 3.839323467230444, + "grad_norm": 0.8254392743110657, + "learning_rate": 5.203554936771742e-07, + "loss": 0.6645534634590149, + "step": 1816 + }, + { + "epoch": 3.843551797040169, + "grad_norm": 1.8470152616500854, + "learning_rate": 5.193132940332998e-07, + "loss": 0.6678524613380432, + "step": 1818 + }, + { + "epoch": 3.8477801268498943, + "grad_norm": 0.7378912568092346, + "learning_rate": 5.182983654306015e-07, + "loss": 0.660444438457489, + "step": 1820 + }, + { + "epoch": 3.8520084566596196, + "grad_norm": 0.21690633893013, + "learning_rate": 5.173107202769891e-07, + "loss": 0.77535080909729, + "step": 1822 + }, + { + "epoch": 3.8562367864693448, + "grad_norm": 0.9497125148773193, + "learning_rate": 5.163503706468209e-07, + "loss": 0.6644335389137268, + "step": 1824 + }, + { + "epoch": 3.8604651162790695, + "grad_norm": 2.4505763053894043, + "learning_rate": 5.154173282807579e-07, + "loss": 0.6357966065406799, + "step": 1826 + }, + { + "epoch": 3.864693446088795, + "grad_norm": 0.5043659806251526, + "learning_rate": 5.145116045856168e-07, + "loss": 0.9884635210037231, + "step": 1828 + }, + { + "epoch": 3.86892177589852, + "grad_norm": 1.1076487302780151, + "learning_rate": 5.136332106342344e-07, + "loss": 1.014207124710083, + "step": 1830 + }, + { + "epoch": 3.873150105708245, + "grad_norm": 0.912322461605072, + "learning_rate": 5.127821571653295e-07, + "loss": 0.7557728886604309, + "step": 1832 + }, + { + "epoch": 3.8773784355179703, + "grad_norm": 4.824005126953125, + "learning_rate": 5.119584545833723e-07, + "loss": 0.5752384066581726, + "step": 1834 + }, + { + "epoch": 3.8816067653276956, + "grad_norm": 0.30750563740730286, + "learning_rate": 5.111621129584585e-07, + "loss": 0.6163195371627808, + "step": 1836 + }, + { + "epoch": 3.8858350951374208, + "grad_norm": 0.22983339428901672, + "learning_rate": 5.103931420261836e-07, + "loss": 0.5606608986854553, + "step": 1838 + }, + { + "epoch": 3.890063424947146, + "grad_norm": 1.0208609104156494, + "learning_rate": 5.096515511875267e-07, + "loss": 0.9524738788604736, + "step": 1840 + }, + { + "epoch": 3.894291754756871, + "grad_norm": 0.7148008942604065, + "learning_rate": 5.08937349508734e-07, + "loss": 0.9512585997581482, + "step": 1842 + }, + { + "epoch": 3.898520084566596, + "grad_norm": 0.737912654876709, + "learning_rate": 5.082505457212071e-07, + "loss": 0.6485314965248108, + "step": 1844 + }, + { + "epoch": 3.9027484143763216, + "grad_norm": 8.045441627502441, + "learning_rate": 5.07591148221399e-07, + "loss": 0.5995697379112244, + "step": 1846 + }, + { + "epoch": 3.9069767441860463, + "grad_norm": 1.6817905902862549, + "learning_rate": 5.069591650707088e-07, + "loss": 0.21968799829483032, + "step": 1848 + }, + { + "epoch": 3.9112050739957716, + "grad_norm": 0.6323149800300598, + "learning_rate": 5.063546039953841e-07, + "loss": 0.9831611514091492, + "step": 1850 + }, + { + "epoch": 3.9154334038054968, + "grad_norm": 0.21932940185070038, + "learning_rate": 5.057774723864276e-07, + "loss": 0.584568977355957, + "step": 1852 + }, + { + "epoch": 3.919661733615222, + "grad_norm": 0.534582793712616, + "learning_rate": 5.052277772995044e-07, + "loss": 0.9615625143051147, + "step": 1854 + }, + { + "epoch": 3.923890063424947, + "grad_norm": 1.219119668006897, + "learning_rate": 5.04705525454858e-07, + "loss": 0.5662239193916321, + "step": 1856 + }, + { + "epoch": 3.9281183932346724, + "grad_norm": 1.430123209953308, + "learning_rate": 5.042107232372275e-07, + "loss": 0.8200953006744385, + "step": 1858 + }, + { + "epoch": 3.9323467230443976, + "grad_norm": 1.7414133548736572, + "learning_rate": 5.037433766957684e-07, + "loss": 0.35313427448272705, + "step": 1860 + }, + { + "epoch": 3.9365750528541223, + "grad_norm": 2.733624219894409, + "learning_rate": 5.033034915439797e-07, + "loss": 1.0163811445236206, + "step": 1862 + }, + { + "epoch": 3.940803382663848, + "grad_norm": 1.1092981100082397, + "learning_rate": 5.028910731596344e-07, + "loss": 0.9771573543548584, + "step": 1864 + }, + { + "epoch": 3.9450317124735728, + "grad_norm": 1.123976707458496, + "learning_rate": 5.02506126584713e-07, + "loss": 0.9256449937820435, + "step": 1866 + }, + { + "epoch": 3.949260042283298, + "grad_norm": 0.1970531940460205, + "learning_rate": 5.021486565253419e-07, + "loss": 0.006525847129523754, + "step": 1868 + }, + { + "epoch": 3.953488372093023, + "grad_norm": 1.2699693441390991, + "learning_rate": 5.01818667351736e-07, + "loss": 0.9931344389915466, + "step": 1870 + }, + { + "epoch": 3.9577167019027484, + "grad_norm": 0.8406357765197754, + "learning_rate": 5.015161630981461e-07, + "loss": 0.7480917572975159, + "step": 1872 + }, + { + "epoch": 3.9619450317124736, + "grad_norm": 1.2351728677749634, + "learning_rate": 5.012411474628075e-07, + "loss": 0.6757962703704834, + "step": 1874 + }, + { + "epoch": 3.966173361522199, + "grad_norm": 0.8207041025161743, + "learning_rate": 5.009936238078976e-07, + "loss": 0.9615821838378906, + "step": 1876 + }, + { + "epoch": 3.970401691331924, + "grad_norm": 3.9564754962921143, + "learning_rate": 5.007735951594917e-07, + "loss": 0.47021615505218506, + "step": 1878 + }, + { + "epoch": 3.974630021141649, + "grad_norm": 0.9730548858642578, + "learning_rate": 5.005810642075292e-07, + "loss": 0.5955108404159546, + "step": 1880 + }, + { + "epoch": 3.9788583509513744, + "grad_norm": 1.1672887802124023, + "learning_rate": 5.00416033305778e-07, + "loss": 0.8675932884216309, + "step": 1882 + }, + { + "epoch": 3.983086680761099, + "grad_norm": 0.9139389395713806, + "learning_rate": 5.002785044718068e-07, + "loss": 1.0263160467147827, + "step": 1884 + }, + { + "epoch": 3.9873150105708244, + "grad_norm": 0.7081848978996277, + "learning_rate": 5.001684793869617e-07, + "loss": 0.7986045479774475, + "step": 1886 + }, + { + "epoch": 3.9915433403805496, + "grad_norm": 0.6204723119735718, + "learning_rate": 5.000859593963427e-07, + "loss": 0.962172269821167, + "step": 1888 + }, + { + "epoch": 3.995771670190275, + "grad_norm": 0.7160171270370483, + "learning_rate": 5.000309455087906e-07, + "loss": 0.9778663516044617, + "step": 1890 + }, + { + "epoch": 4.0, + "grad_norm": 0.8123812079429626, + "learning_rate": 5.000034383968715e-07, + "loss": 0.5614367723464966, + "step": 1892 + }, + { + "epoch": 4.0, + "step": 1892, + "total_flos": 3.554237146892075e+18, + "train_loss": 0.9275046758280858, + "train_runtime": 19158.8521, + "train_samples_per_second": 2.963, + "train_steps_per_second": 0.099 + } + ], + "logging_steps": 2, + "max_steps": 1892, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 99999, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.554237146892075e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}