diff --git "a/Ins/trainer_state.json" "b/Ins/trainer_state.json" new file mode 100644--- /dev/null +++ "b/Ins/trainer_state.json" @@ -0,0 +1,6819 @@ +{ + "best_metric": 2.729660987854004, + "best_model_checkpoint": "./Ins/checkpoint-9678", + "epoch": 1.0, + "eval_steps": 1000, + "global_step": 9678, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0010332713370531101, + "grad_norm": 9.612689971923828, + "learning_rate": 5.1546391752577315e-05, + "loss": 4.2162, + "step": 10 + }, + { + "epoch": 0.0020665426741062203, + "grad_norm": 6.56850004196167, + "learning_rate": 0.00010309278350515463, + "loss": 3.388, + "step": 20 + }, + { + "epoch": 0.0030998140111593306, + "grad_norm": 4.049291610717773, + "learning_rate": 0.00015463917525773197, + "loss": 2.7081, + "step": 30 + }, + { + "epoch": 0.0041330853482124405, + "grad_norm": 2.604156970977783, + "learning_rate": 0.00020618556701030926, + "loss": 2.8688, + "step": 40 + }, + { + "epoch": 0.0051663566852655505, + "grad_norm": 1.8748372793197632, + "learning_rate": 0.00025773195876288655, + "loss": 2.6251, + "step": 50 + }, + { + "epoch": 0.006199628022318661, + "grad_norm": 2.638007164001465, + "learning_rate": 0.00030927835051546395, + "loss": 2.5802, + "step": 60 + }, + { + "epoch": 0.007232899359371771, + "grad_norm": 1.8762423992156982, + "learning_rate": 0.00036082474226804123, + "loss": 2.4978, + "step": 70 + }, + { + "epoch": 0.008266170696424881, + "grad_norm": 1.7499455213546753, + "learning_rate": 0.0004123711340206185, + "loss": 2.4351, + "step": 80 + }, + { + "epoch": 0.009299442033477991, + "grad_norm": 1.4334783554077148, + "learning_rate": 0.0004639175257731959, + "loss": 2.4811, + "step": 90 + }, + { + "epoch": 0.010332713370531101, + "grad_norm": 1.4335546493530273, + "learning_rate": 0.0004999998790431266, + "loss": 2.3653, + "step": 100 + }, + { + "epoch": 0.01136598470758421, + "grad_norm": 1.379575490951538, + "learning_rate": 0.0004999977287019656, + "loss": 2.2769, + "step": 110 + }, + { + "epoch": 0.012399256044637322, + "grad_norm": 1.357542872428894, + "learning_rate": 0.0004999928904568952, + "loss": 2.1347, + "step": 120 + }, + { + "epoch": 0.013432527381690432, + "grad_norm": 1.6458231210708618, + "learning_rate": 0.0004999853643599349, + "loss": 2.1943, + "step": 130 + }, + { + "epoch": 0.014465798718743542, + "grad_norm": 1.7004691362380981, + "learning_rate": 0.0004999751504920031, + "loss": 2.204, + "step": 140 + }, + { + "epoch": 0.015499070055796652, + "grad_norm": 1.3042373657226562, + "learning_rate": 0.0004999622489629164, + "loss": 2.0817, + "step": 150 + }, + { + "epoch": 0.016532341392849762, + "grad_norm": 1.5508415699005127, + "learning_rate": 0.0004999466599113884, + "loss": 2.115, + "step": 160 + }, + { + "epoch": 0.017565612729902872, + "grad_norm": 1.3333240747451782, + "learning_rate": 0.0004999283835050278, + "loss": 2.1762, + "step": 170 + }, + { + "epoch": 0.018598884066955982, + "grad_norm": 1.3132269382476807, + "learning_rate": 0.0004999074199403377, + "loss": 2.0493, + "step": 180 + }, + { + "epoch": 0.019632155404009092, + "grad_norm": 1.250373363494873, + "learning_rate": 0.000499883769442712, + "loss": 2.2234, + "step": 190 + }, + { + "epoch": 0.020665426741062202, + "grad_norm": 1.2733889818191528, + "learning_rate": 0.0004998574322664343, + "loss": 1.9751, + "step": 200 + }, + { + "epoch": 0.02169869807811531, + "grad_norm": 0.9156686663627625, + "learning_rate": 0.0004998284086946743, + "loss": 2.1314, + "step": 210 + }, + { + "epoch": 0.02273196941516842, + "grad_norm": 1.0921283960342407, + "learning_rate": 0.0004997966990394851, + "loss": 1.8402, + "step": 220 + }, + { + "epoch": 0.023765240752221535, + "grad_norm": 1.2582165002822876, + "learning_rate": 0.0004997623036417998, + "loss": 2.1236, + "step": 230 + }, + { + "epoch": 0.024798512089274645, + "grad_norm": 1.1085331439971924, + "learning_rate": 0.0004997252228714278, + "loss": 2.0778, + "step": 240 + }, + { + "epoch": 0.025831783426327755, + "grad_norm": 1.096815824508667, + "learning_rate": 0.0004996854571270512, + "loss": 2.146, + "step": 250 + }, + { + "epoch": 0.026865054763380865, + "grad_norm": 1.0710986852645874, + "learning_rate": 0.0004996430068362196, + "loss": 1.9776, + "step": 260 + }, + { + "epoch": 0.027898326100433975, + "grad_norm": 1.4760586023330688, + "learning_rate": 0.0004995978724553464, + "loss": 1.9536, + "step": 270 + }, + { + "epoch": 0.028931597437487085, + "grad_norm": 1.3000599145889282, + "learning_rate": 0.0004995500544697036, + "loss": 1.7954, + "step": 280 + }, + { + "epoch": 0.029964868774540195, + "grad_norm": 1.2158455848693848, + "learning_rate": 0.0004994995533934163, + "loss": 1.8429, + "step": 290 + }, + { + "epoch": 0.030998140111593304, + "grad_norm": 1.1453113555908203, + "learning_rate": 0.0004994463697694579, + "loss": 2.0769, + "step": 300 + }, + { + "epoch": 0.032031411448646414, + "grad_norm": 0.7675238847732544, + "learning_rate": 0.0004993905041696433, + "loss": 1.903, + "step": 310 + }, + { + "epoch": 0.033064682785699524, + "grad_norm": 1.1099224090576172, + "learning_rate": 0.0004993319571946235, + "loss": 1.9536, + "step": 320 + }, + { + "epoch": 0.034097954122752634, + "grad_norm": 1.617551326751709, + "learning_rate": 0.0004992707294738786, + "loss": 1.9095, + "step": 330 + }, + { + "epoch": 0.035131225459805744, + "grad_norm": 1.486296534538269, + "learning_rate": 0.0004992068216657115, + "loss": 1.8174, + "step": 340 + }, + { + "epoch": 0.036164496796858854, + "grad_norm": 1.4763352870941162, + "learning_rate": 0.0004991402344572409, + "loss": 1.7963, + "step": 350 + }, + { + "epoch": 0.037197768133911964, + "grad_norm": 1.2317055463790894, + "learning_rate": 0.0004990709685643932, + "loss": 1.9319, + "step": 360 + }, + { + "epoch": 0.038231039470965074, + "grad_norm": 1.074246883392334, + "learning_rate": 0.0004989990247318954, + "loss": 1.9447, + "step": 370 + }, + { + "epoch": 0.039264310808018184, + "grad_norm": 1.2657012939453125, + "learning_rate": 0.0004989244037332671, + "loss": 1.7053, + "step": 380 + }, + { + "epoch": 0.040297582145071294, + "grad_norm": 1.3143869638442993, + "learning_rate": 0.0004988471063708116, + "loss": 1.8009, + "step": 390 + }, + { + "epoch": 0.041330853482124404, + "grad_norm": 1.381238579750061, + "learning_rate": 0.0004987671334756084, + "loss": 1.8229, + "step": 400 + }, + { + "epoch": 0.042364124819177514, + "grad_norm": 1.2067453861236572, + "learning_rate": 0.0004986844859075028, + "loss": 1.7401, + "step": 410 + }, + { + "epoch": 0.04339739615623062, + "grad_norm": 1.2656382322311401, + "learning_rate": 0.0004985991645550979, + "loss": 1.6774, + "step": 420 + }, + { + "epoch": 0.04443066749328373, + "grad_norm": 0.9306573271751404, + "learning_rate": 0.0004985111703357444, + "loss": 1.7884, + "step": 430 + }, + { + "epoch": 0.04546393883033684, + "grad_norm": 0.9404155611991882, + "learning_rate": 0.0004984205041955309, + "loss": 1.6522, + "step": 440 + }, + { + "epoch": 0.04649721016738995, + "grad_norm": 1.2634170055389404, + "learning_rate": 0.0004983271671092738, + "loss": 1.7681, + "step": 450 + }, + { + "epoch": 0.04753048150444307, + "grad_norm": 1.6461280584335327, + "learning_rate": 0.0004982311600805066, + "loss": 1.8906, + "step": 460 + }, + { + "epoch": 0.04856375284149618, + "grad_norm": 1.242746114730835, + "learning_rate": 0.0004981324841414695, + "loss": 1.8049, + "step": 470 + }, + { + "epoch": 0.04959702417854929, + "grad_norm": 1.0817033052444458, + "learning_rate": 0.0004980311403530978, + "loss": 1.6272, + "step": 480 + }, + { + "epoch": 0.0506302955156024, + "grad_norm": 1.2890691757202148, + "learning_rate": 0.0004979271298050107, + "loss": 1.6988, + "step": 490 + }, + { + "epoch": 0.05166356685265551, + "grad_norm": 1.0929986238479614, + "learning_rate": 0.0004978204536155003, + "loss": 1.728, + "step": 500 + }, + { + "epoch": 0.05269683818970862, + "grad_norm": 1.2483139038085938, + "learning_rate": 0.0004977111129315177, + "loss": 1.5616, + "step": 510 + }, + { + "epoch": 0.05373010952676173, + "grad_norm": 1.0845454931259155, + "learning_rate": 0.0004975991089286632, + "loss": 1.5409, + "step": 520 + }, + { + "epoch": 0.05476338086381484, + "grad_norm": 1.4180911779403687, + "learning_rate": 0.0004974844428111715, + "loss": 1.7888, + "step": 530 + }, + { + "epoch": 0.05579665220086795, + "grad_norm": 1.0860427618026733, + "learning_rate": 0.0004973671158118998, + "loss": 1.6986, + "step": 540 + }, + { + "epoch": 0.05682992353792106, + "grad_norm": 1.1166560649871826, + "learning_rate": 0.0004972471291923143, + "loss": 1.8097, + "step": 550 + }, + { + "epoch": 0.05786319487497417, + "grad_norm": 1.2548375129699707, + "learning_rate": 0.0004971244842424768, + "loss": 1.6169, + "step": 560 + }, + { + "epoch": 0.05889646621202728, + "grad_norm": 0.991226851940155, + "learning_rate": 0.0004969991822810307, + "loss": 1.6965, + "step": 570 + }, + { + "epoch": 0.05992973754908039, + "grad_norm": 1.1663395166397095, + "learning_rate": 0.0004968712246551868, + "loss": 1.6587, + "step": 580 + }, + { + "epoch": 0.0609630088861335, + "grad_norm": 1.3420109748840332, + "learning_rate": 0.0004967406127407086, + "loss": 1.6837, + "step": 590 + }, + { + "epoch": 0.06199628022318661, + "grad_norm": 1.0421655178070068, + "learning_rate": 0.0004966073479418982, + "loss": 1.637, + "step": 600 + }, + { + "epoch": 0.06302955156023972, + "grad_norm": 1.2786924839019775, + "learning_rate": 0.0004964714316915803, + "loss": 1.5727, + "step": 610 + }, + { + "epoch": 0.06406282289729283, + "grad_norm": 1.507780909538269, + "learning_rate": 0.0004963328654510877, + "loss": 1.6056, + "step": 620 + }, + { + "epoch": 0.06509609423434594, + "grad_norm": 1.0936928987503052, + "learning_rate": 0.0004961916507102447, + "loss": 1.5742, + "step": 630 + }, + { + "epoch": 0.06612936557139905, + "grad_norm": 1.2347360849380493, + "learning_rate": 0.0004960477889873517, + "loss": 1.6692, + "step": 640 + }, + { + "epoch": 0.06716263690845216, + "grad_norm": 1.4181699752807617, + "learning_rate": 0.0004959012818291688, + "loss": 1.6412, + "step": 650 + }, + { + "epoch": 0.06819590824550527, + "grad_norm": 1.3150395154953003, + "learning_rate": 0.000495752130810899, + "loss": 1.4951, + "step": 660 + }, + { + "epoch": 0.06922917958255838, + "grad_norm": 1.5858718156814575, + "learning_rate": 0.0004956003375361712, + "loss": 1.6635, + "step": 670 + }, + { + "epoch": 0.07026245091961149, + "grad_norm": 1.1488349437713623, + "learning_rate": 0.0004954459036370231, + "loss": 1.6101, + "step": 680 + }, + { + "epoch": 0.0712957222566646, + "grad_norm": 1.0433565378189087, + "learning_rate": 0.0004952888307738839, + "loss": 1.7019, + "step": 690 + }, + { + "epoch": 0.07232899359371771, + "grad_norm": 1.2093909978866577, + "learning_rate": 0.000495129120635556, + "loss": 1.6477, + "step": 700 + }, + { + "epoch": 0.07336226493077082, + "grad_norm": 1.1268610954284668, + "learning_rate": 0.0004949667749391967, + "loss": 1.5262, + "step": 710 + }, + { + "epoch": 0.07439553626782393, + "grad_norm": 1.0410875082015991, + "learning_rate": 0.0004948017954303007, + "loss": 1.7201, + "step": 720 + }, + { + "epoch": 0.07542880760487704, + "grad_norm": 1.5368582010269165, + "learning_rate": 0.0004946341838826803, + "loss": 1.7024, + "step": 730 + }, + { + "epoch": 0.07646207894193015, + "grad_norm": 0.9913454651832581, + "learning_rate": 0.0004944639420984469, + "loss": 1.4697, + "step": 740 + }, + { + "epoch": 0.07749535027898326, + "grad_norm": 1.1712089776992798, + "learning_rate": 0.0004942910719079912, + "loss": 1.4866, + "step": 750 + }, + { + "epoch": 0.07852862161603637, + "grad_norm": 1.5118815898895264, + "learning_rate": 0.0004941155751699639, + "loss": 1.6489, + "step": 760 + }, + { + "epoch": 0.07956189295308948, + "grad_norm": 0.9319295883178711, + "learning_rate": 0.0004939374537712558, + "loss": 1.5053, + "step": 770 + }, + { + "epoch": 0.08059516429014259, + "grad_norm": 2.0669829845428467, + "learning_rate": 0.0004937567096269769, + "loss": 1.6412, + "step": 780 + }, + { + "epoch": 0.0816284356271957, + "grad_norm": 1.876446008682251, + "learning_rate": 0.0004935733446804367, + "loss": 1.7186, + "step": 790 + }, + { + "epoch": 0.08266170696424881, + "grad_norm": 1.0011149644851685, + "learning_rate": 0.0004933873609031224, + "loss": 1.4197, + "step": 800 + }, + { + "epoch": 0.08369497830130192, + "grad_norm": 84.56254577636719, + "learning_rate": 0.000493198760294678, + "loss": 2.006, + "step": 810 + }, + { + "epoch": 0.08472824963835503, + "grad_norm": 9.466862678527832, + "learning_rate": 0.0004930075448828836, + "loss": 3.6223, + "step": 820 + }, + { + "epoch": 0.08576152097540814, + "grad_norm": 1.8478171825408936, + "learning_rate": 0.0004928137167236323, + "loss": 1.8625, + "step": 830 + }, + { + "epoch": 0.08679479231246125, + "grad_norm": 1.6235121488571167, + "learning_rate": 0.0004926172779009089, + "loss": 1.6312, + "step": 840 + }, + { + "epoch": 0.08782806364951436, + "grad_norm": 1.0582070350646973, + "learning_rate": 0.0004924182305267672, + "loss": 1.4884, + "step": 850 + }, + { + "epoch": 0.08886133498656747, + "grad_norm": 1.6170542240142822, + "learning_rate": 0.0004922165767413077, + "loss": 1.4772, + "step": 860 + }, + { + "epoch": 0.08989460632362058, + "grad_norm": 1.1165497303009033, + "learning_rate": 0.0004920123187126539, + "loss": 1.396, + "step": 870 + }, + { + "epoch": 0.09092787766067369, + "grad_norm": 1.5052683353424072, + "learning_rate": 0.0004918054586369299, + "loss": 1.7157, + "step": 880 + }, + { + "epoch": 0.0919611489977268, + "grad_norm": 1.5653349161148071, + "learning_rate": 0.0004915959987382355, + "loss": 1.5859, + "step": 890 + }, + { + "epoch": 0.0929944203347799, + "grad_norm": 1.4743231534957886, + "learning_rate": 0.0004913839412686238, + "loss": 1.5155, + "step": 900 + }, + { + "epoch": 0.09402769167183303, + "grad_norm": 1.931356430053711, + "learning_rate": 0.0004911692885080757, + "loss": 1.6259, + "step": 910 + }, + { + "epoch": 0.09506096300888614, + "grad_norm": 1.209402322769165, + "learning_rate": 0.0004909520427644762, + "loss": 1.544, + "step": 920 + }, + { + "epoch": 0.09609423434593925, + "grad_norm": 1.3506582975387573, + "learning_rate": 0.0004907322063735895, + "loss": 1.6404, + "step": 930 + }, + { + "epoch": 0.09712750568299236, + "grad_norm": 1.0187022686004639, + "learning_rate": 0.0004905097816990331, + "loss": 1.4471, + "step": 940 + }, + { + "epoch": 0.09816077702004547, + "grad_norm": 1.3293033838272095, + "learning_rate": 0.0004902847711322535, + "loss": 1.5233, + "step": 950 + }, + { + "epoch": 0.09919404835709858, + "grad_norm": 1.0673298835754395, + "learning_rate": 0.0004900571770924999, + "loss": 1.4875, + "step": 960 + }, + { + "epoch": 0.10022731969415169, + "grad_norm": 1.2059558629989624, + "learning_rate": 0.000489827002026798, + "loss": 1.5143, + "step": 970 + }, + { + "epoch": 0.1012605910312048, + "grad_norm": 1.5264915227890015, + "learning_rate": 0.0004895942484099241, + "loss": 1.3904, + "step": 980 + }, + { + "epoch": 0.10229386236825791, + "grad_norm": 1.055259108543396, + "learning_rate": 0.0004893589187443786, + "loss": 1.5727, + "step": 990 + }, + { + "epoch": 0.10332713370531102, + "grad_norm": 1.3388723134994507, + "learning_rate": 0.0004891210155603585, + "loss": 1.3997, + "step": 1000 + }, + { + "epoch": 0.10436040504236413, + "grad_norm": 1.146324872970581, + "learning_rate": 0.0004888805414157304, + "loss": 1.5489, + "step": 1010 + }, + { + "epoch": 0.10539367637941724, + "grad_norm": 1.446535348892212, + "learning_rate": 0.0004886374988960036, + "loss": 1.5821, + "step": 1020 + }, + { + "epoch": 0.10642694771647035, + "grad_norm": 1.3649553060531616, + "learning_rate": 0.0004883918906143016, + "loss": 1.4677, + "step": 1030 + }, + { + "epoch": 0.10746021905352346, + "grad_norm": 1.2442352771759033, + "learning_rate": 0.00048814371921133417, + "loss": 1.4691, + "step": 1040 + }, + { + "epoch": 0.10849349039057657, + "grad_norm": 1.2473433017730713, + "learning_rate": 0.00048789298735536904, + "loss": 1.6451, + "step": 1050 + }, + { + "epoch": 0.10952676172762968, + "grad_norm": 1.7001152038574219, + "learning_rate": 0.0004876396977422033, + "loss": 1.6799, + "step": 1060 + }, + { + "epoch": 0.11056003306468279, + "grad_norm": 1.1802128553390503, + "learning_rate": 0.00048738385309513434, + "loss": 1.4816, + "step": 1070 + }, + { + "epoch": 0.1115933044017359, + "grad_norm": 1.1580235958099365, + "learning_rate": 0.0004871254561649303, + "loss": 1.681, + "step": 1080 + }, + { + "epoch": 0.11262657573878901, + "grad_norm": 1.2179391384124756, + "learning_rate": 0.000486864509729801, + "loss": 1.507, + "step": 1090 + }, + { + "epoch": 0.11365984707584212, + "grad_norm": 1.1689331531524658, + "learning_rate": 0.00048660101659536763, + "loss": 1.276, + "step": 1100 + }, + { + "epoch": 0.11469311841289523, + "grad_norm": 1.1789075136184692, + "learning_rate": 0.0004863349795946329, + "loss": 1.5065, + "step": 1110 + }, + { + "epoch": 0.11572638974994834, + "grad_norm": 0.98433917760849, + "learning_rate": 0.00048606640158795034, + "loss": 1.286, + "step": 1120 + }, + { + "epoch": 0.11675966108700145, + "grad_norm": 0.9510318636894226, + "learning_rate": 0.0004857952854629938, + "loss": 1.4637, + "step": 1130 + }, + { + "epoch": 0.11779293242405456, + "grad_norm": 1.0436633825302124, + "learning_rate": 0.0004855216341347259, + "loss": 1.4925, + "step": 1140 + }, + { + "epoch": 0.11882620376110767, + "grad_norm": 1.4930212497711182, + "learning_rate": 0.0004852454505453674, + "loss": 1.3586, + "step": 1150 + }, + { + "epoch": 0.11985947509816078, + "grad_norm": 0.8224968910217285, + "learning_rate": 0.00048496673766436517, + "loss": 1.5212, + "step": 1160 + }, + { + "epoch": 0.12089274643521389, + "grad_norm": 1.2815650701522827, + "learning_rate": 0.00048468549848835996, + "loss": 1.4901, + "step": 1170 + }, + { + "epoch": 0.121926017772267, + "grad_norm": 1.2067404985427856, + "learning_rate": 0.000484401736041155, + "loss": 1.5946, + "step": 1180 + }, + { + "epoch": 0.12295928910932011, + "grad_norm": 1.530190348625183, + "learning_rate": 0.0004841154533736827, + "loss": 1.5052, + "step": 1190 + }, + { + "epoch": 0.12399256044637322, + "grad_norm": 1.2579395771026611, + "learning_rate": 0.0004838266535639722, + "loss": 1.4186, + "step": 1200 + }, + { + "epoch": 0.12502583178342633, + "grad_norm": 1.5573033094406128, + "learning_rate": 0.00048353533971711625, + "loss": 1.3459, + "step": 1210 + }, + { + "epoch": 0.12605910312047944, + "grad_norm": 1.2861477136611938, + "learning_rate": 0.0004832415149652378, + "loss": 1.4907, + "step": 1220 + }, + { + "epoch": 0.12709237445753255, + "grad_norm": 0.8996832370758057, + "learning_rate": 0.00048294518246745643, + "loss": 1.3579, + "step": 1230 + }, + { + "epoch": 0.12812564579458566, + "grad_norm": 1.4587196111679077, + "learning_rate": 0.0004826463454098542, + "loss": 1.3771, + "step": 1240 + }, + { + "epoch": 0.12915891713163877, + "grad_norm": 1.1128901243209839, + "learning_rate": 0.00048234500700544144, + "loss": 1.3366, + "step": 1250 + }, + { + "epoch": 0.13019218846869188, + "grad_norm": 1.0845853090286255, + "learning_rate": 0.0004820411704941223, + "loss": 1.3485, + "step": 1260 + }, + { + "epoch": 0.131225459805745, + "grad_norm": 1.1594990491867065, + "learning_rate": 0.00048173483914265976, + "loss": 1.4484, + "step": 1270 + }, + { + "epoch": 0.1322587311427981, + "grad_norm": 1.2621511220932007, + "learning_rate": 0.0004814260162446408, + "loss": 1.4147, + "step": 1280 + }, + { + "epoch": 0.1332920024798512, + "grad_norm": 1.6021978855133057, + "learning_rate": 0.00048111470512044065, + "loss": 1.3724, + "step": 1290 + }, + { + "epoch": 0.13432527381690432, + "grad_norm": 1.3557963371276855, + "learning_rate": 0.0004808009091171873, + "loss": 1.3844, + "step": 1300 + }, + { + "epoch": 0.13535854515395743, + "grad_norm": 1.341562271118164, + "learning_rate": 0.0004804846316087254, + "loss": 1.2895, + "step": 1310 + }, + { + "epoch": 0.13639181649101054, + "grad_norm": 1.1911447048187256, + "learning_rate": 0.00048016587599557996, + "loss": 1.3963, + "step": 1320 + }, + { + "epoch": 0.13742508782806365, + "grad_norm": 1.0620297193527222, + "learning_rate": 0.0004798446457049201, + "loss": 1.3621, + "step": 1330 + }, + { + "epoch": 0.13845835916511676, + "grad_norm": 1.1608030796051025, + "learning_rate": 0.0004795209441905217, + "loss": 1.3658, + "step": 1340 + }, + { + "epoch": 0.13949163050216987, + "grad_norm": 1.5855592489242554, + "learning_rate": 0.0004791947749327308, + "loss": 1.3683, + "step": 1350 + }, + { + "epoch": 0.14052490183922298, + "grad_norm": 1.5676512718200684, + "learning_rate": 0.0004788661414384255, + "loss": 1.3186, + "step": 1360 + }, + { + "epoch": 0.1415581731762761, + "grad_norm": 1.149056315422058, + "learning_rate": 0.0004785350472409792, + "loss": 1.3231, + "step": 1370 + }, + { + "epoch": 0.1425914445133292, + "grad_norm": 0.9153057336807251, + "learning_rate": 0.00047820149590022153, + "loss": 1.2976, + "step": 1380 + }, + { + "epoch": 0.1436247158503823, + "grad_norm": 1.2601430416107178, + "learning_rate": 0.00047786549100240107, + "loss": 1.4205, + "step": 1390 + }, + { + "epoch": 0.14465798718743542, + "grad_norm": 1.2112687826156616, + "learning_rate": 0.0004775270361601461, + "loss": 1.3192, + "step": 1400 + }, + { + "epoch": 0.14569125852448853, + "grad_norm": 1.1192986965179443, + "learning_rate": 0.00047718613501242604, + "loss": 1.355, + "step": 1410 + }, + { + "epoch": 0.14672452986154164, + "grad_norm": 1.2099722623825073, + "learning_rate": 0.00047684279122451223, + "loss": 1.3638, + "step": 1420 + }, + { + "epoch": 0.14775780119859475, + "grad_norm": 1.124934434890747, + "learning_rate": 0.00047649700848793886, + "loss": 1.331, + "step": 1430 + }, + { + "epoch": 0.14879107253564786, + "grad_norm": 1.2407630681991577, + "learning_rate": 0.00047614879052046254, + "loss": 1.5308, + "step": 1440 + }, + { + "epoch": 0.14982434387270097, + "grad_norm": 1.2991951704025269, + "learning_rate": 0.00047579814106602316, + "loss": 1.3365, + "step": 1450 + }, + { + "epoch": 0.15085761520975408, + "grad_norm": 1.3195995092391968, + "learning_rate": 0.0004754450638947032, + "loss": 1.3209, + "step": 1460 + }, + { + "epoch": 0.15189088654680719, + "grad_norm": 1.4838430881500244, + "learning_rate": 0.000475089562802687, + "loss": 1.4705, + "step": 1470 + }, + { + "epoch": 0.1529241578838603, + "grad_norm": 1.126265287399292, + "learning_rate": 0.0004747316416122205, + "loss": 1.3316, + "step": 1480 + }, + { + "epoch": 0.1539574292209134, + "grad_norm": 1.0109881162643433, + "learning_rate": 0.00047437130417156973, + "loss": 1.4045, + "step": 1490 + }, + { + "epoch": 0.15499070055796652, + "grad_norm": 0.8355922698974609, + "learning_rate": 0.00047400855435497945, + "loss": 1.3058, + "step": 1500 + }, + { + "epoch": 0.15602397189501963, + "grad_norm": 1.406782865524292, + "learning_rate": 0.00047364339606263185, + "loss": 1.2732, + "step": 1510 + }, + { + "epoch": 0.15705724323207274, + "grad_norm": 0.9180629849433899, + "learning_rate": 0.00047327583322060406, + "loss": 1.3472, + "step": 1520 + }, + { + "epoch": 0.15809051456912585, + "grad_norm": 1.1642565727233887, + "learning_rate": 0.00047290586978082654, + "loss": 1.4165, + "step": 1530 + }, + { + "epoch": 0.15912378590617896, + "grad_norm": 1.6167621612548828, + "learning_rate": 0.00047253350972104003, + "loss": 1.5038, + "step": 1540 + }, + { + "epoch": 0.16015705724323206, + "grad_norm": 1.1963645219802856, + "learning_rate": 0.00047215875704475314, + "loss": 1.3744, + "step": 1550 + }, + { + "epoch": 0.16119032858028517, + "grad_norm": 1.246496319770813, + "learning_rate": 0.00047178161578119925, + "loss": 1.3314, + "step": 1560 + }, + { + "epoch": 0.16222359991733828, + "grad_norm": 1.1050803661346436, + "learning_rate": 0.00047140208998529303, + "loss": 1.4149, + "step": 1570 + }, + { + "epoch": 0.1632568712543914, + "grad_norm": 1.379414439201355, + "learning_rate": 0.00047102018373758716, + "loss": 1.3817, + "step": 1580 + }, + { + "epoch": 0.1642901425914445, + "grad_norm": 1.0111711025238037, + "learning_rate": 0.00047063590114422804, + "loss": 1.2865, + "step": 1590 + }, + { + "epoch": 0.16532341392849761, + "grad_norm": 1.5682810544967651, + "learning_rate": 0.00047024924633691197, + "loss": 1.317, + "step": 1600 + }, + { + "epoch": 0.16635668526555072, + "grad_norm": 0.9005609154701233, + "learning_rate": 0.0004698602234728406, + "loss": 1.3659, + "step": 1610 + }, + { + "epoch": 0.16738995660260383, + "grad_norm": 1.00556218624115, + "learning_rate": 0.00046946883673467624, + "loss": 1.3373, + "step": 1620 + }, + { + "epoch": 0.16842322793965694, + "grad_norm": 1.387304425239563, + "learning_rate": 0.00046907509033049683, + "loss": 1.4858, + "step": 1630 + }, + { + "epoch": 0.16945649927671005, + "grad_norm": 1.2545139789581299, + "learning_rate": 0.0004686789884937509, + "loss": 1.4221, + "step": 1640 + }, + { + "epoch": 0.17048977061376316, + "grad_norm": 1.3092625141143799, + "learning_rate": 0.00046828053548321187, + "loss": 1.3609, + "step": 1650 + }, + { + "epoch": 0.17152304195081627, + "grad_norm": 1.0507373809814453, + "learning_rate": 0.0004678797355829322, + "loss": 1.3406, + "step": 1660 + }, + { + "epoch": 0.17255631328786938, + "grad_norm": 1.0271055698394775, + "learning_rate": 0.0004674765931021976, + "loss": 1.3098, + "step": 1670 + }, + { + "epoch": 0.1735895846249225, + "grad_norm": 1.1777933835983276, + "learning_rate": 0.0004670711123754804, + "loss": 1.2113, + "step": 1680 + }, + { + "epoch": 0.1746228559619756, + "grad_norm": 1.1888736486434937, + "learning_rate": 0.00046666329776239306, + "loss": 1.3137, + "step": 1690 + }, + { + "epoch": 0.1756561272990287, + "grad_norm": 1.4923595190048218, + "learning_rate": 0.00046625315364764156, + "loss": 1.23, + "step": 1700 + }, + { + "epoch": 0.17668939863608182, + "grad_norm": 1.3654290437698364, + "learning_rate": 0.00046584068444097766, + "loss": 1.3182, + "step": 1710 + }, + { + "epoch": 0.17772266997313493, + "grad_norm": 1.5486831665039062, + "learning_rate": 0.0004654258945771521, + "loss": 1.2854, + "step": 1720 + }, + { + "epoch": 0.17875594131018804, + "grad_norm": 1.3204468488693237, + "learning_rate": 0.0004650087885158667, + "loss": 1.3328, + "step": 1730 + }, + { + "epoch": 0.17978921264724115, + "grad_norm": 1.4773411750793457, + "learning_rate": 0.000464589370741726, + "loss": 1.3741, + "step": 1740 + }, + { + "epoch": 0.18082248398429426, + "grad_norm": 1.20607328414917, + "learning_rate": 0.00046416764576418993, + "loss": 1.26, + "step": 1750 + }, + { + "epoch": 0.18185575532134737, + "grad_norm": 1.2402129173278809, + "learning_rate": 0.00046374361811752436, + "loss": 1.2398, + "step": 1760 + }, + { + "epoch": 0.18288902665840048, + "grad_norm": 1.1439170837402344, + "learning_rate": 0.00046331729236075305, + "loss": 1.4529, + "step": 1770 + }, + { + "epoch": 0.1839222979954536, + "grad_norm": 1.268829107284546, + "learning_rate": 0.0004628886730776084, + "loss": 1.4535, + "step": 1780 + }, + { + "epoch": 0.1849555693325067, + "grad_norm": 1.37249755859375, + "learning_rate": 0.0004624577648764819, + "loss": 1.3249, + "step": 1790 + }, + { + "epoch": 0.1859888406695598, + "grad_norm": 1.2565507888793945, + "learning_rate": 0.0004620245723903751, + "loss": 1.2866, + "step": 1800 + }, + { + "epoch": 0.18702211200661292, + "grad_norm": 1.1188476085662842, + "learning_rate": 0.0004615891002768494, + "loss": 1.1712, + "step": 1810 + }, + { + "epoch": 0.18805538334366606, + "grad_norm": 1.3613345623016357, + "learning_rate": 0.00046115135321797617, + "loss": 1.1788, + "step": 1820 + }, + { + "epoch": 0.18908865468071917, + "grad_norm": 1.3465733528137207, + "learning_rate": 0.00046071133592028626, + "loss": 1.2391, + "step": 1830 + }, + { + "epoch": 0.19012192601777228, + "grad_norm": 1.2718288898468018, + "learning_rate": 0.00046026905311471946, + "loss": 1.2213, + "step": 1840 + }, + { + "epoch": 0.1911551973548254, + "grad_norm": 1.372878909111023, + "learning_rate": 0.0004598245095565738, + "loss": 1.2036, + "step": 1850 + }, + { + "epoch": 0.1921884686918785, + "grad_norm": 1.198723316192627, + "learning_rate": 0.00045937771002545403, + "loss": 1.192, + "step": 1860 + }, + { + "epoch": 0.1932217400289316, + "grad_norm": 0.9199055433273315, + "learning_rate": 0.00045892865932522077, + "loss": 1.2999, + "step": 1870 + }, + { + "epoch": 0.19425501136598472, + "grad_norm": 1.3330020904541016, + "learning_rate": 0.0004584773622839383, + "loss": 1.3587, + "step": 1880 + }, + { + "epoch": 0.19528828270303783, + "grad_norm": 1.228247046470642, + "learning_rate": 0.0004580238237538232, + "loss": 1.4654, + "step": 1890 + }, + { + "epoch": 0.19632155404009094, + "grad_norm": 1.6616053581237793, + "learning_rate": 0.0004575680486111915, + "loss": 1.1583, + "step": 1900 + }, + { + "epoch": 0.19735482537714405, + "grad_norm": 1.0149476528167725, + "learning_rate": 0.000457110041756407, + "loss": 1.2121, + "step": 1910 + }, + { + "epoch": 0.19838809671419716, + "grad_norm": 1.6001296043395996, + "learning_rate": 0.00045664980811382813, + "loss": 1.2938, + "step": 1920 + }, + { + "epoch": 0.19942136805125027, + "grad_norm": 1.263951301574707, + "learning_rate": 0.00045618735263175495, + "loss": 1.1885, + "step": 1930 + }, + { + "epoch": 0.20045463938830338, + "grad_norm": 1.4672034978866577, + "learning_rate": 0.00045572268028237637, + "loss": 1.2981, + "step": 1940 + }, + { + "epoch": 0.2014879107253565, + "grad_norm": 1.1751412153244019, + "learning_rate": 0.0004552557960617161, + "loss": 1.117, + "step": 1950 + }, + { + "epoch": 0.2025211820624096, + "grad_norm": 1.131339192390442, + "learning_rate": 0.0004547867049895796, + "loss": 1.1909, + "step": 1960 + }, + { + "epoch": 0.2035544533994627, + "grad_norm": 0.991729736328125, + "learning_rate": 0.0004543154121094996, + "loss": 1.1328, + "step": 1970 + }, + { + "epoch": 0.20458772473651582, + "grad_norm": 0.9073534607887268, + "learning_rate": 0.0004538419224886819, + "loss": 0.9973, + "step": 1980 + }, + { + "epoch": 0.20562099607356893, + "grad_norm": 1.2480231523513794, + "learning_rate": 0.00045336624121795124, + "loss": 1.2367, + "step": 1990 + }, + { + "epoch": 0.20665426741062204, + "grad_norm": 1.28248131275177, + "learning_rate": 0.0004528883734116963, + "loss": 1.1738, + "step": 2000 + }, + { + "epoch": 0.20768753874767515, + "grad_norm": 1.386885166168213, + "learning_rate": 0.00045240832420781474, + "loss": 1.3788, + "step": 2010 + }, + { + "epoch": 0.20872081008472826, + "grad_norm": 1.1158126592636108, + "learning_rate": 0.0004519260987676579, + "loss": 1.3046, + "step": 2020 + }, + { + "epoch": 0.20975408142178137, + "grad_norm": 1.162636399269104, + "learning_rate": 0.00045144170227597545, + "loss": 1.2355, + "step": 2030 + }, + { + "epoch": 0.21078735275883448, + "grad_norm": 1.4077121019363403, + "learning_rate": 0.0004509551399408598, + "loss": 1.1542, + "step": 2040 + }, + { + "epoch": 0.2118206240958876, + "grad_norm": 1.1237014532089233, + "learning_rate": 0.00045046641699368953, + "loss": 1.3411, + "step": 2050 + }, + { + "epoch": 0.2128538954329407, + "grad_norm": 1.1503592729568481, + "learning_rate": 0.0004499755386890736, + "loss": 1.1783, + "step": 2060 + }, + { + "epoch": 0.2138871667699938, + "grad_norm": 1.2852717638015747, + "learning_rate": 0.0004494825103047949, + "loss": 1.3331, + "step": 2070 + }, + { + "epoch": 0.21492043810704692, + "grad_norm": 1.2611557245254517, + "learning_rate": 0.0004489873371417534, + "loss": 1.3272, + "step": 2080 + }, + { + "epoch": 0.21595370944410003, + "grad_norm": 1.4016764163970947, + "learning_rate": 0.00044849002452390874, + "loss": 1.1658, + "step": 2090 + }, + { + "epoch": 0.21698698078115314, + "grad_norm": 1.4920654296875, + "learning_rate": 0.0004479905777982238, + "loss": 1.2651, + "step": 2100 + }, + { + "epoch": 0.21802025211820625, + "grad_norm": 1.1262540817260742, + "learning_rate": 0.0004474890023346066, + "loss": 1.2347, + "step": 2110 + }, + { + "epoch": 0.21905352345525936, + "grad_norm": 1.2998685836791992, + "learning_rate": 0.0004469853035258526, + "loss": 1.3167, + "step": 2120 + }, + { + "epoch": 0.22008679479231247, + "grad_norm": 1.0999696254730225, + "learning_rate": 0.0004464794867875871, + "loss": 1.1621, + "step": 2130 + }, + { + "epoch": 0.22112006612936558, + "grad_norm": 0.9801570177078247, + "learning_rate": 0.0004459715575582066, + "loss": 1.1631, + "step": 2140 + }, + { + "epoch": 0.2221533374664187, + "grad_norm": 1.161098837852478, + "learning_rate": 0.00044546152129882054, + "loss": 1.1256, + "step": 2150 + }, + { + "epoch": 0.2231866088034718, + "grad_norm": 1.2714890241622925, + "learning_rate": 0.0004449493834931927, + "loss": 1.1182, + "step": 2160 + }, + { + "epoch": 0.2242198801405249, + "grad_norm": 0.9561129808425903, + "learning_rate": 0.0004444351496476818, + "loss": 1.2051, + "step": 2170 + }, + { + "epoch": 0.22525315147757802, + "grad_norm": 1.3160920143127441, + "learning_rate": 0.0004439188252911828, + "loss": 1.2268, + "step": 2180 + }, + { + "epoch": 0.22628642281463113, + "grad_norm": 1.1478712558746338, + "learning_rate": 0.0004434004159750671, + "loss": 1.1928, + "step": 2190 + }, + { + "epoch": 0.22731969415168424, + "grad_norm": 1.2426934242248535, + "learning_rate": 0.0004428799272731231, + "loss": 1.1548, + "step": 2200 + }, + { + "epoch": 0.22835296548873735, + "grad_norm": 1.3587703704833984, + "learning_rate": 0.0004423573647814961, + "loss": 1.1371, + "step": 2210 + }, + { + "epoch": 0.22938623682579046, + "grad_norm": 1.0638786554336548, + "learning_rate": 0.0004418327341186282, + "loss": 1.1668, + "step": 2220 + }, + { + "epoch": 0.23041950816284357, + "grad_norm": 1.1600587368011475, + "learning_rate": 0.00044130604092519794, + "loss": 1.1524, + "step": 2230 + }, + { + "epoch": 0.23145277949989668, + "grad_norm": 1.402921199798584, + "learning_rate": 0.0004407772908640595, + "loss": 1.3963, + "step": 2240 + }, + { + "epoch": 0.2324860508369498, + "grad_norm": 1.3043389320373535, + "learning_rate": 0.00044024648962018184, + "loss": 1.2359, + "step": 2250 + }, + { + "epoch": 0.2335193221740029, + "grad_norm": 1.1807844638824463, + "learning_rate": 0.0004397136429005879, + "loss": 1.366, + "step": 2260 + }, + { + "epoch": 0.234552593511056, + "grad_norm": 1.397064208984375, + "learning_rate": 0.00043917875643429284, + "loss": 1.234, + "step": 2270 + }, + { + "epoch": 0.23558586484810912, + "grad_norm": 1.3520923852920532, + "learning_rate": 0.0004386418359722426, + "loss": 1.1863, + "step": 2280 + }, + { + "epoch": 0.23661913618516223, + "grad_norm": 1.3684437274932861, + "learning_rate": 0.00043810288728725203, + "loss": 1.3008, + "step": 2290 + }, + { + "epoch": 0.23765240752221534, + "grad_norm": 1.1893550157546997, + "learning_rate": 0.0004375619161739428, + "loss": 1.0295, + "step": 2300 + }, + { + "epoch": 0.23868567885926845, + "grad_norm": 1.57694673538208, + "learning_rate": 0.0004370189284486814, + "loss": 1.2778, + "step": 2310 + }, + { + "epoch": 0.23971895019632156, + "grad_norm": 1.601629376411438, + "learning_rate": 0.0004364739299495162, + "loss": 1.0583, + "step": 2320 + }, + { + "epoch": 0.24075222153337467, + "grad_norm": 1.3246372938156128, + "learning_rate": 0.00043592692653611465, + "loss": 1.201, + "step": 2330 + }, + { + "epoch": 0.24178549287042778, + "grad_norm": 1.020665168762207, + "learning_rate": 0.0004353779240897008, + "loss": 1.3127, + "step": 2340 + }, + { + "epoch": 0.24281876420748089, + "grad_norm": 1.082587480545044, + "learning_rate": 0.00043482692851299146, + "loss": 1.225, + "step": 2350 + }, + { + "epoch": 0.243852035544534, + "grad_norm": 1.093636155128479, + "learning_rate": 0.00043427394573013314, + "loss": 1.0587, + "step": 2360 + }, + { + "epoch": 0.2448853068815871, + "grad_norm": 1.3400568962097168, + "learning_rate": 0.00043371898168663816, + "loss": 1.2375, + "step": 2370 + }, + { + "epoch": 0.24591857821864022, + "grad_norm": 1.223877191543579, + "learning_rate": 0.00043316204234932084, + "loss": 1.3076, + "step": 2380 + }, + { + "epoch": 0.24695184955569333, + "grad_norm": 1.4696872234344482, + "learning_rate": 0.0004326031337062333, + "loss": 1.251, + "step": 2390 + }, + { + "epoch": 0.24798512089274644, + "grad_norm": 1.3883171081542969, + "learning_rate": 0.00043204226176660107, + "loss": 1.2215, + "step": 2400 + }, + { + "epoch": 0.24901839222979955, + "grad_norm": 1.1045054197311401, + "learning_rate": 0.0004314794325607584, + "loss": 1.1014, + "step": 2410 + }, + { + "epoch": 0.25005166356685266, + "grad_norm": 1.273431658744812, + "learning_rate": 0.00043091465214008354, + "loss": 1.0845, + "step": 2420 + }, + { + "epoch": 0.25108493490390577, + "grad_norm": 1.0618531703948975, + "learning_rate": 0.0004303479265769337, + "loss": 1.1162, + "step": 2430 + }, + { + "epoch": 0.2521182062409589, + "grad_norm": 1.3230130672454834, + "learning_rate": 0.0004297792619645797, + "loss": 1.0385, + "step": 2440 + }, + { + "epoch": 0.253151477578012, + "grad_norm": 1.3020095825195312, + "learning_rate": 0.0004292086644171403, + "loss": 1.1548, + "step": 2450 + }, + { + "epoch": 0.2541847489150651, + "grad_norm": 1.3359605073928833, + "learning_rate": 0.0004286361400695169, + "loss": 1.3534, + "step": 2460 + }, + { + "epoch": 0.2552180202521182, + "grad_norm": 0.9598666429519653, + "learning_rate": 0.00042806169507732706, + "loss": 1.2432, + "step": 2470 + }, + { + "epoch": 0.2562512915891713, + "grad_norm": 0.9668743014335632, + "learning_rate": 0.00042748533561683865, + "loss": 1.2512, + "step": 2480 + }, + { + "epoch": 0.2572845629262244, + "grad_norm": 1.0706520080566406, + "learning_rate": 0.0004269070678849034, + "loss": 1.136, + "step": 2490 + }, + { + "epoch": 0.25831783426327753, + "grad_norm": 1.0470647811889648, + "learning_rate": 0.00042632689809889027, + "loss": 1.1211, + "step": 2500 + }, + { + "epoch": 0.25935110560033064, + "grad_norm": 1.1314566135406494, + "learning_rate": 0.0004257448324966183, + "loss": 1.0704, + "step": 2510 + }, + { + "epoch": 0.26038437693738375, + "grad_norm": 1.085749626159668, + "learning_rate": 0.00042516087733629004, + "loss": 1.1727, + "step": 2520 + }, + { + "epoch": 0.26141764827443686, + "grad_norm": 1.2759054899215698, + "learning_rate": 0.00042457503889642396, + "loss": 1.2864, + "step": 2530 + }, + { + "epoch": 0.26245091961149, + "grad_norm": 1.2468546628952026, + "learning_rate": 0.0004239873234757871, + "loss": 1.0734, + "step": 2540 + }, + { + "epoch": 0.2634841909485431, + "grad_norm": 1.2701605558395386, + "learning_rate": 0.00042339773739332706, + "loss": 1.1816, + "step": 2550 + }, + { + "epoch": 0.2645174622855962, + "grad_norm": 1.0027897357940674, + "learning_rate": 0.00042280628698810447, + "loss": 1.0726, + "step": 2560 + }, + { + "epoch": 0.2655507336226493, + "grad_norm": 1.4462809562683105, + "learning_rate": 0.0004222129786192245, + "loss": 1.278, + "step": 2570 + }, + { + "epoch": 0.2665840049597024, + "grad_norm": 1.241769552230835, + "learning_rate": 0.0004216178186657686, + "loss": 1.1829, + "step": 2580 + }, + { + "epoch": 0.2676172762967555, + "grad_norm": 1.6216635704040527, + "learning_rate": 0.00042102081352672594, + "loss": 1.1479, + "step": 2590 + }, + { + "epoch": 0.26865054763380863, + "grad_norm": 4.33831787109375, + "learning_rate": 0.0004204219696209248, + "loss": 1.1709, + "step": 2600 + }, + { + "epoch": 0.26968381897086174, + "grad_norm": 0.952393114566803, + "learning_rate": 0.00041982129338696296, + "loss": 1.1018, + "step": 2610 + }, + { + "epoch": 0.27071709030791485, + "grad_norm": 1.4484752416610718, + "learning_rate": 0.0004192187912831393, + "loss": 1.081, + "step": 2620 + }, + { + "epoch": 0.27175036164496796, + "grad_norm": 1.0843290090560913, + "learning_rate": 0.0004186144697873835, + "loss": 1.251, + "step": 2630 + }, + { + "epoch": 0.2727836329820211, + "grad_norm": 1.5165774822235107, + "learning_rate": 0.0004180083353971871, + "loss": 1.134, + "step": 2640 + }, + { + "epoch": 0.2738169043190742, + "grad_norm": 1.2171909809112549, + "learning_rate": 0.00041740039462953343, + "loss": 1.1763, + "step": 2650 + }, + { + "epoch": 0.2748501756561273, + "grad_norm": 1.0561660528182983, + "learning_rate": 0.00041679065402082724, + "loss": 1.1178, + "step": 2660 + }, + { + "epoch": 0.2758834469931804, + "grad_norm": 1.2738356590270996, + "learning_rate": 0.0004161791201268247, + "loss": 1.2379, + "step": 2670 + }, + { + "epoch": 0.2769167183302335, + "grad_norm": 1.127266764640808, + "learning_rate": 0.000415565799522563, + "loss": 1.2347, + "step": 2680 + }, + { + "epoch": 0.2779499896672866, + "grad_norm": 1.4078705310821533, + "learning_rate": 0.0004149506988022894, + "loss": 1.2155, + "step": 2690 + }, + { + "epoch": 0.27898326100433973, + "grad_norm": 0.9871230125427246, + "learning_rate": 0.0004143338245793906, + "loss": 1.1583, + "step": 2700 + }, + { + "epoch": 0.28001653234139284, + "grad_norm": 0.8731614947319031, + "learning_rate": 0.0004137151834863213, + "loss": 1.171, + "step": 2710 + }, + { + "epoch": 0.28104980367844595, + "grad_norm": 1.0814101696014404, + "learning_rate": 0.0004130947821745332, + "loss": 1.2112, + "step": 2720 + }, + { + "epoch": 0.28208307501549906, + "grad_norm": 1.695854902267456, + "learning_rate": 0.0004124726273144034, + "loss": 1.0269, + "step": 2730 + }, + { + "epoch": 0.2831163463525522, + "grad_norm": 0.9004189372062683, + "learning_rate": 0.00041184872559516244, + "loss": 1.1974, + "step": 2740 + }, + { + "epoch": 0.2841496176896053, + "grad_norm": 1.034590482711792, + "learning_rate": 0.0004112230837248229, + "loss": 1.1068, + "step": 2750 + }, + { + "epoch": 0.2851828890266584, + "grad_norm": 1.4124444723129272, + "learning_rate": 0.0004105957084301069, + "loss": 1.0922, + "step": 2760 + }, + { + "epoch": 0.2862161603637115, + "grad_norm": 1.4991061687469482, + "learning_rate": 0.00040996660645637345, + "loss": 1.2392, + "step": 2770 + }, + { + "epoch": 0.2872494317007646, + "grad_norm": 1.316277027130127, + "learning_rate": 0.00040933578456754687, + "loss": 1.0433, + "step": 2780 + }, + { + "epoch": 0.2882827030378177, + "grad_norm": 1.6624524593353271, + "learning_rate": 0.00040870324954604323, + "loss": 0.9623, + "step": 2790 + }, + { + "epoch": 0.28931597437487083, + "grad_norm": 1.2658953666687012, + "learning_rate": 0.00040806900819269766, + "loss": 1.108, + "step": 2800 + }, + { + "epoch": 0.29034924571192394, + "grad_norm": 1.3295769691467285, + "learning_rate": 0.00040743306732669133, + "loss": 1.137, + "step": 2810 + }, + { + "epoch": 0.29138251704897705, + "grad_norm": 1.6526551246643066, + "learning_rate": 0.0004067954337854783, + "loss": 1.1281, + "step": 2820 + }, + { + "epoch": 0.29241578838603016, + "grad_norm": 1.1587738990783691, + "learning_rate": 0.0004061561144247115, + "loss": 1.0086, + "step": 2830 + }, + { + "epoch": 0.29344905972308327, + "grad_norm": 1.454256534576416, + "learning_rate": 0.0004055151161181694, + "loss": 1.0723, + "step": 2840 + }, + { + "epoch": 0.2944823310601364, + "grad_norm": 1.476521611213684, + "learning_rate": 0.00040487244575768193, + "loss": 1.2486, + "step": 2850 + }, + { + "epoch": 0.2955156023971895, + "grad_norm": 0.9193058609962463, + "learning_rate": 0.00040422811025305676, + "loss": 1.1071, + "step": 2860 + }, + { + "epoch": 0.2965488737342426, + "grad_norm": 1.8984912633895874, + "learning_rate": 0.0004035821165320045, + "loss": 1.1724, + "step": 2870 + }, + { + "epoch": 0.2975821450712957, + "grad_norm": 1.7232333421707153, + "learning_rate": 0.00040293447154006435, + "loss": 1.174, + "step": 2880 + }, + { + "epoch": 0.2986154164083488, + "grad_norm": 0.9074342846870422, + "learning_rate": 0.0004022851822405297, + "loss": 1.2448, + "step": 2890 + }, + { + "epoch": 0.29964868774540193, + "grad_norm": 1.2896414995193481, + "learning_rate": 0.0004016342556143727, + "loss": 1.1207, + "step": 2900 + }, + { + "epoch": 0.30068195908245504, + "grad_norm": 1.264952301979065, + "learning_rate": 0.0004009816986601702, + "loss": 1.0868, + "step": 2910 + }, + { + "epoch": 0.30171523041950815, + "grad_norm": 1.1345736980438232, + "learning_rate": 0.00040032751839402727, + "loss": 1.1881, + "step": 2920 + }, + { + "epoch": 0.30274850175656126, + "grad_norm": 1.0581121444702148, + "learning_rate": 0.00039967172184950263, + "loss": 1.1796, + "step": 2930 + }, + { + "epoch": 0.30378177309361437, + "grad_norm": 1.1877319812774658, + "learning_rate": 0.0003990143160775327, + "loss": 1.108, + "step": 2940 + }, + { + "epoch": 0.3048150444306675, + "grad_norm": 1.0478935241699219, + "learning_rate": 0.0003983553081463559, + "loss": 0.9986, + "step": 2950 + }, + { + "epoch": 0.3058483157677206, + "grad_norm": 1.1299808025360107, + "learning_rate": 0.0003976947051414366, + "loss": 1.0221, + "step": 2960 + }, + { + "epoch": 0.3068815871047737, + "grad_norm": 1.0618709325790405, + "learning_rate": 0.00039703251416538883, + "loss": 0.9024, + "step": 2970 + }, + { + "epoch": 0.3079148584418268, + "grad_norm": 1.099435806274414, + "learning_rate": 0.0003963687423379003, + "loss": 1.2754, + "step": 2980 + }, + { + "epoch": 0.3089481297788799, + "grad_norm": 1.1851657629013062, + "learning_rate": 0.00039570339679565527, + "loss": 1.1154, + "step": 2990 + }, + { + "epoch": 0.30998140111593303, + "grad_norm": 1.1812468767166138, + "learning_rate": 0.00039503648469225826, + "loss": 0.9979, + "step": 3000 + }, + { + "epoch": 0.31101467245298614, + "grad_norm": 1.1941215991973877, + "learning_rate": 0.00039436801319815717, + "loss": 1.0625, + "step": 3010 + }, + { + "epoch": 0.31204794379003925, + "grad_norm": 0.9969115853309631, + "learning_rate": 0.00039369798950056567, + "loss": 1.189, + "step": 3020 + }, + { + "epoch": 0.31308121512709236, + "grad_norm": 1.1364282369613647, + "learning_rate": 0.00039302642080338664, + "loss": 1.107, + "step": 3030 + }, + { + "epoch": 0.31411448646414547, + "grad_norm": 1.234078288078308, + "learning_rate": 0.0003923533143271341, + "loss": 1.238, + "step": 3040 + }, + { + "epoch": 0.3151477578011986, + "grad_norm": 1.0862330198287964, + "learning_rate": 0.0003916786773088559, + "loss": 1.1151, + "step": 3050 + }, + { + "epoch": 0.3161810291382517, + "grad_norm": 0.7923426032066345, + "learning_rate": 0.0003910025170020558, + "loss": 1.1031, + "step": 3060 + }, + { + "epoch": 0.3172143004753048, + "grad_norm": 1.6818230152130127, + "learning_rate": 0.0003903248406766158, + "loss": 1.0546, + "step": 3070 + }, + { + "epoch": 0.3182475718123579, + "grad_norm": 1.0749332904815674, + "learning_rate": 0.00038964565561871723, + "loss": 1.1412, + "step": 3080 + }, + { + "epoch": 0.319280843149411, + "grad_norm": 1.6896083354949951, + "learning_rate": 0.00038896496913076327, + "loss": 1.0668, + "step": 3090 + }, + { + "epoch": 0.32031411448646413, + "grad_norm": 1.24112069606781, + "learning_rate": 0.0003882827885312998, + "loss": 1.2632, + "step": 3100 + }, + { + "epoch": 0.32134738582351724, + "grad_norm": 1.2017937898635864, + "learning_rate": 0.00038759912115493724, + "loss": 1.0969, + "step": 3110 + }, + { + "epoch": 0.32238065716057035, + "grad_norm": 1.20210599899292, + "learning_rate": 0.00038691397435227097, + "loss": 0.9202, + "step": 3120 + }, + { + "epoch": 0.32341392849762346, + "grad_norm": 1.408257246017456, + "learning_rate": 0.0003862273554898032, + "loss": 1.1583, + "step": 3130 + }, + { + "epoch": 0.32444719983467657, + "grad_norm": 1.5274341106414795, + "learning_rate": 0.000385539271949863, + "loss": 1.0382, + "step": 3140 + }, + { + "epoch": 0.3254804711717297, + "grad_norm": 1.1086093187332153, + "learning_rate": 0.00038484973113052736, + "loss": 1.1232, + "step": 3150 + }, + { + "epoch": 0.3265137425087828, + "grad_norm": 1.35590660572052, + "learning_rate": 0.0003841587404455413, + "loss": 1.1019, + "step": 3160 + }, + { + "epoch": 0.3275470138458359, + "grad_norm": 1.0598667860031128, + "learning_rate": 0.000383466307324239, + "loss": 1.1557, + "step": 3170 + }, + { + "epoch": 0.328580285182889, + "grad_norm": 1.1116646528244019, + "learning_rate": 0.00038277243921146253, + "loss": 1.0634, + "step": 3180 + }, + { + "epoch": 0.3296135565199421, + "grad_norm": 1.4738123416900635, + "learning_rate": 0.0003820771435674829, + "loss": 1.0067, + "step": 3190 + }, + { + "epoch": 0.33064682785699523, + "grad_norm": 1.3569648265838623, + "learning_rate": 0.00038138042786791987, + "loss": 1.0798, + "step": 3200 + }, + { + "epoch": 0.33168009919404834, + "grad_norm": 1.073232650756836, + "learning_rate": 0.00038068229960366054, + "loss": 1.0349, + "step": 3210 + }, + { + "epoch": 0.33271337053110145, + "grad_norm": 1.4980833530426025, + "learning_rate": 0.0003799827662807801, + "loss": 1.1281, + "step": 3220 + }, + { + "epoch": 0.33374664186815456, + "grad_norm": 1.4308544397354126, + "learning_rate": 0.0003792818354204601, + "loss": 1.1056, + "step": 3230 + }, + { + "epoch": 0.33477991320520767, + "grad_norm": 1.5736769437789917, + "learning_rate": 0.0003785795145589085, + "loss": 1.0928, + "step": 3240 + }, + { + "epoch": 0.3358131845422608, + "grad_norm": 1.5231635570526123, + "learning_rate": 0.0003778758112472776, + "loss": 1.2857, + "step": 3250 + }, + { + "epoch": 0.3368464558793139, + "grad_norm": 1.0374634265899658, + "learning_rate": 0.00037717073305158376, + "loss": 1.1738, + "step": 3260 + }, + { + "epoch": 0.337879727216367, + "grad_norm": 1.1238751411437988, + "learning_rate": 0.0003764642875526256, + "loss": 0.9849, + "step": 3270 + }, + { + "epoch": 0.3389129985534201, + "grad_norm": 1.039632797241211, + "learning_rate": 0.0003757564823459025, + "loss": 1.059, + "step": 3280 + }, + { + "epoch": 0.3399462698904732, + "grad_norm": 1.1179864406585693, + "learning_rate": 0.0003750473250415334, + "loss": 0.9745, + "step": 3290 + }, + { + "epoch": 0.34097954122752633, + "grad_norm": 1.1467711925506592, + "learning_rate": 0.0003743368232641741, + "loss": 1.1077, + "step": 3300 + }, + { + "epoch": 0.34201281256457944, + "grad_norm": 1.0984638929367065, + "learning_rate": 0.00037362498465293604, + "loss": 1.0454, + "step": 3310 + }, + { + "epoch": 0.34304608390163255, + "grad_norm": 1.0687668323516846, + "learning_rate": 0.000372911816861304, + "loss": 1.073, + "step": 3320 + }, + { + "epoch": 0.34407935523868566, + "grad_norm": 1.6367201805114746, + "learning_rate": 0.00037219732755705353, + "loss": 1.1094, + "step": 3330 + }, + { + "epoch": 0.34511262657573877, + "grad_norm": 1.2905350923538208, + "learning_rate": 0.0003714815244221689, + "loss": 1.1536, + "step": 3340 + }, + { + "epoch": 0.3461458979127919, + "grad_norm": 0.9039587378501892, + "learning_rate": 0.00037076441515276003, + "loss": 1.1906, + "step": 3350 + }, + { + "epoch": 0.347179169249845, + "grad_norm": 1.3063287734985352, + "learning_rate": 0.0003700460074589804, + "loss": 1.1448, + "step": 3360 + }, + { + "epoch": 0.3482124405868981, + "grad_norm": 1.5665302276611328, + "learning_rate": 0.00036932630906494346, + "loss": 1.2537, + "step": 3370 + }, + { + "epoch": 0.3492457119239512, + "grad_norm": 1.0483524799346924, + "learning_rate": 0.0003686053277086401, + "loss": 1.1135, + "step": 3380 + }, + { + "epoch": 0.3502789832610043, + "grad_norm": 1.2478828430175781, + "learning_rate": 0.0003678830711418551, + "loss": 1.0816, + "step": 3390 + }, + { + "epoch": 0.3513122545980574, + "grad_norm": 1.14175283908844, + "learning_rate": 0.00036715954713008406, + "loss": 1.1217, + "step": 3400 + }, + { + "epoch": 0.35234552593511054, + "grad_norm": 1.0000718832015991, + "learning_rate": 0.0003664347634524497, + "loss": 1.1127, + "step": 3410 + }, + { + "epoch": 0.35337879727216365, + "grad_norm": 1.5337084531784058, + "learning_rate": 0.00036570872790161834, + "loss": 1.0619, + "step": 3420 + }, + { + "epoch": 0.35441206860921676, + "grad_norm": 0.878587543964386, + "learning_rate": 0.00036498144828371604, + "loss": 1.1094, + "step": 3430 + }, + { + "epoch": 0.35544533994626987, + "grad_norm": 1.3913145065307617, + "learning_rate": 0.0003642529324182449, + "loss": 0.9874, + "step": 3440 + }, + { + "epoch": 0.356478611283323, + "grad_norm": 1.388934850692749, + "learning_rate": 0.0003635231881379985, + "loss": 1.0177, + "step": 3450 + }, + { + "epoch": 0.3575118826203761, + "grad_norm": 1.2259199619293213, + "learning_rate": 0.00036279222328897826, + "loss": 1.0533, + "step": 3460 + }, + { + "epoch": 0.3585451539574292, + "grad_norm": 1.561023235321045, + "learning_rate": 0.0003620600457303085, + "loss": 1.1231, + "step": 3470 + }, + { + "epoch": 0.3595784252944823, + "grad_norm": 1.6612300872802734, + "learning_rate": 0.0003613266633341528, + "loss": 1.0971, + "step": 3480 + }, + { + "epoch": 0.3606116966315354, + "grad_norm": 1.3272353410720825, + "learning_rate": 0.00036059208398562793, + "loss": 1.024, + "step": 3490 + }, + { + "epoch": 0.3616449679685885, + "grad_norm": 1.2441381216049194, + "learning_rate": 0.00035985631558272074, + "loss": 1.0369, + "step": 3500 + }, + { + "epoch": 0.36267823930564164, + "grad_norm": 1.1805377006530762, + "learning_rate": 0.00035911936603620225, + "loss": 0.9626, + "step": 3510 + }, + { + "epoch": 0.36371151064269475, + "grad_norm": 1.3905142545700073, + "learning_rate": 0.00035838124326954254, + "loss": 1.1415, + "step": 3520 + }, + { + "epoch": 0.36474478197974786, + "grad_norm": 1.2646347284317017, + "learning_rate": 0.0003576419552188261, + "loss": 1.0101, + "step": 3530 + }, + { + "epoch": 0.36577805331680097, + "grad_norm": 1.1667832136154175, + "learning_rate": 0.000356901509832666, + "loss": 0.9862, + "step": 3540 + }, + { + "epoch": 0.3668113246538541, + "grad_norm": 0.8571506142616272, + "learning_rate": 0.00035615991507211887, + "loss": 0.9644, + "step": 3550 + }, + { + "epoch": 0.3678445959909072, + "grad_norm": 0.9799628853797913, + "learning_rate": 0.00035541717891059886, + "loss": 0.8918, + "step": 3560 + }, + { + "epoch": 0.3688778673279603, + "grad_norm": 1.0068583488464355, + "learning_rate": 0.0003546733093337924, + "loss": 0.935, + "step": 3570 + }, + { + "epoch": 0.3699111386650134, + "grad_norm": 0.8947041034698486, + "learning_rate": 0.0003539283143395719, + "loss": 1.0752, + "step": 3580 + }, + { + "epoch": 0.3709444100020665, + "grad_norm": 1.0797079801559448, + "learning_rate": 0.00035318220193790976, + "loss": 0.9534, + "step": 3590 + }, + { + "epoch": 0.3719776813391196, + "grad_norm": 1.0746815204620361, + "learning_rate": 0.0003524349801507929, + "loss": 0.9862, + "step": 3600 + }, + { + "epoch": 0.37301095267617274, + "grad_norm": 1.0419459342956543, + "learning_rate": 0.0003516866570121356, + "loss": 1.0606, + "step": 3610 + }, + { + "epoch": 0.37404422401322585, + "grad_norm": 1.1848483085632324, + "learning_rate": 0.0003509372405676937, + "loss": 0.97, + "step": 3620 + }, + { + "epoch": 0.375077495350279, + "grad_norm": 1.2240206003189087, + "learning_rate": 0.00035018673887497807, + "loss": 1.0237, + "step": 3630 + }, + { + "epoch": 0.3761107666873321, + "grad_norm": 1.6243517398834229, + "learning_rate": 0.0003494351600031677, + "loss": 1.0748, + "step": 3640 + }, + { + "epoch": 0.37714403802438523, + "grad_norm": 1.4265719652175903, + "learning_rate": 0.00034868251203302314, + "loss": 1.0639, + "step": 3650 + }, + { + "epoch": 0.37817730936143834, + "grad_norm": 1.0866336822509766, + "learning_rate": 0.00034792880305679967, + "loss": 0.9635, + "step": 3660 + }, + { + "epoch": 0.37921058069849145, + "grad_norm": 1.0087511539459229, + "learning_rate": 0.00034717404117816, + "loss": 1.1082, + "step": 3670 + }, + { + "epoch": 0.38024385203554456, + "grad_norm": 0.9332922697067261, + "learning_rate": 0.0003464182345120877, + "loss": 1.0258, + "step": 3680 + }, + { + "epoch": 0.38127712337259767, + "grad_norm": 0.9739323854446411, + "learning_rate": 0.000345661391184799, + "loss": 1.0168, + "step": 3690 + }, + { + "epoch": 0.3823103947096508, + "grad_norm": 0.912225067615509, + "learning_rate": 0.00034490351933365673, + "loss": 1.0996, + "step": 3700 + }, + { + "epoch": 0.3833436660467039, + "grad_norm": 0.853449285030365, + "learning_rate": 0.0003441446271070816, + "loss": 0.9759, + "step": 3710 + }, + { + "epoch": 0.384376937383757, + "grad_norm": 1.4455146789550781, + "learning_rate": 0.0003433847226644653, + "loss": 0.9917, + "step": 3720 + }, + { + "epoch": 0.3854102087208101, + "grad_norm": 0.9426379799842834, + "learning_rate": 0.0003426238141760826, + "loss": 1.1549, + "step": 3730 + }, + { + "epoch": 0.3864434800578632, + "grad_norm": 1.3207985162734985, + "learning_rate": 0.00034186190982300323, + "loss": 1.0742, + "step": 3740 + }, + { + "epoch": 0.38747675139491633, + "grad_norm": 0.9599164128303528, + "learning_rate": 0.00034109901779700474, + "loss": 1.0011, + "step": 3750 + }, + { + "epoch": 0.38851002273196944, + "grad_norm": 1.0271940231323242, + "learning_rate": 0.0003403351463004831, + "loss": 0.9508, + "step": 3760 + }, + { + "epoch": 0.38954329406902255, + "grad_norm": 1.680550456047058, + "learning_rate": 0.0003395703035463659, + "loss": 1.134, + "step": 3770 + }, + { + "epoch": 0.39057656540607566, + "grad_norm": 1.4388446807861328, + "learning_rate": 0.00033880449775802306, + "loss": 1.1811, + "step": 3780 + }, + { + "epoch": 0.39160983674312877, + "grad_norm": 1.1560338735580444, + "learning_rate": 0.00033803773716917896, + "loss": 0.8677, + "step": 3790 + }, + { + "epoch": 0.3926431080801819, + "grad_norm": 1.414915680885315, + "learning_rate": 0.00033727003002382346, + "loss": 1.0965, + "step": 3800 + }, + { + "epoch": 0.393676379417235, + "grad_norm": 1.1397249698638916, + "learning_rate": 0.0003365013845761239, + "loss": 1.0057, + "step": 3810 + }, + { + "epoch": 0.3947096507542881, + "grad_norm": 1.470551609992981, + "learning_rate": 0.00033573180909033567, + "loss": 1.1104, + "step": 3820 + }, + { + "epoch": 0.3957429220913412, + "grad_norm": 0.9758129119873047, + "learning_rate": 0.0003349613118407138, + "loss": 1.0244, + "step": 3830 + }, + { + "epoch": 0.3967761934283943, + "grad_norm": 0.9483941793441772, + "learning_rate": 0.0003341899011114238, + "loss": 1.0993, + "step": 3840 + }, + { + "epoch": 0.39780946476544743, + "grad_norm": 1.0213124752044678, + "learning_rate": 0.00033341758519645273, + "loss": 0.9506, + "step": 3850 + }, + { + "epoch": 0.39884273610250054, + "grad_norm": 1.0135273933410645, + "learning_rate": 0.0003326443723995199, + "loss": 0.993, + "step": 3860 + }, + { + "epoch": 0.39987600743955365, + "grad_norm": 0.9621232748031616, + "learning_rate": 0.0003318702710339875, + "loss": 0.9969, + "step": 3870 + }, + { + "epoch": 0.40090927877660676, + "grad_norm": 1.1387019157409668, + "learning_rate": 0.0003310952894227718, + "loss": 1.0203, + "step": 3880 + }, + { + "epoch": 0.40194255011365987, + "grad_norm": 1.406870722770691, + "learning_rate": 0.0003303194358982526, + "loss": 1.0166, + "step": 3890 + }, + { + "epoch": 0.402975821450713, + "grad_norm": 1.339579701423645, + "learning_rate": 0.00032954271880218465, + "loss": 0.9749, + "step": 3900 + }, + { + "epoch": 0.4040090927877661, + "grad_norm": 1.2406803369522095, + "learning_rate": 0.00032876514648560757, + "loss": 0.943, + "step": 3910 + }, + { + "epoch": 0.4050423641248192, + "grad_norm": 1.172455072402954, + "learning_rate": 0.00032798672730875603, + "loss": 0.9874, + "step": 3920 + }, + { + "epoch": 0.4060756354618723, + "grad_norm": 1.2804698944091797, + "learning_rate": 0.0003272074696409697, + "loss": 0.8747, + "step": 3930 + }, + { + "epoch": 0.4071089067989254, + "grad_norm": 1.3023015260696411, + "learning_rate": 0.00032642738186060375, + "loss": 1.0686, + "step": 3940 + }, + { + "epoch": 0.40814217813597853, + "grad_norm": 1.0053542852401733, + "learning_rate": 0.00032564647235493833, + "loss": 1.074, + "step": 3950 + }, + { + "epoch": 0.40917544947303164, + "grad_norm": 1.0846545696258545, + "learning_rate": 0.0003248647495200886, + "loss": 1.0043, + "step": 3960 + }, + { + "epoch": 0.41020872081008475, + "grad_norm": 1.0012749433517456, + "learning_rate": 0.00032408222176091427, + "loss": 1.0302, + "step": 3970 + }, + { + "epoch": 0.41124199214713786, + "grad_norm": 1.074317455291748, + "learning_rate": 0.0003232988974909296, + "loss": 0.9389, + "step": 3980 + }, + { + "epoch": 0.41227526348419097, + "grad_norm": 1.5110340118408203, + "learning_rate": 0.00032251478513221254, + "loss": 0.9772, + "step": 3990 + }, + { + "epoch": 0.4133085348212441, + "grad_norm": 0.9564982652664185, + "learning_rate": 0.00032172989311531427, + "loss": 1.0754, + "step": 4000 + }, + { + "epoch": 0.4143418061582972, + "grad_norm": 1.091377854347229, + "learning_rate": 0.0003209442298791689, + "loss": 0.9343, + "step": 4010 + }, + { + "epoch": 0.4153750774953503, + "grad_norm": 1.3888299465179443, + "learning_rate": 0.00032015780387100227, + "loss": 0.9448, + "step": 4020 + }, + { + "epoch": 0.4164083488324034, + "grad_norm": 0.9059675335884094, + "learning_rate": 0.0003193706235462412, + "loss": 0.9877, + "step": 4030 + }, + { + "epoch": 0.4174416201694565, + "grad_norm": 1.5517827272415161, + "learning_rate": 0.0003185826973684231, + "loss": 1.0814, + "step": 4040 + }, + { + "epoch": 0.4184748915065096, + "grad_norm": 1.1768487691879272, + "learning_rate": 0.00031779403380910427, + "loss": 1.0654, + "step": 4050 + }, + { + "epoch": 0.41950816284356274, + "grad_norm": 1.0996865034103394, + "learning_rate": 0.0003170046413477692, + "loss": 1.0507, + "step": 4060 + }, + { + "epoch": 0.42054143418061585, + "grad_norm": 1.7401593923568726, + "learning_rate": 0.00031621452847173923, + "loss": 1.1611, + "step": 4070 + }, + { + "epoch": 0.42157470551766896, + "grad_norm": 1.4233949184417725, + "learning_rate": 0.00031542370367608176, + "loss": 1.0043, + "step": 4080 + }, + { + "epoch": 0.42260797685472207, + "grad_norm": 1.1471216678619385, + "learning_rate": 0.00031463217546351803, + "loss": 1.0389, + "step": 4090 + }, + { + "epoch": 0.4236412481917752, + "grad_norm": 0.9187898635864258, + "learning_rate": 0.0003138399523443326, + "loss": 0.9048, + "step": 4100 + }, + { + "epoch": 0.4246745195288283, + "grad_norm": 1.0712145566940308, + "learning_rate": 0.0003130470428362811, + "loss": 0.9538, + "step": 4110 + }, + { + "epoch": 0.4257077908658814, + "grad_norm": 1.3978677988052368, + "learning_rate": 0.0003122534554644995, + "loss": 0.9093, + "step": 4120 + }, + { + "epoch": 0.4267410622029345, + "grad_norm": 0.9700504541397095, + "learning_rate": 0.0003114591987614117, + "loss": 0.984, + "step": 4130 + }, + { + "epoch": 0.4277743335399876, + "grad_norm": 1.1521259546279907, + "learning_rate": 0.0003106642812666379, + "loss": 0.9965, + "step": 4140 + }, + { + "epoch": 0.4288076048770407, + "grad_norm": 1.2085431814193726, + "learning_rate": 0.0003098687115269034, + "loss": 0.9018, + "step": 4150 + }, + { + "epoch": 0.42984087621409384, + "grad_norm": 1.1607948541641235, + "learning_rate": 0.0003090724980959457, + "loss": 1.0776, + "step": 4160 + }, + { + "epoch": 0.43087414755114695, + "grad_norm": 1.4290366172790527, + "learning_rate": 0.0003082756495344234, + "loss": 0.9008, + "step": 4170 + }, + { + "epoch": 0.43190741888820006, + "grad_norm": 1.3393192291259766, + "learning_rate": 0.00030747817440982385, + "loss": 0.9175, + "step": 4180 + }, + { + "epoch": 0.43294069022525317, + "grad_norm": 1.4794620275497437, + "learning_rate": 0.00030668008129637104, + "loss": 1.0164, + "step": 4190 + }, + { + "epoch": 0.4339739615623063, + "grad_norm": 0.5630869269371033, + "learning_rate": 0.00030588137877493317, + "loss": 1.0511, + "step": 4200 + }, + { + "epoch": 0.4350072328993594, + "grad_norm": 1.0285252332687378, + "learning_rate": 0.0003050820754329309, + "loss": 0.8834, + "step": 4210 + }, + { + "epoch": 0.4360405042364125, + "grad_norm": 1.4534152746200562, + "learning_rate": 0.00030428217986424444, + "loss": 1.0248, + "step": 4220 + }, + { + "epoch": 0.4370737755734656, + "grad_norm": 1.7549463510513306, + "learning_rate": 0.0003034817006691219, + "loss": 0.9278, + "step": 4230 + }, + { + "epoch": 0.4381070469105187, + "grad_norm": 1.1008073091506958, + "learning_rate": 0.0003026806464540859, + "loss": 1.0172, + "step": 4240 + }, + { + "epoch": 0.4391403182475718, + "grad_norm": 1.042400598526001, + "learning_rate": 0.0003018790258318417, + "loss": 0.9703, + "step": 4250 + }, + { + "epoch": 0.44017358958462494, + "grad_norm": 1.428467035293579, + "learning_rate": 0.00030107684742118466, + "loss": 0.9187, + "step": 4260 + }, + { + "epoch": 0.44120686092167805, + "grad_norm": 0.8269082903862, + "learning_rate": 0.0003002741198469068, + "loss": 0.8501, + "step": 4270 + }, + { + "epoch": 0.44224013225873116, + "grad_norm": 1.0836302042007446, + "learning_rate": 0.00029947085173970506, + "loss": 0.9252, + "step": 4280 + }, + { + "epoch": 0.44327340359578427, + "grad_norm": 0.8747875690460205, + "learning_rate": 0.00029866705173608776, + "loss": 0.8576, + "step": 4290 + }, + { + "epoch": 0.4443066749328374, + "grad_norm": 1.261354923248291, + "learning_rate": 0.0002978627284782821, + "loss": 0.8515, + "step": 4300 + }, + { + "epoch": 0.4453399462698905, + "grad_norm": 1.2424492835998535, + "learning_rate": 0.0002970578906141411, + "loss": 0.9274, + "step": 4310 + }, + { + "epoch": 0.4463732176069436, + "grad_norm": 1.6803146600723267, + "learning_rate": 0.00029625254679705094, + "loss": 1.1236, + "step": 4320 + }, + { + "epoch": 0.4474064889439967, + "grad_norm": 1.284347653388977, + "learning_rate": 0.00029544670568583734, + "loss": 0.9698, + "step": 4330 + }, + { + "epoch": 0.4484397602810498, + "grad_norm": 0.7693723440170288, + "learning_rate": 0.00029464037594467284, + "loss": 0.9518, + "step": 4340 + }, + { + "epoch": 0.4494730316181029, + "grad_norm": 1.0315730571746826, + "learning_rate": 0.00029383356624298383, + "loss": 1.0873, + "step": 4350 + }, + { + "epoch": 0.45050630295515603, + "grad_norm": 0.8411365747451782, + "learning_rate": 0.0002930262852553568, + "loss": 1.0025, + "step": 4360 + }, + { + "epoch": 0.45153957429220914, + "grad_norm": 0.8453785181045532, + "learning_rate": 0.0002922185416614456, + "loss": 0.8811, + "step": 4370 + }, + { + "epoch": 0.45257284562926225, + "grad_norm": 1.2322771549224854, + "learning_rate": 0.0002914103441458775, + "loss": 1.0246, + "step": 4380 + }, + { + "epoch": 0.45360611696631536, + "grad_norm": 1.2167178392410278, + "learning_rate": 0.0002906017013981609, + "loss": 0.9645, + "step": 4390 + }, + { + "epoch": 0.4546393883033685, + "grad_norm": 1.0350911617279053, + "learning_rate": 0.0002897926221125906, + "loss": 0.9242, + "step": 4400 + }, + { + "epoch": 0.4556726596404216, + "grad_norm": 1.1375268697738647, + "learning_rate": 0.0002889831149881551, + "loss": 0.945, + "step": 4410 + }, + { + "epoch": 0.4567059309774747, + "grad_norm": 1.209620475769043, + "learning_rate": 0.0002881731887284429, + "loss": 1.0773, + "step": 4420 + }, + { + "epoch": 0.4577392023145278, + "grad_norm": 1.2932454347610474, + "learning_rate": 0.0002873628520415491, + "loss": 0.8517, + "step": 4430 + }, + { + "epoch": 0.4587724736515809, + "grad_norm": 1.0030730962753296, + "learning_rate": 0.0002865521136399814, + "loss": 0.8911, + "step": 4440 + }, + { + "epoch": 0.459805744988634, + "grad_norm": 0.9559937119483948, + "learning_rate": 0.0002857409822405666, + "loss": 0.9505, + "step": 4450 + }, + { + "epoch": 0.46083901632568713, + "grad_norm": 0.934845507144928, + "learning_rate": 0.00028492946656435704, + "loss": 1.0858, + "step": 4460 + }, + { + "epoch": 0.46187228766274024, + "grad_norm": 1.2203209400177002, + "learning_rate": 0.00028411757533653657, + "loss": 0.9919, + "step": 4470 + }, + { + "epoch": 0.46290555899979335, + "grad_norm": 1.3204402923583984, + "learning_rate": 0.000283305317286327, + "loss": 0.9198, + "step": 4480 + }, + { + "epoch": 0.46393883033684646, + "grad_norm": 0.7426913380622864, + "learning_rate": 0.0002824927011468939, + "loss": 1.0825, + "step": 4490 + }, + { + "epoch": 0.4649721016738996, + "grad_norm": 0.8991280198097229, + "learning_rate": 0.00028167973565525325, + "loss": 0.9528, + "step": 4500 + }, + { + "epoch": 0.4660053730109527, + "grad_norm": 0.697215735912323, + "learning_rate": 0.00028086642955217666, + "loss": 0.8118, + "step": 4510 + }, + { + "epoch": 0.4670386443480058, + "grad_norm": 1.4364171028137207, + "learning_rate": 0.0002800527915820984, + "loss": 0.8903, + "step": 4520 + }, + { + "epoch": 0.4680719156850589, + "grad_norm": 0.9677620530128479, + "learning_rate": 0.00027923883049302067, + "loss": 0.9083, + "step": 4530 + }, + { + "epoch": 0.469105187022112, + "grad_norm": 1.240580439567566, + "learning_rate": 0.00027842455503641965, + "loss": 0.8643, + "step": 4540 + }, + { + "epoch": 0.4701384583591651, + "grad_norm": 1.102919101715088, + "learning_rate": 0.00027760997396715167, + "loss": 1.0802, + "step": 4550 + }, + { + "epoch": 0.47117172969621823, + "grad_norm": 0.921911895275116, + "learning_rate": 0.0002767950960433588, + "loss": 0.8717, + "step": 4560 + }, + { + "epoch": 0.47220500103327134, + "grad_norm": 1.3429372310638428, + "learning_rate": 0.000275979930026375, + "loss": 0.9584, + "step": 4570 + }, + { + "epoch": 0.47323827237032445, + "grad_norm": 1.0819973945617676, + "learning_rate": 0.0002751644846806315, + "loss": 0.7652, + "step": 4580 + }, + { + "epoch": 0.47427154370737756, + "grad_norm": 0.9730139970779419, + "learning_rate": 0.00027434876877356287, + "loss": 0.8873, + "step": 4590 + }, + { + "epoch": 0.4753048150444307, + "grad_norm": 0.9837844371795654, + "learning_rate": 0.00027353279107551276, + "loss": 0.9581, + "step": 4600 + }, + { + "epoch": 0.4763380863814838, + "grad_norm": 1.1134988069534302, + "learning_rate": 0.0002727165603596394, + "loss": 0.9409, + "step": 4610 + }, + { + "epoch": 0.4773713577185369, + "grad_norm": 0.7778913378715515, + "learning_rate": 0.00027190008540182135, + "loss": 0.9099, + "step": 4620 + }, + { + "epoch": 0.47840462905559, + "grad_norm": 1.166568636894226, + "learning_rate": 0.00027108337498056356, + "loss": 0.8565, + "step": 4630 + }, + { + "epoch": 0.4794379003926431, + "grad_norm": 0.7446126341819763, + "learning_rate": 0.0002702664378769021, + "loss": 0.8251, + "step": 4640 + }, + { + "epoch": 0.4804711717296962, + "grad_norm": 1.5371274948120117, + "learning_rate": 0.0002694492828743106, + "loss": 0.9475, + "step": 4650 + }, + { + "epoch": 0.48150444306674933, + "grad_norm": 0.7517269253730774, + "learning_rate": 0.0002686319187586054, + "loss": 0.8798, + "step": 4660 + }, + { + "epoch": 0.48253771440380244, + "grad_norm": 1.4352338314056396, + "learning_rate": 0.00026781435431785116, + "loss": 1.0462, + "step": 4670 + }, + { + "epoch": 0.48357098574085555, + "grad_norm": 1.1265496015548706, + "learning_rate": 0.0002669965983422663, + "loss": 0.9107, + "step": 4680 + }, + { + "epoch": 0.48460425707790866, + "grad_norm": 0.8634279370307922, + "learning_rate": 0.00026617865962412865, + "loss": 0.8822, + "step": 4690 + }, + { + "epoch": 0.48563752841496177, + "grad_norm": 1.0112096071243286, + "learning_rate": 0.0002653605469576809, + "loss": 0.8842, + "step": 4700 + }, + { + "epoch": 0.4866707997520149, + "grad_norm": 0.8251860737800598, + "learning_rate": 0.0002645422691390358, + "loss": 0.908, + "step": 4710 + }, + { + "epoch": 0.487704071089068, + "grad_norm": 0.7483717203140259, + "learning_rate": 0.0002637238349660819, + "loss": 0.8912, + "step": 4720 + }, + { + "epoch": 0.4887373424261211, + "grad_norm": 0.8353511691093445, + "learning_rate": 0.0002629052532383888, + "loss": 0.7702, + "step": 4730 + }, + { + "epoch": 0.4897706137631742, + "grad_norm": 0.9910038113594055, + "learning_rate": 0.00026208653275711265, + "loss": 0.8797, + "step": 4740 + }, + { + "epoch": 0.4908038851002273, + "grad_norm": 1.1162399053573608, + "learning_rate": 0.00026126768232490115, + "loss": 0.9161, + "step": 4750 + }, + { + "epoch": 0.49183715643728043, + "grad_norm": 0.8416900038719177, + "learning_rate": 0.00026044871074579955, + "loss": 1.0406, + "step": 4760 + }, + { + "epoch": 0.49287042777433354, + "grad_norm": 1.2481132745742798, + "learning_rate": 0.0002596296268251556, + "loss": 1.0545, + "step": 4770 + }, + { + "epoch": 0.49390369911138665, + "grad_norm": 1.115768551826477, + "learning_rate": 0.0002588104393695245, + "loss": 0.9128, + "step": 4780 + }, + { + "epoch": 0.49493697044843976, + "grad_norm": 0.7396090030670166, + "learning_rate": 0.0002579911571865752, + "loss": 1.0822, + "step": 4790 + }, + { + "epoch": 0.49597024178549287, + "grad_norm": 0.9519025087356567, + "learning_rate": 0.0002571717890849946, + "loss": 0.8659, + "step": 4800 + }, + { + "epoch": 0.497003513122546, + "grad_norm": 0.7343184947967529, + "learning_rate": 0.0002563523438743939, + "loss": 0.8773, + "step": 4810 + }, + { + "epoch": 0.4980367844595991, + "grad_norm": 1.4175959825515747, + "learning_rate": 0.0002555328303652129, + "loss": 0.7956, + "step": 4820 + }, + { + "epoch": 0.4990700557966522, + "grad_norm": 1.0520236492156982, + "learning_rate": 0.000254713257368626, + "loss": 0.9604, + "step": 4830 + }, + { + "epoch": 0.5001033271337053, + "grad_norm": 0.7733720541000366, + "learning_rate": 0.0002538936336964471, + "loss": 0.7715, + "step": 4840 + }, + { + "epoch": 0.5011365984707584, + "grad_norm": 1.4437015056610107, + "learning_rate": 0.0002530739681610349, + "loss": 0.9329, + "step": 4850 + }, + { + "epoch": 0.5021698698078115, + "grad_norm": 0.892091691493988, + "learning_rate": 0.00025225426957519825, + "loss": 0.8434, + "step": 4860 + }, + { + "epoch": 0.5032031411448646, + "grad_norm": 0.9304814338684082, + "learning_rate": 0.0002514345467521014, + "loss": 0.8912, + "step": 4870 + }, + { + "epoch": 0.5042364124819178, + "grad_norm": 1.063408374786377, + "learning_rate": 0.00025061480850516914, + "loss": 0.9901, + "step": 4880 + }, + { + "epoch": 0.5052696838189709, + "grad_norm": 1.0481808185577393, + "learning_rate": 0.00024979506364799207, + "loss": 0.89, + "step": 4890 + }, + { + "epoch": 0.506302955156024, + "grad_norm": 1.2767094373703003, + "learning_rate": 0.000248975320994232, + "loss": 1.1239, + "step": 4900 + }, + { + "epoch": 0.5073362264930771, + "grad_norm": 1.0354382991790771, + "learning_rate": 0.00024815558935752677, + "loss": 0.8866, + "step": 4910 + }, + { + "epoch": 0.5083694978301302, + "grad_norm": 0.7483660578727722, + "learning_rate": 0.0002473358775513959, + "loss": 0.8817, + "step": 4920 + }, + { + "epoch": 0.5094027691671833, + "grad_norm": 0.6301681399345398, + "learning_rate": 0.000246516194389146, + "loss": 0.8991, + "step": 4930 + }, + { + "epoch": 0.5104360405042364, + "grad_norm": 1.0671608448028564, + "learning_rate": 0.0002456965486837752, + "loss": 0.7891, + "step": 4940 + }, + { + "epoch": 0.5114693118412895, + "grad_norm": 1.3131459951400757, + "learning_rate": 0.00024487694924787935, + "loss": 0.9388, + "step": 4950 + }, + { + "epoch": 0.5125025831783426, + "grad_norm": 1.2082393169403076, + "learning_rate": 0.00024405740489355634, + "loss": 0.8558, + "step": 4960 + }, + { + "epoch": 0.5135358545153957, + "grad_norm": 0.915107786655426, + "learning_rate": 0.0002432379244323124, + "loss": 0.8703, + "step": 4970 + }, + { + "epoch": 0.5145691258524488, + "grad_norm": 1.2518354654312134, + "learning_rate": 0.00024241851667496645, + "loss": 0.9959, + "step": 4980 + }, + { + "epoch": 0.515602397189502, + "grad_norm": 1.0495281219482422, + "learning_rate": 0.00024159919043155556, + "loss": 1.0497, + "step": 4990 + }, + { + "epoch": 0.5166356685265551, + "grad_norm": 1.469854712486267, + "learning_rate": 0.0002407799545112409, + "loss": 0.7946, + "step": 5000 + }, + { + "epoch": 0.5176689398636082, + "grad_norm": 0.7165640592575073, + "learning_rate": 0.0002399608177222122, + "loss": 0.7768, + "step": 5010 + }, + { + "epoch": 0.5187022112006613, + "grad_norm": 0.7148188352584839, + "learning_rate": 0.00023914178887159327, + "loss": 0.8901, + "step": 5020 + }, + { + "epoch": 0.5197354825377144, + "grad_norm": 1.2880741357803345, + "learning_rate": 0.0002383228767653474, + "loss": 1.0241, + "step": 5030 + }, + { + "epoch": 0.5207687538747675, + "grad_norm": 1.3181527853012085, + "learning_rate": 0.0002375040902081831, + "loss": 0.9248, + "step": 5040 + }, + { + "epoch": 0.5218020252118206, + "grad_norm": 1.5515258312225342, + "learning_rate": 0.00023668543800345852, + "loss": 0.8286, + "step": 5050 + }, + { + "epoch": 0.5228352965488737, + "grad_norm": 1.0438510179519653, + "learning_rate": 0.0002358669289530875, + "loss": 0.7546, + "step": 5060 + }, + { + "epoch": 0.5238685678859268, + "grad_norm": 1.179828405380249, + "learning_rate": 0.00023504857185744453, + "loss": 0.8099, + "step": 5070 + }, + { + "epoch": 0.52490183922298, + "grad_norm": 1.0748212337493896, + "learning_rate": 0.00023423037551527085, + "loss": 0.8692, + "step": 5080 + }, + { + "epoch": 0.5259351105600331, + "grad_norm": 0.8671577572822571, + "learning_rate": 0.00023341234872357872, + "loss": 0.8423, + "step": 5090 + }, + { + "epoch": 0.5269683818970862, + "grad_norm": 1.147182583808899, + "learning_rate": 0.00023259450027755764, + "loss": 0.8823, + "step": 5100 + }, + { + "epoch": 0.5280016532341393, + "grad_norm": 1.0983150005340576, + "learning_rate": 0.0002317768389704799, + "loss": 0.879, + "step": 5110 + }, + { + "epoch": 0.5290349245711924, + "grad_norm": 1.5440839529037476, + "learning_rate": 0.00023095937359360525, + "loss": 0.9303, + "step": 5120 + }, + { + "epoch": 0.5300681959082455, + "grad_norm": 1.0175645351409912, + "learning_rate": 0.00023014211293608718, + "loss": 0.8823, + "step": 5130 + }, + { + "epoch": 0.5311014672452986, + "grad_norm": 1.1853641271591187, + "learning_rate": 0.00022932506578487782, + "loss": 0.9467, + "step": 5140 + }, + { + "epoch": 0.5321347385823517, + "grad_norm": 0.9132509827613831, + "learning_rate": 0.00022850824092463413, + "loss": 0.7992, + "step": 5150 + }, + { + "epoch": 0.5331680099194048, + "grad_norm": 0.8958359360694885, + "learning_rate": 0.0002276916471376227, + "loss": 0.9003, + "step": 5160 + }, + { + "epoch": 0.5342012812564579, + "grad_norm": 0.9055427312850952, + "learning_rate": 0.00022687529320362587, + "loss": 0.8922, + "step": 5170 + }, + { + "epoch": 0.535234552593511, + "grad_norm": 1.2413357496261597, + "learning_rate": 0.00022605918789984707, + "loss": 0.8021, + "step": 5180 + }, + { + "epoch": 0.5362678239305642, + "grad_norm": 1.096956729888916, + "learning_rate": 0.00022524334000081664, + "loss": 0.7335, + "step": 5190 + }, + { + "epoch": 0.5373010952676173, + "grad_norm": 1.0181100368499756, + "learning_rate": 0.00022442775827829722, + "loss": 0.9217, + "step": 5200 + }, + { + "epoch": 0.5383343666046704, + "grad_norm": 0.9444778561592102, + "learning_rate": 0.0002236124515011897, + "loss": 0.8444, + "step": 5210 + }, + { + "epoch": 0.5393676379417235, + "grad_norm": 0.9442963004112244, + "learning_rate": 0.00022279742843543887, + "loss": 0.7901, + "step": 5220 + }, + { + "epoch": 0.5404009092787766, + "grad_norm": 1.3280583620071411, + "learning_rate": 0.00022198269784393904, + "loss": 0.9, + "step": 5230 + }, + { + "epoch": 0.5414341806158297, + "grad_norm": 0.7821478843688965, + "learning_rate": 0.00022116826848643994, + "loss": 0.8782, + "step": 5240 + }, + { + "epoch": 0.5424674519528828, + "grad_norm": 0.942866861820221, + "learning_rate": 0.00022035414911945252, + "loss": 0.8268, + "step": 5250 + }, + { + "epoch": 0.5435007232899359, + "grad_norm": 1.0075554847717285, + "learning_rate": 0.0002195403484961549, + "loss": 0.7923, + "step": 5260 + }, + { + "epoch": 0.544533994626989, + "grad_norm": 1.0974137783050537, + "learning_rate": 0.00021872687536629806, + "loss": 0.8108, + "step": 5270 + }, + { + "epoch": 0.5455672659640421, + "grad_norm": 0.8785361647605896, + "learning_rate": 0.00021791373847611186, + "loss": 0.8236, + "step": 5280 + }, + { + "epoch": 0.5466005373010953, + "grad_norm": 1.300614356994629, + "learning_rate": 0.00021710094656821098, + "loss": 0.9875, + "step": 5290 + }, + { + "epoch": 0.5476338086381484, + "grad_norm": 0.9605757594108582, + "learning_rate": 0.0002162885083815011, + "loss": 0.8824, + "step": 5300 + }, + { + "epoch": 0.5486670799752015, + "grad_norm": 0.8986537456512451, + "learning_rate": 0.0002154764326510847, + "loss": 0.9041, + "step": 5310 + }, + { + "epoch": 0.5497003513122546, + "grad_norm": 0.8150350451469421, + "learning_rate": 0.0002146647281081671, + "loss": 0.8553, + "step": 5320 + }, + { + "epoch": 0.5507336226493077, + "grad_norm": 1.396665334701538, + "learning_rate": 0.00021385340347996297, + "loss": 0.9266, + "step": 5330 + }, + { + "epoch": 0.5517668939863608, + "grad_norm": 1.1270840167999268, + "learning_rate": 0.00021304246748960208, + "loss": 0.8426, + "step": 5340 + }, + { + "epoch": 0.5528001653234139, + "grad_norm": 1.4210362434387207, + "learning_rate": 0.00021223192885603571, + "loss": 0.9085, + "step": 5350 + }, + { + "epoch": 0.553833436660467, + "grad_norm": 0.9129871726036072, + "learning_rate": 0.00021142179629394283, + "loss": 0.8978, + "step": 5360 + }, + { + "epoch": 0.5548667079975201, + "grad_norm": 1.0395097732543945, + "learning_rate": 0.00021061207851363661, + "loss": 0.9224, + "step": 5370 + }, + { + "epoch": 0.5558999793345732, + "grad_norm": 0.5978155136108398, + "learning_rate": 0.00020980278422097045, + "loss": 0.789, + "step": 5380 + }, + { + "epoch": 0.5569332506716264, + "grad_norm": 0.8347190022468567, + "learning_rate": 0.0002089939221172446, + "loss": 0.8651, + "step": 5390 + }, + { + "epoch": 0.5579665220086795, + "grad_norm": 1.0471965074539185, + "learning_rate": 0.00020818550089911234, + "loss": 0.9259, + "step": 5400 + }, + { + "epoch": 0.5589997933457326, + "grad_norm": 0.8594735264778137, + "learning_rate": 0.00020737752925848707, + "loss": 0.7926, + "step": 5410 + }, + { + "epoch": 0.5600330646827857, + "grad_norm": 0.9445158243179321, + "learning_rate": 0.00020657001588244816, + "loss": 0.9561, + "step": 5420 + }, + { + "epoch": 0.5610663360198388, + "grad_norm": 0.7682790160179138, + "learning_rate": 0.00020576296945314787, + "loss": 0.8636, + "step": 5430 + }, + { + "epoch": 0.5620996073568919, + "grad_norm": 0.8511012196540833, + "learning_rate": 0.00020495639864771814, + "loss": 0.877, + "step": 5440 + }, + { + "epoch": 0.563132878693945, + "grad_norm": 1.3375914096832275, + "learning_rate": 0.00020415031213817704, + "loss": 0.8865, + "step": 5450 + }, + { + "epoch": 0.5641661500309981, + "grad_norm": 1.4135067462921143, + "learning_rate": 0.0002033447185913356, + "loss": 0.8816, + "step": 5460 + }, + { + "epoch": 0.5651994213680512, + "grad_norm": 1.0292867422103882, + "learning_rate": 0.0002025396266687045, + "loss": 0.7509, + "step": 5470 + }, + { + "epoch": 0.5662326927051043, + "grad_norm": 0.8321288228034973, + "learning_rate": 0.00020173504502640164, + "loss": 0.8376, + "step": 5480 + }, + { + "epoch": 0.5672659640421575, + "grad_norm": 0.7995291948318481, + "learning_rate": 0.000200930982315058, + "loss": 0.822, + "step": 5490 + }, + { + "epoch": 0.5682992353792106, + "grad_norm": 0.7237703800201416, + "learning_rate": 0.0002001274471797252, + "loss": 0.9819, + "step": 5500 + }, + { + "epoch": 0.5693325067162637, + "grad_norm": 1.2666767835617065, + "learning_rate": 0.0001993244482597829, + "loss": 0.7477, + "step": 5510 + }, + { + "epoch": 0.5703657780533168, + "grad_norm": 1.1628350019454956, + "learning_rate": 0.00019852199418884527, + "loss": 0.7239, + "step": 5520 + }, + { + "epoch": 0.5713990493903699, + "grad_norm": 1.1754180192947388, + "learning_rate": 0.00019772009359466828, + "loss": 0.8326, + "step": 5530 + }, + { + "epoch": 0.572432320727423, + "grad_norm": 0.9641863703727722, + "learning_rate": 0.00019691875509905718, + "loss": 0.779, + "step": 5540 + }, + { + "epoch": 0.5734655920644761, + "grad_norm": 1.4042216539382935, + "learning_rate": 0.00019611798731777397, + "loss": 0.8895, + "step": 5550 + }, + { + "epoch": 0.5744988634015292, + "grad_norm": 0.8444200158119202, + "learning_rate": 0.00019531779886044424, + "loss": 0.8636, + "step": 5560 + }, + { + "epoch": 0.5755321347385823, + "grad_norm": 0.8476575016975403, + "learning_rate": 0.0001945181983304647, + "loss": 0.802, + "step": 5570 + }, + { + "epoch": 0.5765654060756354, + "grad_norm": 1.086524486541748, + "learning_rate": 0.00019371919432491108, + "loss": 0.8356, + "step": 5580 + }, + { + "epoch": 0.5775986774126886, + "grad_norm": 0.8224913477897644, + "learning_rate": 0.00019292079543444557, + "loss": 0.8772, + "step": 5590 + }, + { + "epoch": 0.5786319487497417, + "grad_norm": 0.8469100594520569, + "learning_rate": 0.00019212301024322396, + "loss": 0.883, + "step": 5600 + }, + { + "epoch": 0.5796652200867948, + "grad_norm": 0.6162629127502441, + "learning_rate": 0.00019132584732880383, + "loss": 0.7449, + "step": 5610 + }, + { + "epoch": 0.5806984914238479, + "grad_norm": 1.1471900939941406, + "learning_rate": 0.00019052931526205267, + "loss": 0.8848, + "step": 5620 + }, + { + "epoch": 0.581731762760901, + "grad_norm": 1.4094953536987305, + "learning_rate": 0.00018973342260705458, + "loss": 0.7942, + "step": 5630 + }, + { + "epoch": 0.5827650340979541, + "grad_norm": 0.8406280279159546, + "learning_rate": 0.00018893817792101935, + "loss": 0.8205, + "step": 5640 + }, + { + "epoch": 0.5837983054350072, + "grad_norm": 1.5820422172546387, + "learning_rate": 0.00018814358975418984, + "loss": 0.733, + "step": 5650 + }, + { + "epoch": 0.5848315767720603, + "grad_norm": 1.0317678451538086, + "learning_rate": 0.00018734966664975023, + "loss": 0.8889, + "step": 5660 + }, + { + "epoch": 0.5858648481091134, + "grad_norm": 1.1359983682632446, + "learning_rate": 0.00018655641714373413, + "loss": 0.897, + "step": 5670 + }, + { + "epoch": 0.5868981194461665, + "grad_norm": 1.0122281312942505, + "learning_rate": 0.00018576384976493282, + "loss": 0.7574, + "step": 5680 + }, + { + "epoch": 0.5879313907832197, + "grad_norm": 0.9288797378540039, + "learning_rate": 0.00018497197303480348, + "loss": 0.7553, + "step": 5690 + }, + { + "epoch": 0.5889646621202728, + "grad_norm": 0.8775319457054138, + "learning_rate": 0.00018418079546737783, + "loss": 0.74, + "step": 5700 + }, + { + "epoch": 0.5899979334573259, + "grad_norm": 1.075302243232727, + "learning_rate": 0.0001833903255691702, + "loss": 0.8761, + "step": 5710 + }, + { + "epoch": 0.591031204794379, + "grad_norm": 1.0737923383712769, + "learning_rate": 0.00018260057183908625, + "loss": 0.8817, + "step": 5720 + }, + { + "epoch": 0.5920644761314321, + "grad_norm": 1.1469508409500122, + "learning_rate": 0.00018181154276833178, + "loss": 0.7806, + "step": 5730 + }, + { + "epoch": 0.5930977474684852, + "grad_norm": 0.8804445266723633, + "learning_rate": 0.00018102324684032115, + "loss": 0.7683, + "step": 5740 + }, + { + "epoch": 0.5941310188055383, + "grad_norm": 0.9593983888626099, + "learning_rate": 0.00018023569253058613, + "loss": 0.8427, + "step": 5750 + }, + { + "epoch": 0.5951642901425914, + "grad_norm": 0.7528694272041321, + "learning_rate": 0.00017944888830668488, + "loss": 0.8697, + "step": 5760 + }, + { + "epoch": 0.5961975614796445, + "grad_norm": 1.0688844919204712, + "learning_rate": 0.00017866284262811093, + "loss": 0.779, + "step": 5770 + }, + { + "epoch": 0.5972308328166976, + "grad_norm": 1.181410312652588, + "learning_rate": 0.00017787756394620202, + "loss": 0.8387, + "step": 5780 + }, + { + "epoch": 0.5982641041537508, + "grad_norm": 0.7557913064956665, + "learning_rate": 0.0001770930607040494, + "loss": 0.7897, + "step": 5790 + }, + { + "epoch": 0.5992973754908039, + "grad_norm": 0.9230046272277832, + "learning_rate": 0.00017630934133640704, + "loss": 0.7056, + "step": 5800 + }, + { + "epoch": 0.600330646827857, + "grad_norm": 0.839525043964386, + "learning_rate": 0.00017552641426960092, + "loss": 0.8133, + "step": 5810 + }, + { + "epoch": 0.6013639181649101, + "grad_norm": 1.102609395980835, + "learning_rate": 0.0001747442879214384, + "loss": 0.884, + "step": 5820 + }, + { + "epoch": 0.6023971895019632, + "grad_norm": 0.8469570875167847, + "learning_rate": 0.0001739629707011177, + "loss": 0.7398, + "step": 5830 + }, + { + "epoch": 0.6034304608390163, + "grad_norm": 1.2042063474655151, + "learning_rate": 0.0001731824710091377, + "loss": 0.8074, + "step": 5840 + }, + { + "epoch": 0.6044637321760694, + "grad_norm": 2.6056320667266846, + "learning_rate": 0.00017240279723720732, + "loss": 0.8092, + "step": 5850 + }, + { + "epoch": 0.6054970035131225, + "grad_norm": 0.788226306438446, + "learning_rate": 0.00017162395776815532, + "loss": 0.8346, + "step": 5860 + }, + { + "epoch": 0.6065302748501756, + "grad_norm": 0.8313378095626831, + "learning_rate": 0.00017084596097584044, + "loss": 0.7632, + "step": 5870 + }, + { + "epoch": 0.6075635461872287, + "grad_norm": 0.9668551087379456, + "learning_rate": 0.00017006881522506123, + "loss": 0.7813, + "step": 5880 + }, + { + "epoch": 0.6085968175242819, + "grad_norm": 1.2980097532272339, + "learning_rate": 0.00016929252887146597, + "loss": 0.633, + "step": 5890 + }, + { + "epoch": 0.609630088861335, + "grad_norm": 1.296339511871338, + "learning_rate": 0.00016851711026146294, + "loss": 0.8067, + "step": 5900 + }, + { + "epoch": 0.6106633601983881, + "grad_norm": 0.8966313600540161, + "learning_rate": 0.00016774256773213075, + "loss": 0.8509, + "step": 5910 + }, + { + "epoch": 0.6116966315354412, + "grad_norm": 1.408347249031067, + "learning_rate": 0.00016696890961112867, + "loss": 0.6981, + "step": 5920 + }, + { + "epoch": 0.6127299028724943, + "grad_norm": 1.421396017074585, + "learning_rate": 0.00016619614421660702, + "loss": 0.7626, + "step": 5930 + }, + { + "epoch": 0.6137631742095474, + "grad_norm": 1.4539517164230347, + "learning_rate": 0.0001654242798571177, + "loss": 0.7558, + "step": 5940 + }, + { + "epoch": 0.6147964455466005, + "grad_norm": 1.0731654167175293, + "learning_rate": 0.0001646533248315252, + "loss": 0.7626, + "step": 5950 + }, + { + "epoch": 0.6158297168836536, + "grad_norm": 1.100475788116455, + "learning_rate": 0.00016388328742891677, + "loss": 0.7243, + "step": 5960 + }, + { + "epoch": 0.6168629882207067, + "grad_norm": 1.0175180435180664, + "learning_rate": 0.00016311417592851396, + "loss": 0.8071, + "step": 5970 + }, + { + "epoch": 0.6178962595577598, + "grad_norm": 0.9055405259132385, + "learning_rate": 0.00016234599859958287, + "loss": 0.7203, + "step": 5980 + }, + { + "epoch": 0.618929530894813, + "grad_norm": 1.0334464311599731, + "learning_rate": 0.00016157876370134617, + "loss": 0.706, + "step": 5990 + }, + { + "epoch": 0.6199628022318661, + "grad_norm": 1.1770230531692505, + "learning_rate": 0.00016081247948289353, + "loss": 0.7802, + "step": 6000 + }, + { + "epoch": 0.6209960735689192, + "grad_norm": 0.8190944790840149, + "learning_rate": 0.00016004715418309304, + "loss": 0.7762, + "step": 6010 + }, + { + "epoch": 0.6220293449059723, + "grad_norm": 1.0050560235977173, + "learning_rate": 0.0001592827960305029, + "loss": 0.7886, + "step": 6020 + }, + { + "epoch": 0.6230626162430254, + "grad_norm": 1.1464440822601318, + "learning_rate": 0.000158519413243283, + "loss": 0.8016, + "step": 6030 + }, + { + "epoch": 0.6240958875800785, + "grad_norm": 0.8896029591560364, + "learning_rate": 0.00015775701402910626, + "loss": 0.802, + "step": 6040 + }, + { + "epoch": 0.6251291589171316, + "grad_norm": 0.7307144403457642, + "learning_rate": 0.00015699560658507012, + "loss": 0.7755, + "step": 6050 + }, + { + "epoch": 0.6261624302541847, + "grad_norm": 0.767782986164093, + "learning_rate": 0.00015623519909760953, + "loss": 0.697, + "step": 6060 + }, + { + "epoch": 0.6271957015912378, + "grad_norm": 0.8861032724380493, + "learning_rate": 0.00015547579974240766, + "loss": 0.8005, + "step": 6070 + }, + { + "epoch": 0.6282289729282909, + "grad_norm": 0.9863024950027466, + "learning_rate": 0.00015471741668430863, + "loss": 0.8324, + "step": 6080 + }, + { + "epoch": 0.629262244265344, + "grad_norm": 0.8046024441719055, + "learning_rate": 0.0001539600580772296, + "loss": 0.8615, + "step": 6090 + }, + { + "epoch": 0.6302955156023972, + "grad_norm": 1.283008098602295, + "learning_rate": 0.0001532037320640734, + "loss": 0.8317, + "step": 6100 + }, + { + "epoch": 0.6313287869394503, + "grad_norm": 0.9851896166801453, + "learning_rate": 0.00015244844677664036, + "loss": 0.7516, + "step": 6110 + }, + { + "epoch": 0.6323620582765034, + "grad_norm": 1.4205191135406494, + "learning_rate": 0.00015169421033554137, + "loss": 0.7692, + "step": 6120 + }, + { + "epoch": 0.6333953296135565, + "grad_norm": 0.9991669058799744, + "learning_rate": 0.0001509410308501104, + "loss": 0.7453, + "step": 6130 + }, + { + "epoch": 0.6344286009506096, + "grad_norm": 0.723731279373169, + "learning_rate": 0.00015018891641831744, + "loss": 0.8026, + "step": 6140 + }, + { + "epoch": 0.6354618722876627, + "grad_norm": 0.5217877626419067, + "learning_rate": 0.00014943787512668118, + "loss": 0.7759, + "step": 6150 + }, + { + "epoch": 0.6364951436247158, + "grad_norm": 0.6816266179084778, + "learning_rate": 0.0001486879150501822, + "loss": 0.7659, + "step": 6160 + }, + { + "epoch": 0.6375284149617689, + "grad_norm": 0.5098093152046204, + "learning_rate": 0.00014793904425217648, + "loss": 0.8352, + "step": 6170 + }, + { + "epoch": 0.638561686298822, + "grad_norm": 1.0227922201156616, + "learning_rate": 0.00014719127078430795, + "loss": 0.8162, + "step": 6180 + }, + { + "epoch": 0.6395949576358751, + "grad_norm": 1.5966726541519165, + "learning_rate": 0.00014644460268642263, + "loss": 0.7707, + "step": 6190 + }, + { + "epoch": 0.6406282289729283, + "grad_norm": 0.6314913630485535, + "learning_rate": 0.00014569904798648176, + "loss": 0.7286, + "step": 6200 + }, + { + "epoch": 0.6416615003099814, + "grad_norm": 1.072977900505066, + "learning_rate": 0.00014495461470047594, + "loss": 0.7554, + "step": 6210 + }, + { + "epoch": 0.6426947716470345, + "grad_norm": 1.458894968032837, + "learning_rate": 0.0001442113108323383, + "loss": 0.8975, + "step": 6220 + }, + { + "epoch": 0.6437280429840876, + "grad_norm": 0.7281268239021301, + "learning_rate": 0.00014346914437385905, + "loss": 0.6558, + "step": 6230 + }, + { + "epoch": 0.6447613143211407, + "grad_norm": 0.9581575393676758, + "learning_rate": 0.00014272812330459893, + "loss": 0.7082, + "step": 6240 + }, + { + "epoch": 0.6457945856581938, + "grad_norm": 1.271925687789917, + "learning_rate": 0.00014198825559180432, + "loss": 0.8919, + "step": 6250 + }, + { + "epoch": 0.6468278569952469, + "grad_norm": 0.9335398077964783, + "learning_rate": 0.00014124954919032064, + "loss": 0.7847, + "step": 6260 + }, + { + "epoch": 0.6478611283323, + "grad_norm": 0.7705923318862915, + "learning_rate": 0.00014051201204250726, + "loss": 0.8197, + "step": 6270 + }, + { + "epoch": 0.6488943996693531, + "grad_norm": 1.433953881263733, + "learning_rate": 0.00013977565207815238, + "loss": 0.8028, + "step": 6280 + }, + { + "epoch": 0.6499276710064062, + "grad_norm": 1.4676005840301514, + "learning_rate": 0.0001390404772143872, + "loss": 0.6803, + "step": 6290 + }, + { + "epoch": 0.6509609423434594, + "grad_norm": 0.8155161738395691, + "learning_rate": 0.00013830649535560093, + "loss": 0.8868, + "step": 6300 + }, + { + "epoch": 0.6519942136805125, + "grad_norm": 0.799152672290802, + "learning_rate": 0.00013757371439335604, + "loss": 0.716, + "step": 6310 + }, + { + "epoch": 0.6530274850175656, + "grad_norm": 1.120835304260254, + "learning_rate": 0.0001368421422063036, + "loss": 0.7953, + "step": 6320 + }, + { + "epoch": 0.6540607563546187, + "grad_norm": 0.7950730919837952, + "learning_rate": 0.00013611178666009794, + "loss": 0.7389, + "step": 6330 + }, + { + "epoch": 0.6550940276916718, + "grad_norm": 0.9817484617233276, + "learning_rate": 0.0001353826556073125, + "loss": 0.7451, + "step": 6340 + }, + { + "epoch": 0.6561272990287249, + "grad_norm": 0.7330102324485779, + "learning_rate": 0.00013465475688735526, + "loss": 0.7386, + "step": 6350 + }, + { + "epoch": 0.657160570365778, + "grad_norm": 0.7877906560897827, + "learning_rate": 0.00013392809832638486, + "loss": 0.7596, + "step": 6360 + }, + { + "epoch": 0.6581938417028311, + "grad_norm": 0.6060128211975098, + "learning_rate": 0.00013320268773722553, + "loss": 0.6482, + "step": 6370 + }, + { + "epoch": 0.6592271130398842, + "grad_norm": 1.8573722839355469, + "learning_rate": 0.00013247853291928395, + "loss": 0.7416, + "step": 6380 + }, + { + "epoch": 0.6602603843769373, + "grad_norm": 0.8654054403305054, + "learning_rate": 0.00013175564165846532, + "loss": 0.7027, + "step": 6390 + }, + { + "epoch": 0.6612936557139905, + "grad_norm": 0.7953044772148132, + "learning_rate": 0.00013103402172708918, + "loss": 0.7331, + "step": 6400 + }, + { + "epoch": 0.6623269270510436, + "grad_norm": 0.8240858316421509, + "learning_rate": 0.0001303136808838061, + "loss": 0.7007, + "step": 6410 + }, + { + "epoch": 0.6633601983880967, + "grad_norm": 0.8028026819229126, + "learning_rate": 0.00012959462687351435, + "loss": 0.7626, + "step": 6420 + }, + { + "epoch": 0.6643934697251498, + "grad_norm": 1.4915801286697388, + "learning_rate": 0.0001288768674272765, + "loss": 0.8101, + "step": 6430 + }, + { + "epoch": 0.6654267410622029, + "grad_norm": 0.8312230110168457, + "learning_rate": 0.0001281604102622364, + "loss": 0.7186, + "step": 6440 + }, + { + "epoch": 0.666460012399256, + "grad_norm": 0.8355895280838013, + "learning_rate": 0.000127445263081536, + "loss": 0.756, + "step": 6450 + }, + { + "epoch": 0.6674932837363091, + "grad_norm": 0.7141729593276978, + "learning_rate": 0.0001267314335742327, + "loss": 0.707, + "step": 6460 + }, + { + "epoch": 0.6685265550733622, + "grad_norm": 1.2061463594436646, + "learning_rate": 0.000126018929415217, + "loss": 0.6433, + "step": 6470 + }, + { + "epoch": 0.6695598264104153, + "grad_norm": 0.9426934123039246, + "learning_rate": 0.00012530775826512916, + "loss": 0.7131, + "step": 6480 + }, + { + "epoch": 0.6705930977474684, + "grad_norm": 0.5171589255332947, + "learning_rate": 0.00012459792777027754, + "loss": 0.6755, + "step": 6490 + }, + { + "epoch": 0.6716263690845216, + "grad_norm": 0.813228189945221, + "learning_rate": 0.00012388944556255614, + "loss": 0.7517, + "step": 6500 + }, + { + "epoch": 0.6726596404215747, + "grad_norm": 1.0219289064407349, + "learning_rate": 0.0001231823192593625, + "loss": 0.647, + "step": 6510 + }, + { + "epoch": 0.6736929117586278, + "grad_norm": 1.7192949056625366, + "learning_rate": 0.00012247655646351597, + "loss": 0.7193, + "step": 6520 + }, + { + "epoch": 0.6747261830956809, + "grad_norm": 0.8978095054626465, + "learning_rate": 0.00012177216476317565, + "loss": 0.7004, + "step": 6530 + }, + { + "epoch": 0.675759454432734, + "grad_norm": 1.2175655364990234, + "learning_rate": 0.00012106915173175942, + "loss": 0.7798, + "step": 6540 + }, + { + "epoch": 0.6767927257697871, + "grad_norm": 0.7553932070732117, + "learning_rate": 0.00012036752492786171, + "loss": 0.7077, + "step": 6550 + }, + { + "epoch": 0.6778259971068402, + "grad_norm": 1.2669484615325928, + "learning_rate": 0.0001196672918951728, + "loss": 0.6207, + "step": 6560 + }, + { + "epoch": 0.6788592684438933, + "grad_norm": 1.0848954916000366, + "learning_rate": 0.00011896846016239752, + "loss": 0.7216, + "step": 6570 + }, + { + "epoch": 0.6798925397809464, + "grad_norm": 1.306950569152832, + "learning_rate": 0.00011827103724317426, + "loss": 0.8071, + "step": 6580 + }, + { + "epoch": 0.6809258111179995, + "grad_norm": 1.175700306892395, + "learning_rate": 0.00011757503063599436, + "loss": 0.7327, + "step": 6590 + }, + { + "epoch": 0.6819590824550527, + "grad_norm": 1.052412509918213, + "learning_rate": 0.00011688044782412124, + "loss": 0.7375, + "step": 6600 + }, + { + "epoch": 0.6829923537921058, + "grad_norm": 0.922924280166626, + "learning_rate": 0.00011618729627551037, + "loss": 0.7765, + "step": 6610 + }, + { + "epoch": 0.6840256251291589, + "grad_norm": 0.6624295711517334, + "learning_rate": 0.00011549558344272834, + "loss": 0.7365, + "step": 6620 + }, + { + "epoch": 0.685058896466212, + "grad_norm": 0.9047833681106567, + "learning_rate": 0.00011480531676287322, + "loss": 0.7549, + "step": 6630 + }, + { + "epoch": 0.6860921678032651, + "grad_norm": 0.8293854594230652, + "learning_rate": 0.00011411650365749454, + "loss": 0.7531, + "step": 6640 + }, + { + "epoch": 0.6871254391403182, + "grad_norm": 0.865106463432312, + "learning_rate": 0.00011342915153251332, + "loss": 0.6403, + "step": 6650 + }, + { + "epoch": 0.6881587104773713, + "grad_norm": 1.0484962463378906, + "learning_rate": 0.0001127432677781425, + "loss": 0.7709, + "step": 6660 + }, + { + "epoch": 0.6891919818144244, + "grad_norm": 0.9296818375587463, + "learning_rate": 0.00011205885976880753, + "loss": 0.6943, + "step": 6670 + }, + { + "epoch": 0.6902252531514775, + "grad_norm": 0.9692133069038391, + "learning_rate": 0.00011137593486306727, + "loss": 0.6684, + "step": 6680 + }, + { + "epoch": 0.6912585244885306, + "grad_norm": 1.2010059356689453, + "learning_rate": 0.00011069450040353448, + "loss": 0.8146, + "step": 6690 + }, + { + "epoch": 0.6922917958255838, + "grad_norm": 0.8989507555961609, + "learning_rate": 0.00011001456371679707, + "loss": 0.6553, + "step": 6700 + }, + { + "epoch": 0.6933250671626369, + "grad_norm": 1.166821837425232, + "learning_rate": 0.00010933613211333943, + "loss": 0.76, + "step": 6710 + }, + { + "epoch": 0.69435833849969, + "grad_norm": 0.8655950427055359, + "learning_rate": 0.00010865921288746362, + "loss": 0.7237, + "step": 6720 + }, + { + "epoch": 0.6953916098367431, + "grad_norm": 1.0120173692703247, + "learning_rate": 0.00010798381331721108, + "loss": 0.8003, + "step": 6730 + }, + { + "epoch": 0.6964248811737962, + "grad_norm": 1.4091392755508423, + "learning_rate": 0.0001073099406642844, + "loss": 0.8101, + "step": 6740 + }, + { + "epoch": 0.6974581525108493, + "grad_norm": 0.8245342969894409, + "learning_rate": 0.00010663760217396906, + "loss": 0.6408, + "step": 6750 + }, + { + "epoch": 0.6984914238479024, + "grad_norm": 0.7788679003715515, + "learning_rate": 0.00010596680507505596, + "loss": 0.7559, + "step": 6760 + }, + { + "epoch": 0.6995246951849555, + "grad_norm": 0.5751708745956421, + "learning_rate": 0.00010529755657976315, + "loss": 0.6268, + "step": 6770 + }, + { + "epoch": 0.7005579665220086, + "grad_norm": 0.9208303689956665, + "learning_rate": 0.00010462986388365837, + "loss": 0.7369, + "step": 6780 + }, + { + "epoch": 0.7015912378590617, + "grad_norm": 0.8211421966552734, + "learning_rate": 0.00010396373416558233, + "loss": 0.7131, + "step": 6790 + }, + { + "epoch": 0.7026245091961149, + "grad_norm": 0.6540753245353699, + "learning_rate": 0.00010329917458757068, + "loss": 0.7293, + "step": 6800 + }, + { + "epoch": 0.703657780533168, + "grad_norm": 1.3494755029678345, + "learning_rate": 0.0001026361922947775, + "loss": 0.6745, + "step": 6810 + }, + { + "epoch": 0.7046910518702211, + "grad_norm": 0.8039237856864929, + "learning_rate": 0.00010197479441539827, + "loss": 0.6993, + "step": 6820 + }, + { + "epoch": 0.7057243232072742, + "grad_norm": 0.8956775069236755, + "learning_rate": 0.00010131498806059352, + "loss": 0.7667, + "step": 6830 + }, + { + "epoch": 0.7067575945443273, + "grad_norm": 0.7461487650871277, + "learning_rate": 0.00010065678032441208, + "loss": 0.6487, + "step": 6840 + }, + { + "epoch": 0.7077908658813804, + "grad_norm": 1.2220842838287354, + "learning_rate": 0.00010000017828371458, + "loss": 0.6455, + "step": 6850 + }, + { + "epoch": 0.7088241372184335, + "grad_norm": 0.9173330068588257, + "learning_rate": 9.934518899809799e-05, + "loss": 0.6879, + "step": 6860 + }, + { + "epoch": 0.7098574085554866, + "grad_norm": 0.6598500609397888, + "learning_rate": 9.869181950981948e-05, + "loss": 0.7175, + "step": 6870 + }, + { + "epoch": 0.7108906798925397, + "grad_norm": 1.0425814390182495, + "learning_rate": 9.804007684372038e-05, + "loss": 0.7662, + "step": 6880 + }, + { + "epoch": 0.7119239512295928, + "grad_norm": 1.0084232091903687, + "learning_rate": 9.738996800715094e-05, + "loss": 0.7261, + "step": 6890 + }, + { + "epoch": 0.712957222566646, + "grad_norm": 0.9493532180786133, + "learning_rate": 9.674149998989523e-05, + "loss": 0.7575, + "step": 6900 + }, + { + "epoch": 0.7139904939036991, + "grad_norm": 0.6258692741394043, + "learning_rate": 9.609467976409525e-05, + "loss": 0.6136, + "step": 6910 + }, + { + "epoch": 0.7150237652407522, + "grad_norm": 0.6811854243278503, + "learning_rate": 9.544951428417667e-05, + "loss": 0.8021, + "step": 6920 + }, + { + "epoch": 0.7160570365778053, + "grad_norm": 0.8507707118988037, + "learning_rate": 9.480601048677371e-05, + "loss": 0.644, + "step": 6930 + }, + { + "epoch": 0.7170903079148584, + "grad_norm": 1.1403719186782837, + "learning_rate": 9.416417529065488e-05, + "loss": 0.7257, + "step": 6940 + }, + { + "epoch": 0.7181235792519115, + "grad_norm": 0.9672065377235413, + "learning_rate": 9.352401559664817e-05, + "loss": 0.6545, + "step": 6950 + }, + { + "epoch": 0.7191568505889646, + "grad_norm": 1.221876859664917, + "learning_rate": 9.288553828756702e-05, + "loss": 0.6231, + "step": 6960 + }, + { + "epoch": 0.7201901219260177, + "grad_norm": 0.8762854337692261, + "learning_rate": 9.224875022813645e-05, + "loss": 0.7172, + "step": 6970 + }, + { + "epoch": 0.7212233932630708, + "grad_norm": 0.811069667339325, + "learning_rate": 9.161365826491913e-05, + "loss": 0.6319, + "step": 6980 + }, + { + "epoch": 0.7222566646001239, + "grad_norm": 1.3847732543945312, + "learning_rate": 9.098026922624175e-05, + "loss": 0.8109, + "step": 6990 + }, + { + "epoch": 0.723289935937177, + "grad_norm": 1.142921805381775, + "learning_rate": 9.03485899221216e-05, + "loss": 0.7571, + "step": 7000 + }, + { + "epoch": 0.7243232072742302, + "grad_norm": 0.5236563086509705, + "learning_rate": 8.971862714419368e-05, + "loss": 0.6592, + "step": 7010 + }, + { + "epoch": 0.7253564786112833, + "grad_norm": 1.2576050758361816, + "learning_rate": 8.909038766563703e-05, + "loss": 0.7714, + "step": 7020 + }, + { + "epoch": 0.7263897499483364, + "grad_norm": 0.9121869802474976, + "learning_rate": 8.846387824110247e-05, + "loss": 0.8312, + "step": 7030 + }, + { + "epoch": 0.7274230212853895, + "grad_norm": 1.3254677057266235, + "learning_rate": 8.783910560663968e-05, + "loss": 0.7242, + "step": 7040 + }, + { + "epoch": 0.7284562926224426, + "grad_norm": 1.342081904411316, + "learning_rate": 8.721607647962496e-05, + "loss": 0.7367, + "step": 7050 + }, + { + "epoch": 0.7294895639594957, + "grad_norm": 1.3918049335479736, + "learning_rate": 8.659479755868882e-05, + "loss": 0.8162, + "step": 7060 + }, + { + "epoch": 0.7305228352965488, + "grad_norm": 0.9307500720024109, + "learning_rate": 8.597527552364415e-05, + "loss": 0.6592, + "step": 7070 + }, + { + "epoch": 0.7315561066336019, + "grad_norm": 0.958733081817627, + "learning_rate": 8.535751703541417e-05, + "loss": 0.7096, + "step": 7080 + }, + { + "epoch": 0.732589377970655, + "grad_norm": 0.6930942535400391, + "learning_rate": 8.474152873596122e-05, + "loss": 0.6513, + "step": 7090 + }, + { + "epoch": 0.7336226493077082, + "grad_norm": 1.3499219417572021, + "learning_rate": 8.412731724821482e-05, + "loss": 0.7495, + "step": 7100 + }, + { + "epoch": 0.7346559206447613, + "grad_norm": 0.9743627905845642, + "learning_rate": 8.351488917600077e-05, + "loss": 0.6989, + "step": 7110 + }, + { + "epoch": 0.7356891919818144, + "grad_norm": 0.8228883147239685, + "learning_rate": 8.290425110397018e-05, + "loss": 0.7556, + "step": 7120 + }, + { + "epoch": 0.7367224633188675, + "grad_norm": 0.9745365381240845, + "learning_rate": 8.229540959752851e-05, + "loss": 0.6756, + "step": 7130 + }, + { + "epoch": 0.7377557346559206, + "grad_norm": 0.9700762629508972, + "learning_rate": 8.168837120276512e-05, + "loss": 0.7298, + "step": 7140 + }, + { + "epoch": 0.7387890059929737, + "grad_norm": 1.2264478206634521, + "learning_rate": 8.108314244638267e-05, + "loss": 0.7307, + "step": 7150 + }, + { + "epoch": 0.7398222773300268, + "grad_norm": 1.1239992380142212, + "learning_rate": 8.047972983562748e-05, + "loss": 0.7043, + "step": 7160 + }, + { + "epoch": 0.7408555486670799, + "grad_norm": 0.8346306681632996, + "learning_rate": 7.987813985821882e-05, + "loss": 0.7194, + "step": 7170 + }, + { + "epoch": 0.741888820004133, + "grad_norm": 0.8321127891540527, + "learning_rate": 7.927837898227966e-05, + "loss": 0.7095, + "step": 7180 + }, + { + "epoch": 0.7429220913411861, + "grad_norm": 0.9774494767189026, + "learning_rate": 7.868045365626702e-05, + "loss": 0.7328, + "step": 7190 + }, + { + "epoch": 0.7439553626782393, + "grad_norm": 0.8818146586418152, + "learning_rate": 7.808437030890258e-05, + "loss": 0.6084, + "step": 7200 + }, + { + "epoch": 0.7449886340152924, + "grad_norm": 0.6830877661705017, + "learning_rate": 7.749013534910362e-05, + "loss": 0.6242, + "step": 7210 + }, + { + "epoch": 0.7460219053523455, + "grad_norm": 0.7373770475387573, + "learning_rate": 7.689775516591397e-05, + "loss": 0.6592, + "step": 7220 + }, + { + "epoch": 0.7470551766893986, + "grad_norm": 1.5116766691207886, + "learning_rate": 7.63072361284357e-05, + "loss": 0.7781, + "step": 7230 + }, + { + "epoch": 0.7480884480264517, + "grad_norm": 1.11928129196167, + "learning_rate": 7.571858458576017e-05, + "loss": 0.8028, + "step": 7240 + }, + { + "epoch": 0.7491217193635048, + "grad_norm": 0.5922040939331055, + "learning_rate": 7.513180686690002e-05, + "loss": 0.6601, + "step": 7250 + }, + { + "epoch": 0.750154990700558, + "grad_norm": 0.9148753881454468, + "learning_rate": 7.454690928072088e-05, + "loss": 0.7152, + "step": 7260 + }, + { + "epoch": 0.7511882620376111, + "grad_norm": 0.7024368047714233, + "learning_rate": 7.396389811587415e-05, + "loss": 0.6906, + "step": 7270 + }, + { + "epoch": 0.7522215333746642, + "grad_norm": 1.3108892440795898, + "learning_rate": 7.338277964072864e-05, + "loss": 0.7471, + "step": 7280 + }, + { + "epoch": 0.7532548047117174, + "grad_norm": 0.6525394320487976, + "learning_rate": 7.280356010330361e-05, + "loss": 0.7364, + "step": 7290 + }, + { + "epoch": 0.7542880760487705, + "grad_norm": 1.344499111175537, + "learning_rate": 7.222624573120137e-05, + "loss": 0.7023, + "step": 7300 + }, + { + "epoch": 0.7553213473858236, + "grad_norm": 0.7209261059761047, + "learning_rate": 7.165084273154074e-05, + "loss": 0.6748, + "step": 7310 + }, + { + "epoch": 0.7563546187228767, + "grad_norm": 1.4905532598495483, + "learning_rate": 7.10773572908898e-05, + "loss": 0.6376, + "step": 7320 + }, + { + "epoch": 0.7573878900599298, + "grad_norm": 0.4700772762298584, + "learning_rate": 7.050579557519938e-05, + "loss": 0.6694, + "step": 7330 + }, + { + "epoch": 0.7584211613969829, + "grad_norm": 0.6527320146560669, + "learning_rate": 6.993616372973739e-05, + "loss": 0.6764, + "step": 7340 + }, + { + "epoch": 0.759454432734036, + "grad_norm": 0.7010611295700073, + "learning_rate": 6.936846787902207e-05, + "loss": 0.658, + "step": 7350 + }, + { + "epoch": 0.7604877040710891, + "grad_norm": 1.1044212579727173, + "learning_rate": 6.880271412675645e-05, + "loss": 0.6569, + "step": 7360 + }, + { + "epoch": 0.7615209754081422, + "grad_norm": 0.8779295682907104, + "learning_rate": 6.823890855576261e-05, + "loss": 0.6382, + "step": 7370 + }, + { + "epoch": 0.7625542467451953, + "grad_norm": 0.9512449502944946, + "learning_rate": 6.76770572279167e-05, + "loss": 0.6453, + "step": 7380 + }, + { + "epoch": 0.7635875180822485, + "grad_norm": 0.782164990901947, + "learning_rate": 6.711716618408281e-05, + "loss": 0.6539, + "step": 7390 + }, + { + "epoch": 0.7646207894193016, + "grad_norm": 1.7972410917282104, + "learning_rate": 6.655924144404906e-05, + "loss": 0.6142, + "step": 7400 + }, + { + "epoch": 0.7656540607563547, + "grad_norm": 0.9399009346961975, + "learning_rate": 6.600328900646222e-05, + "loss": 0.7511, + "step": 7410 + }, + { + "epoch": 0.7666873320934078, + "grad_norm": 2.6008989810943604, + "learning_rate": 6.544931484876368e-05, + "loss": 0.5901, + "step": 7420 + }, + { + "epoch": 0.7677206034304609, + "grad_norm": 1.3178836107254028, + "learning_rate": 6.489732492712466e-05, + "loss": 0.6217, + "step": 7430 + }, + { + "epoch": 0.768753874767514, + "grad_norm": 1.1651490926742554, + "learning_rate": 6.434732517638254e-05, + "loss": 0.6472, + "step": 7440 + }, + { + "epoch": 0.7697871461045671, + "grad_norm": 1.072582721710205, + "learning_rate": 6.37993215099771e-05, + "loss": 0.6595, + "step": 7450 + }, + { + "epoch": 0.7708204174416202, + "grad_norm": 1.5162521600723267, + "learning_rate": 6.325331981988653e-05, + "loss": 0.7025, + "step": 7460 + }, + { + "epoch": 0.7718536887786733, + "grad_norm": 0.804498016834259, + "learning_rate": 6.27093259765645e-05, + "loss": 0.7168, + "step": 7470 + }, + { + "epoch": 0.7728869601157264, + "grad_norm": 1.0188984870910645, + "learning_rate": 6.216734582887678e-05, + "loss": 0.6158, + "step": 7480 + }, + { + "epoch": 0.7739202314527796, + "grad_norm": 0.9018194675445557, + "learning_rate": 6.162738520403871e-05, + "loss": 0.5752, + "step": 7490 + }, + { + "epoch": 0.7749535027898327, + "grad_norm": 0.6925551891326904, + "learning_rate": 6.108944990755202e-05, + "loss": 0.6934, + "step": 7500 + }, + { + "epoch": 0.7759867741268858, + "grad_norm": 1.3633153438568115, + "learning_rate": 6.0553545723142864e-05, + "loss": 0.6985, + "step": 7510 + }, + { + "epoch": 0.7770200454639389, + "grad_norm": 1.114071249961853, + "learning_rate": 6.001967841269937e-05, + "loss": 0.6031, + "step": 7520 + }, + { + "epoch": 0.778053316800992, + "grad_norm": 0.7959581613540649, + "learning_rate": 5.9487853716209794e-05, + "loss": 0.6645, + "step": 7530 + }, + { + "epoch": 0.7790865881380451, + "grad_norm": 0.8598949313163757, + "learning_rate": 5.89580773517008e-05, + "loss": 0.7458, + "step": 7540 + }, + { + "epoch": 0.7801198594750982, + "grad_norm": 1.4160740375518799, + "learning_rate": 5.843035501517596e-05, + "loss": 0.6635, + "step": 7550 + }, + { + "epoch": 0.7811531308121513, + "grad_norm": 1.0915048122406006, + "learning_rate": 5.790469238055465e-05, + "loss": 0.6052, + "step": 7560 + }, + { + "epoch": 0.7821864021492044, + "grad_norm": 0.8317732214927673, + "learning_rate": 5.738109509961076e-05, + "loss": 0.7007, + "step": 7570 + }, + { + "epoch": 0.7832196734862575, + "grad_norm": 0.9893842339515686, + "learning_rate": 5.685956880191218e-05, + "loss": 0.5915, + "step": 7580 + }, + { + "epoch": 0.7842529448233106, + "grad_norm": 1.2170053720474243, + "learning_rate": 5.634011909476008e-05, + "loss": 0.6921, + "step": 7590 + }, + { + "epoch": 0.7852862161603638, + "grad_norm": 1.261479377746582, + "learning_rate": 5.582275156312885e-05, + "loss": 0.6338, + "step": 7600 + }, + { + "epoch": 0.7863194874974169, + "grad_norm": 1.0150582790374756, + "learning_rate": 5.5307471769605875e-05, + "loss": 0.637, + "step": 7610 + }, + { + "epoch": 0.78735275883447, + "grad_norm": 1.148558497428894, + "learning_rate": 5.479428525433167e-05, + "loss": 0.6243, + "step": 7620 + }, + { + "epoch": 0.7883860301715231, + "grad_norm": 0.6112420558929443, + "learning_rate": 5.4283197534940684e-05, + "loss": 0.579, + "step": 7630 + }, + { + "epoch": 0.7894193015085762, + "grad_norm": 1.1648898124694824, + "learning_rate": 5.377421410650149e-05, + "loss": 0.6978, + "step": 7640 + }, + { + "epoch": 0.7904525728456293, + "grad_norm": 1.3245562314987183, + "learning_rate": 5.326734044145801e-05, + "loss": 0.5724, + "step": 7650 + }, + { + "epoch": 0.7914858441826824, + "grad_norm": 1.0554862022399902, + "learning_rate": 5.276258198957051e-05, + "loss": 0.7884, + "step": 7660 + }, + { + "epoch": 0.7925191155197355, + "grad_norm": 1.1452159881591797, + "learning_rate": 5.225994417785726e-05, + "loss": 0.7309, + "step": 7670 + }, + { + "epoch": 0.7935523868567886, + "grad_norm": 0.7298494577407837, + "learning_rate": 5.175943241053582e-05, + "loss": 0.6804, + "step": 7680 + }, + { + "epoch": 0.7945856581938417, + "grad_norm": 1.0335783958435059, + "learning_rate": 5.1261052068965306e-05, + "loss": 0.6306, + "step": 7690 + }, + { + "epoch": 0.7956189295308949, + "grad_norm": 0.6662817001342773, + "learning_rate": 5.0764808511588155e-05, + "loss": 0.6118, + "step": 7700 + }, + { + "epoch": 0.796652200867948, + "grad_norm": 0.8202494978904724, + "learning_rate": 5.027070707387296e-05, + "loss": 0.6764, + "step": 7710 + }, + { + "epoch": 0.7976854722050011, + "grad_norm": 1.5033270120620728, + "learning_rate": 4.977875306825672e-05, + "loss": 0.7725, + "step": 7720 + }, + { + "epoch": 0.7987187435420542, + "grad_norm": 0.7202991843223572, + "learning_rate": 4.928895178408782e-05, + "loss": 0.685, + "step": 7730 + }, + { + "epoch": 0.7997520148791073, + "grad_norm": 0.8215998411178589, + "learning_rate": 4.880130848756925e-05, + "loss": 0.6536, + "step": 7740 + }, + { + "epoch": 0.8007852862161604, + "grad_norm": 1.0507868528366089, + "learning_rate": 4.8315828421701986e-05, + "loss": 0.6238, + "step": 7750 + }, + { + "epoch": 0.8018185575532135, + "grad_norm": 1.0970897674560547, + "learning_rate": 4.783251680622847e-05, + "loss": 0.6302, + "step": 7760 + }, + { + "epoch": 0.8028518288902666, + "grad_norm": 1.2945756912231445, + "learning_rate": 4.7351378837576565e-05, + "loss": 0.5465, + "step": 7770 + }, + { + "epoch": 0.8038851002273197, + "grad_norm": 1.6131089925765991, + "learning_rate": 4.687241968880393e-05, + "loss": 0.6698, + "step": 7780 + }, + { + "epoch": 0.8049183715643728, + "grad_norm": 1.0525102615356445, + "learning_rate": 4.639564450954201e-05, + "loss": 0.6425, + "step": 7790 + }, + { + "epoch": 0.805951642901426, + "grad_norm": 1.0598100423812866, + "learning_rate": 4.5921058425940745e-05, + "loss": 0.5235, + "step": 7800 + }, + { + "epoch": 0.8069849142384791, + "grad_norm": 1.4403691291809082, + "learning_rate": 4.544866654061364e-05, + "loss": 0.7243, + "step": 7810 + }, + { + "epoch": 0.8080181855755322, + "grad_norm": 0.995022714138031, + "learning_rate": 4.497847393258292e-05, + "loss": 0.6382, + "step": 7820 + }, + { + "epoch": 0.8090514569125853, + "grad_norm": 1.2074267864227295, + "learning_rate": 4.451048565722468e-05, + "loss": 0.644, + "step": 7830 + }, + { + "epoch": 0.8100847282496384, + "grad_norm": 1.038830041885376, + "learning_rate": 4.404470674621463e-05, + "loss": 0.6955, + "step": 7840 + }, + { + "epoch": 0.8111179995866915, + "grad_norm": 1.028713345527649, + "learning_rate": 4.3581142207474225e-05, + "loss": 0.5929, + "step": 7850 + }, + { + "epoch": 0.8121512709237446, + "grad_norm": 1.214220404624939, + "learning_rate": 4.311979702511645e-05, + "loss": 0.6807, + "step": 7860 + }, + { + "epoch": 0.8131845422607977, + "grad_norm": 0.91230708360672, + "learning_rate": 4.266067615939234e-05, + "loss": 0.5622, + "step": 7870 + }, + { + "epoch": 0.8142178135978508, + "grad_norm": 0.7656465172767639, + "learning_rate": 4.220378454663784e-05, + "loss": 0.7191, + "step": 7880 + }, + { + "epoch": 0.815251084934904, + "grad_norm": 0.9677980542182922, + "learning_rate": 4.1749127099220684e-05, + "loss": 0.6268, + "step": 7890 + }, + { + "epoch": 0.8162843562719571, + "grad_norm": 0.8608049750328064, + "learning_rate": 4.129670870548738e-05, + "loss": 0.608, + "step": 7900 + }, + { + "epoch": 0.8173176276090102, + "grad_norm": 1.3923230171203613, + "learning_rate": 4.084653422971077e-05, + "loss": 0.6626, + "step": 7910 + }, + { + "epoch": 0.8183508989460633, + "grad_norm": 1.0703678131103516, + "learning_rate": 4.039860851203775e-05, + "loss": 0.5827, + "step": 7920 + }, + { + "epoch": 0.8193841702831164, + "grad_norm": 0.9771378040313721, + "learning_rate": 3.9952936368437275e-05, + "loss": 0.7703, + "step": 7930 + }, + { + "epoch": 0.8204174416201695, + "grad_norm": 1.0725332498550415, + "learning_rate": 3.950952259064841e-05, + "loss": 0.6425, + "step": 7940 + }, + { + "epoch": 0.8214507129572226, + "grad_norm": 1.1629067659378052, + "learning_rate": 3.9068371946128945e-05, + "loss": 0.6658, + "step": 7950 + }, + { + "epoch": 0.8224839842942757, + "grad_norm": 0.9839202165603638, + "learning_rate": 3.862948917800424e-05, + "loss": 0.5883, + "step": 7960 + }, + { + "epoch": 0.8235172556313288, + "grad_norm": 0.8677191138267517, + "learning_rate": 3.8192879005015974e-05, + "loss": 0.6664, + "step": 7970 + }, + { + "epoch": 0.8245505269683819, + "grad_norm": 0.8415852189064026, + "learning_rate": 3.775854612147153e-05, + "loss": 0.5629, + "step": 7980 + }, + { + "epoch": 0.825583798305435, + "grad_norm": 0.7151034474372864, + "learning_rate": 3.7326495197193555e-05, + "loss": 0.5575, + "step": 7990 + }, + { + "epoch": 0.8266170696424882, + "grad_norm": 1.2307301759719849, + "learning_rate": 3.689673087746975e-05, + "loss": 0.6356, + "step": 8000 + }, + { + "epoch": 0.8276503409795413, + "grad_norm": 2.110441207885742, + "learning_rate": 3.646925778300286e-05, + "loss": 0.7123, + "step": 8010 + }, + { + "epoch": 0.8286836123165944, + "grad_norm": 0.47642096877098083, + "learning_rate": 3.604408050986105e-05, + "loss": 0.6165, + "step": 8020 + }, + { + "epoch": 0.8297168836536475, + "grad_norm": 0.69282066822052, + "learning_rate": 3.5621203629428386e-05, + "loss": 0.6196, + "step": 8030 + }, + { + "epoch": 0.8307501549907006, + "grad_norm": 1.5054296255111694, + "learning_rate": 3.520063168835605e-05, + "loss": 0.749, + "step": 8040 + }, + { + "epoch": 0.8317834263277537, + "grad_norm": 1.0589144229888916, + "learning_rate": 3.478236920851283e-05, + "loss": 0.6056, + "step": 8050 + }, + { + "epoch": 0.8328166976648068, + "grad_norm": 0.8430522680282593, + "learning_rate": 3.4366420686937044e-05, + "loss": 0.6502, + "step": 8060 + }, + { + "epoch": 0.8338499690018599, + "grad_norm": 0.8304775357246399, + "learning_rate": 3.395279059578798e-05, + "loss": 0.6363, + "step": 8070 + }, + { + "epoch": 0.834883240338913, + "grad_norm": 0.5285193920135498, + "learning_rate": 3.3541483382297734e-05, + "loss": 0.751, + "step": 8080 + }, + { + "epoch": 0.8359165116759661, + "grad_norm": 0.9843568205833435, + "learning_rate": 3.313250346872362e-05, + "loss": 0.5978, + "step": 8090 + }, + { + "epoch": 0.8369497830130193, + "grad_norm": 1.0727510452270508, + "learning_rate": 3.272585525230032e-05, + "loss": 0.528, + "step": 8100 + }, + { + "epoch": 0.8379830543500724, + "grad_norm": 0.9402909874916077, + "learning_rate": 3.232154310519298e-05, + "loss": 0.5649, + "step": 8110 + }, + { + "epoch": 0.8390163256871255, + "grad_norm": 0.6825330257415771, + "learning_rate": 3.1919571374449894e-05, + "loss": 0.6619, + "step": 8120 + }, + { + "epoch": 0.8400495970241786, + "grad_norm": 1.605404019355774, + "learning_rate": 3.151994438195582e-05, + "loss": 0.7536, + "step": 8130 + }, + { + "epoch": 0.8410828683612317, + "grad_norm": 0.9325007796287537, + "learning_rate": 3.112266642438566e-05, + "loss": 0.7271, + "step": 8140 + }, + { + "epoch": 0.8421161396982848, + "grad_norm": 1.083452820777893, + "learning_rate": 3.072774177315807e-05, + "loss": 0.6664, + "step": 8150 + }, + { + "epoch": 0.8431494110353379, + "grad_norm": 0.6602939963340759, + "learning_rate": 3.0335174674389732e-05, + "loss": 0.5289, + "step": 8160 + }, + { + "epoch": 0.844182682372391, + "grad_norm": 1.497051477432251, + "learning_rate": 2.9944969348849493e-05, + "loss": 0.5557, + "step": 8170 + }, + { + "epoch": 0.8452159537094441, + "grad_norm": 0.7010759115219116, + "learning_rate": 2.9557129991913235e-05, + "loss": 0.5932, + "step": 8180 + }, + { + "epoch": 0.8462492250464972, + "grad_norm": 0.93887859582901, + "learning_rate": 2.917166077351846e-05, + "loss": 0.5421, + "step": 8190 + }, + { + "epoch": 0.8472824963835504, + "grad_norm": 1.242521047592163, + "learning_rate": 2.8788565838119757e-05, + "loss": 0.5551, + "step": 8200 + }, + { + "epoch": 0.8483157677206035, + "grad_norm": 1.1588674783706665, + "learning_rate": 2.840784930464399e-05, + "loss": 0.5647, + "step": 8210 + }, + { + "epoch": 0.8493490390576566, + "grad_norm": 1.381052851676941, + "learning_rate": 2.8029515266446232e-05, + "loss": 0.6187, + "step": 8220 + }, + { + "epoch": 0.8503823103947097, + "grad_norm": 1.1169166564941406, + "learning_rate": 2.765356779126557e-05, + "loss": 0.6038, + "step": 8230 + }, + { + "epoch": 0.8514155817317628, + "grad_norm": 1.2974213361740112, + "learning_rate": 2.7280010921181487e-05, + "loss": 0.7017, + "step": 8240 + }, + { + "epoch": 0.8524488530688159, + "grad_norm": 0.9372439384460449, + "learning_rate": 2.690884867257032e-05, + "loss": 0.5887, + "step": 8250 + }, + { + "epoch": 0.853482124405869, + "grad_norm": 0.8471134305000305, + "learning_rate": 2.6540085036062227e-05, + "loss": 0.5159, + "step": 8260 + }, + { + "epoch": 0.8545153957429221, + "grad_norm": 1.077082633972168, + "learning_rate": 2.617372397649814e-05, + "loss": 0.6389, + "step": 8270 + }, + { + "epoch": 0.8555486670799752, + "grad_norm": 1.1442515850067139, + "learning_rate": 2.5809769432886986e-05, + "loss": 0.6233, + "step": 8280 + }, + { + "epoch": 0.8565819384170283, + "grad_norm": 1.59578537940979, + "learning_rate": 2.5448225318363828e-05, + "loss": 0.6336, + "step": 8290 + }, + { + "epoch": 0.8576152097540815, + "grad_norm": 0.7789118885993958, + "learning_rate": 2.5089095520147266e-05, + "loss": 0.6106, + "step": 8300 + }, + { + "epoch": 0.8586484810911346, + "grad_norm": 0.8877026438713074, + "learning_rate": 2.4732383899497972e-05, + "loss": 0.6226, + "step": 8310 + }, + { + "epoch": 0.8596817524281877, + "grad_norm": 0.9339987635612488, + "learning_rate": 2.4378094291676978e-05, + "loss": 0.5998, + "step": 8320 + }, + { + "epoch": 0.8607150237652408, + "grad_norm": 0.45565560460090637, + "learning_rate": 2.4026230505904633e-05, + "loss": 0.5463, + "step": 8330 + }, + { + "epoch": 0.8617482951022939, + "grad_norm": 0.7657277584075928, + "learning_rate": 2.367679632531955e-05, + "loss": 0.6507, + "step": 8340 + }, + { + "epoch": 0.862781566439347, + "grad_norm": 1.4094293117523193, + "learning_rate": 2.3329795506937728e-05, + "loss": 0.5541, + "step": 8350 + }, + { + "epoch": 0.8638148377764001, + "grad_norm": 0.5916699171066284, + "learning_rate": 2.2985231781612466e-05, + "loss": 0.6416, + "step": 8360 + }, + { + "epoch": 0.8648481091134532, + "grad_norm": 0.7877978086471558, + "learning_rate": 2.2643108853994317e-05, + "loss": 0.6413, + "step": 8370 + }, + { + "epoch": 0.8658813804505063, + "grad_norm": 1.3543034791946411, + "learning_rate": 2.2303430402490805e-05, + "loss": 0.6255, + "step": 8380 + }, + { + "epoch": 0.8669146517875594, + "grad_norm": 1.2339602708816528, + "learning_rate": 2.1966200079227257e-05, + "loss": 0.6086, + "step": 8390 + }, + { + "epoch": 0.8679479231246126, + "grad_norm": 1.563103199005127, + "learning_rate": 2.1631421510007575e-05, + "loss": 0.6084, + "step": 8400 + }, + { + "epoch": 0.8689811944616657, + "grad_norm": 1.1265037059783936, + "learning_rate": 2.129909829427479e-05, + "loss": 0.5604, + "step": 8410 + }, + { + "epoch": 0.8700144657987188, + "grad_norm": 1.4551678895950317, + "learning_rate": 2.0969234005072916e-05, + "loss": 0.6835, + "step": 8420 + }, + { + "epoch": 0.8710477371357719, + "grad_norm": 0.9663675427436829, + "learning_rate": 2.064183218900817e-05, + "loss": 0.6, + "step": 8430 + }, + { + "epoch": 0.872081008472825, + "grad_norm": 1.2805395126342773, + "learning_rate": 2.0316896366211118e-05, + "loss": 0.6471, + "step": 8440 + }, + { + "epoch": 0.8731142798098781, + "grad_norm": 0.9442639946937561, + "learning_rate": 1.9994430030298496e-05, + "loss": 0.6104, + "step": 8450 + }, + { + "epoch": 0.8741475511469312, + "grad_norm": 1.5761253833770752, + "learning_rate": 1.967443664833596e-05, + "loss": 0.6629, + "step": 8460 + }, + { + "epoch": 0.8751808224839843, + "grad_norm": 1.1689879894256592, + "learning_rate": 1.9356919660800553e-05, + "loss": 0.6161, + "step": 8470 + }, + { + "epoch": 0.8762140938210374, + "grad_norm": 1.1315240859985352, + "learning_rate": 1.9041882481543955e-05, + "loss": 0.574, + "step": 8480 + }, + { + "epoch": 0.8772473651580905, + "grad_norm": 1.0627079010009766, + "learning_rate": 1.8729328497755578e-05, + "loss": 0.6021, + "step": 8490 + }, + { + "epoch": 0.8782806364951437, + "grad_norm": 0.9222522377967834, + "learning_rate": 1.8419261069926197e-05, + "loss": 0.5026, + "step": 8500 + }, + { + "epoch": 0.8793139078321968, + "grad_norm": 0.92143315076828, + "learning_rate": 1.8111683531812002e-05, + "loss": 0.5853, + "step": 8510 + }, + { + "epoch": 0.8803471791692499, + "grad_norm": 1.042581558227539, + "learning_rate": 1.7806599190398455e-05, + "loss": 0.5239, + "step": 8520 + }, + { + "epoch": 0.881380450506303, + "grad_norm": 1.405364751815796, + "learning_rate": 1.750401132586496e-05, + "loss": 0.571, + "step": 8530 + }, + { + "epoch": 0.8824137218433561, + "grad_norm": 1.2399895191192627, + "learning_rate": 1.720392319154948e-05, + "loss": 0.5704, + "step": 8540 + }, + { + "epoch": 0.8834469931804092, + "grad_norm": 0.9870766401290894, + "learning_rate": 1.6906338013913585e-05, + "loss": 0.5944, + "step": 8550 + }, + { + "epoch": 0.8844802645174623, + "grad_norm": 0.8951932787895203, + "learning_rate": 1.6611258992507817e-05, + "loss": 0.5861, + "step": 8560 + }, + { + "epoch": 0.8855135358545154, + "grad_norm": 1.8214749097824097, + "learning_rate": 1.6318689299937213e-05, + "loss": 0.6643, + "step": 8570 + }, + { + "epoch": 0.8865468071915685, + "grad_norm": 0.8029395341873169, + "learning_rate": 1.6028632081827173e-05, + "loss": 0.5645, + "step": 8580 + }, + { + "epoch": 0.8875800785286216, + "grad_norm": 0.43516650795936584, + "learning_rate": 1.574109045678987e-05, + "loss": 0.5996, + "step": 8590 + }, + { + "epoch": 0.8886133498656748, + "grad_norm": 1.673608422279358, + "learning_rate": 1.545606751639034e-05, + "loss": 0.6781, + "step": 8600 + }, + { + "epoch": 0.8896466212027279, + "grad_norm": 1.0479223728179932, + "learning_rate": 1.5173566325113575e-05, + "loss": 0.5522, + "step": 8610 + }, + { + "epoch": 0.890679892539781, + "grad_norm": 1.2102528810501099, + "learning_rate": 1.4893589920331352e-05, + "loss": 0.5926, + "step": 8620 + }, + { + "epoch": 0.8917131638768341, + "grad_norm": 1.0137587785720825, + "learning_rate": 1.4616141312269737e-05, + "loss": 0.5393, + "step": 8630 + }, + { + "epoch": 0.8927464352138872, + "grad_norm": 2.0007264614105225, + "learning_rate": 1.4341223483976624e-05, + "loss": 0.6629, + "step": 8640 + }, + { + "epoch": 0.8937797065509403, + "grad_norm": 0.966791570186615, + "learning_rate": 1.406883939128964e-05, + "loss": 0.6255, + "step": 8650 + }, + { + "epoch": 0.8948129778879934, + "grad_norm": 1.436010479927063, + "learning_rate": 1.3798991962804541e-05, + "loss": 0.6226, + "step": 8660 + }, + { + "epoch": 0.8958462492250465, + "grad_norm": 1.060235857963562, + "learning_rate": 1.3531684099843483e-05, + "loss": 0.6685, + "step": 8670 + }, + { + "epoch": 0.8968795205620996, + "grad_norm": 0.578377902507782, + "learning_rate": 1.3266918676423962e-05, + "loss": 0.5681, + "step": 8680 + }, + { + "epoch": 0.8979127918991527, + "grad_norm": 0.5564404726028442, + "learning_rate": 1.300469853922795e-05, + "loss": 0.587, + "step": 8690 + }, + { + "epoch": 0.8989460632362059, + "grad_norm": 0.9987608194351196, + "learning_rate": 1.2745026507571145e-05, + "loss": 0.65, + "step": 8700 + }, + { + "epoch": 0.899979334573259, + "grad_norm": 1.4747377634048462, + "learning_rate": 1.2487905373372799e-05, + "loss": 0.6024, + "step": 8710 + }, + { + "epoch": 0.9010126059103121, + "grad_norm": 0.7746207118034363, + "learning_rate": 1.223333790112563e-05, + "loss": 0.6037, + "step": 8720 + }, + { + "epoch": 0.9020458772473652, + "grad_norm": 1.0953818559646606, + "learning_rate": 1.1981326827866124e-05, + "loss": 0.6469, + "step": 8730 + }, + { + "epoch": 0.9030791485844183, + "grad_norm": 1.1232373714447021, + "learning_rate": 1.1731874863145143e-05, + "loss": 0.591, + "step": 8740 + }, + { + "epoch": 0.9041124199214714, + "grad_norm": 2.062878131866455, + "learning_rate": 1.148498468899864e-05, + "loss": 0.6622, + "step": 8750 + }, + { + "epoch": 0.9051456912585245, + "grad_norm": 1.1355712413787842, + "learning_rate": 1.1240658959918965e-05, + "loss": 0.762, + "step": 8760 + }, + { + "epoch": 0.9061789625955776, + "grad_norm": 1.6199957132339478, + "learning_rate": 1.0998900302826382e-05, + "loss": 0.6976, + "step": 8770 + }, + { + "epoch": 0.9072122339326307, + "grad_norm": 1.111836552619934, + "learning_rate": 1.0759711317040649e-05, + "loss": 0.5116, + "step": 8780 + }, + { + "epoch": 0.9082455052696838, + "grad_norm": 1.650025725364685, + "learning_rate": 1.0523094574253101e-05, + "loss": 0.5384, + "step": 8790 + }, + { + "epoch": 0.909278776606737, + "grad_norm": 0.9917317628860474, + "learning_rate": 1.0289052618499218e-05, + "loss": 0.5526, + "step": 8800 + }, + { + "epoch": 0.9103120479437901, + "grad_norm": 0.618635356426239, + "learning_rate": 1.005758796613096e-05, + "loss": 0.6995, + "step": 8810 + }, + { + "epoch": 0.9113453192808432, + "grad_norm": 1.289839506149292, + "learning_rate": 9.828703105789983e-06, + "loss": 0.6453, + "step": 8820 + }, + { + "epoch": 0.9123785906178963, + "grad_norm": 1.1484878063201904, + "learning_rate": 9.602400498380542e-06, + "loss": 0.5949, + "step": 8830 + }, + { + "epoch": 0.9134118619549494, + "grad_norm": 1.1618013381958008, + "learning_rate": 9.378682577043524e-06, + "loss": 0.633, + "step": 8840 + }, + { + "epoch": 0.9144451332920025, + "grad_norm": 0.8447564840316772, + "learning_rate": 9.157551747129844e-06, + "loss": 0.6921, + "step": 8850 + }, + { + "epoch": 0.9154784046290556, + "grad_norm": 2.2064759731292725, + "learning_rate": 8.939010386174783e-06, + "loss": 0.5906, + "step": 8860 + }, + { + "epoch": 0.9165116759661087, + "grad_norm": 1.3931576013565063, + "learning_rate": 8.723060843872393e-06, + "loss": 0.6567, + "step": 8870 + }, + { + "epoch": 0.9175449473031618, + "grad_norm": 0.8289423584938049, + "learning_rate": 8.50970544205032e-06, + "loss": 0.6287, + "step": 8880 + }, + { + "epoch": 0.9185782186402149, + "grad_norm": 0.911371111869812, + "learning_rate": 8.298946474644575e-06, + "loss": 0.541, + "step": 8890 + }, + { + "epoch": 0.919611489977268, + "grad_norm": 1.081002950668335, + "learning_rate": 8.090786207675171e-06, + "loss": 0.6585, + "step": 8900 + }, + { + "epoch": 0.9206447613143212, + "grad_norm": 1.1589564085006714, + "learning_rate": 7.885226879221691e-06, + "loss": 0.644, + "step": 8910 + }, + { + "epoch": 0.9216780326513743, + "grad_norm": 0.6146034002304077, + "learning_rate": 7.682270699399057e-06, + "loss": 0.525, + "step": 8920 + }, + { + "epoch": 0.9227113039884274, + "grad_norm": 0.8697605729103088, + "learning_rate": 7.481919850333946e-06, + "loss": 0.5737, + "step": 8930 + }, + { + "epoch": 0.9237445753254805, + "grad_norm": 1.1086236238479614, + "learning_rate": 7.284176486141214e-06, + "loss": 0.5475, + "step": 8940 + }, + { + "epoch": 0.9247778466625336, + "grad_norm": 1.8016564846038818, + "learning_rate": 7.0890427329008964e-06, + "loss": 0.6598, + "step": 8950 + }, + { + "epoch": 0.9258111179995867, + "grad_norm": 1.1818935871124268, + "learning_rate": 6.896520688635111e-06, + "loss": 0.634, + "step": 8960 + }, + { + "epoch": 0.9268443893366398, + "grad_norm": 1.2800990343093872, + "learning_rate": 6.706612423285685e-06, + "loss": 0.6006, + "step": 8970 + }, + { + "epoch": 0.9278776606736929, + "grad_norm": 1.0055065155029297, + "learning_rate": 6.519319978691845e-06, + "loss": 0.5678, + "step": 8980 + }, + { + "epoch": 0.928910932010746, + "grad_norm": 2.050903081893921, + "learning_rate": 6.334645368568315e-06, + "loss": 0.6205, + "step": 8990 + }, + { + "epoch": 0.9299442033477991, + "grad_norm": 1.3951302766799927, + "learning_rate": 6.152590578483497e-06, + "loss": 0.5148, + "step": 9000 + }, + { + "epoch": 0.9309774746848523, + "grad_norm": 2.197643995285034, + "learning_rate": 5.973157565838327e-06, + "loss": 0.6931, + "step": 9010 + }, + { + "epoch": 0.9320107460219054, + "grad_norm": 1.161935567855835, + "learning_rate": 5.796348259845096e-06, + "loss": 0.5684, + "step": 9020 + }, + { + "epoch": 0.9330440173589585, + "grad_norm": 1.7656304836273193, + "learning_rate": 5.622164561506826e-06, + "loss": 0.6114, + "step": 9030 + }, + { + "epoch": 0.9340772886960116, + "grad_norm": 0.9781146049499512, + "learning_rate": 5.450608343596647e-06, + "loss": 0.5743, + "step": 9040 + }, + { + "epoch": 0.9351105600330647, + "grad_norm": 0.7141128182411194, + "learning_rate": 5.281681450637843e-06, + "loss": 0.5756, + "step": 9050 + }, + { + "epoch": 0.9361438313701178, + "grad_norm": 1.8938722610473633, + "learning_rate": 5.115385698883979e-06, + "loss": 0.5679, + "step": 9060 + }, + { + "epoch": 0.9371771027071709, + "grad_norm": 1.8188812732696533, + "learning_rate": 4.951722876299275e-06, + "loss": 0.744, + "step": 9070 + }, + { + "epoch": 0.938210374044224, + "grad_norm": 0.8902860879898071, + "learning_rate": 4.790694742539487e-06, + "loss": 0.6274, + "step": 9080 + }, + { + "epoch": 0.9392436453812771, + "grad_norm": 2.0708553791046143, + "learning_rate": 4.632303028932943e-06, + "loss": 0.6173, + "step": 9090 + }, + { + "epoch": 0.9402769167183302, + "grad_norm": 1.710466742515564, + "learning_rate": 4.476549438461958e-06, + "loss": 0.6267, + "step": 9100 + }, + { + "epoch": 0.9413101880553834, + "grad_norm": 0.8895155787467957, + "learning_rate": 4.323435645744445e-06, + "loss": 0.7292, + "step": 9110 + }, + { + "epoch": 0.9423434593924365, + "grad_norm": 1.9920309782028198, + "learning_rate": 4.172963297015997e-06, + "loss": 0.6778, + "step": 9120 + }, + { + "epoch": 0.9433767307294896, + "grad_norm": 1.4172946214675903, + "learning_rate": 4.025134010112258e-06, + "loss": 0.5764, + "step": 9130 + }, + { + "epoch": 0.9444100020665427, + "grad_norm": 0.9763116836547852, + "learning_rate": 3.879949374451269e-06, + "loss": 0.5936, + "step": 9140 + }, + { + "epoch": 0.9454432734035958, + "grad_norm": 0.7206150889396667, + "learning_rate": 3.737410951016623e-06, + "loss": 0.6432, + "step": 9150 + }, + { + "epoch": 0.9464765447406489, + "grad_norm": 1.7434405088424683, + "learning_rate": 3.5975202723405874e-06, + "loss": 0.6962, + "step": 9160 + }, + { + "epoch": 0.947509816077702, + "grad_norm": 1.5172473192214966, + "learning_rate": 3.4602788424876464e-06, + "loss": 0.6048, + "step": 9170 + }, + { + "epoch": 0.9485430874147551, + "grad_norm": 1.1150609254837036, + "learning_rate": 3.3256881370383183e-06, + "loss": 0.5865, + "step": 9180 + }, + { + "epoch": 0.9495763587518082, + "grad_norm": 0.8272204995155334, + "learning_rate": 3.1937496030732536e-06, + "loss": 0.7075, + "step": 9190 + }, + { + "epoch": 0.9506096300888613, + "grad_norm": 0.9530496597290039, + "learning_rate": 3.0644646591577174e-06, + "loss": 0.5166, + "step": 9200 + }, + { + "epoch": 0.9516429014259145, + "grad_norm": 0.7508150935173035, + "learning_rate": 2.9378346953264633e-06, + "loss": 0.5569, + "step": 9210 + }, + { + "epoch": 0.9526761727629676, + "grad_norm": 1.5538454055786133, + "learning_rate": 2.8138610730684686e-06, + "loss": 0.6117, + "step": 9220 + }, + { + "epoch": 0.9537094441000207, + "grad_norm": 1.6959922313690186, + "learning_rate": 2.692545125312612e-06, + "loss": 0.5843, + "step": 9230 + }, + { + "epoch": 0.9547427154370738, + "grad_norm": 1.9940367937088013, + "learning_rate": 2.573888156413212e-06, + "loss": 0.6021, + "step": 9240 + }, + { + "epoch": 0.9557759867741269, + "grad_norm": 1.3337304592132568, + "learning_rate": 2.4578914421359288e-06, + "loss": 0.525, + "step": 9250 + }, + { + "epoch": 0.95680925811118, + "grad_norm": 0.8987991213798523, + "learning_rate": 2.344556229644218e-06, + "loss": 0.6227, + "step": 9260 + }, + { + "epoch": 0.9578425294482331, + "grad_norm": 0.8351725339889526, + "learning_rate": 2.233883737485731e-06, + "loss": 0.5019, + "step": 9270 + }, + { + "epoch": 0.9588758007852862, + "grad_norm": 0.8255094289779663, + "learning_rate": 2.1258751555794654e-06, + "loss": 0.6226, + "step": 9280 + }, + { + "epoch": 0.9599090721223393, + "grad_norm": 0.882256269454956, + "learning_rate": 2.020531645202689e-06, + "loss": 0.4691, + "step": 9290 + }, + { + "epoch": 0.9609423434593924, + "grad_norm": 1.3614681959152222, + "learning_rate": 1.9178543389786475e-06, + "loss": 0.6947, + "step": 9300 + }, + { + "epoch": 0.9619756147964456, + "grad_norm": 1.0351027250289917, + "learning_rate": 1.8178443408642386e-06, + "loss": 0.5705, + "step": 9310 + }, + { + "epoch": 0.9630088861334987, + "grad_norm": 1.4347681999206543, + "learning_rate": 1.7205027261383565e-06, + "loss": 0.659, + "step": 9320 + }, + { + "epoch": 0.9640421574705518, + "grad_norm": 0.7792657017707825, + "learning_rate": 1.625830541390122e-06, + "loss": 0.5753, + "step": 9330 + }, + { + "epoch": 0.9650754288076049, + "grad_norm": 2.364046812057495, + "learning_rate": 1.5338288045076697e-06, + "loss": 0.626, + "step": 9340 + }, + { + "epoch": 0.966108700144658, + "grad_norm": 0.6518005728721619, + "learning_rate": 1.4444985046673798e-06, + "loss": 0.5726, + "step": 9350 + }, + { + "epoch": 0.9671419714817111, + "grad_norm": 0.8066127896308899, + "learning_rate": 1.3578406023229962e-06, + "loss": 0.6068, + "step": 9360 + }, + { + "epoch": 0.9681752428187642, + "grad_norm": 0.918552815914154, + "learning_rate": 1.2738560291954415e-06, + "loss": 0.5854, + "step": 9370 + }, + { + "epoch": 0.9692085141558173, + "grad_norm": 0.840164840221405, + "learning_rate": 1.1925456882627417e-06, + "loss": 0.5582, + "step": 9380 + }, + { + "epoch": 0.9702417854928704, + "grad_norm": 1.3730981349945068, + "learning_rate": 1.113910453750394e-06, + "loss": 0.6132, + "step": 9390 + }, + { + "epoch": 0.9712750568299235, + "grad_norm": 0.9575563669204712, + "learning_rate": 1.0379511711219313e-06, + "loss": 0.552, + "step": 9400 + }, + { + "epoch": 0.9723083281669767, + "grad_norm": 1.423699975013733, + "learning_rate": 9.646686570697061e-07, + "loss": 0.6279, + "step": 9410 + }, + { + "epoch": 0.9733415995040298, + "grad_norm": 1.0341185331344604, + "learning_rate": 8.9406369950637e-07, + "loss": 0.6182, + "step": 9420 + }, + { + "epoch": 0.9743748708410829, + "grad_norm": 0.9466649889945984, + "learning_rate": 8.261370575561866e-07, + "loss": 0.6104, + "step": 9430 + }, + { + "epoch": 0.975408142178136, + "grad_norm": 1.4443756341934204, + "learning_rate": 7.608894615468709e-07, + "loss": 0.6099, + "step": 9440 + }, + { + "epoch": 0.9764414135151891, + "grad_norm": 1.057411551475525, + "learning_rate": 6.983216130019288e-07, + "loss": 0.621, + "step": 9450 + }, + { + "epoch": 0.9774746848522422, + "grad_norm": 1.0595217943191528, + "learning_rate": 6.384341846329134e-07, + "loss": 0.6057, + "step": 9460 + }, + { + "epoch": 0.9785079561892953, + "grad_norm": 1.049188494682312, + "learning_rate": 5.812278203322918e-07, + "loss": 0.5873, + "step": 9470 + }, + { + "epoch": 0.9795412275263484, + "grad_norm": 1.212692379951477, + "learning_rate": 5.267031351664786e-07, + "loss": 0.6586, + "step": 9480 + }, + { + "epoch": 0.9805744988634015, + "grad_norm": 0.6824125647544861, + "learning_rate": 4.7486071536925745e-07, + "loss": 0.5165, + "step": 9490 + }, + { + "epoch": 0.9816077702004546, + "grad_norm": 0.6535896062850952, + "learning_rate": 4.257011183354809e-07, + "loss": 0.568, + "step": 9500 + }, + { + "epoch": 0.9826410415375078, + "grad_norm": 1.9366480112075806, + "learning_rate": 3.792248726150471e-07, + "loss": 0.5708, + "step": 9510 + }, + { + "epoch": 0.9836743128745609, + "grad_norm": 0.8946526646614075, + "learning_rate": 3.354324779071827e-07, + "loss": 0.499, + "step": 9520 + }, + { + "epoch": 0.984707584211614, + "grad_norm": 1.2145295143127441, + "learning_rate": 2.9432440505522406e-07, + "loss": 0.6079, + "step": 9530 + }, + { + "epoch": 0.9857408555486671, + "grad_norm": 0.948176383972168, + "learning_rate": 2.559010960413444e-07, + "loss": 0.5659, + "step": 9540 + }, + { + "epoch": 0.9867741268857202, + "grad_norm": 0.8348559141159058, + "learning_rate": 2.201629639819458e-07, + "loss": 0.6056, + "step": 9550 + }, + { + "epoch": 0.9878073982227733, + "grad_norm": 1.4996442794799805, + "learning_rate": 1.87110393123191e-07, + "loss": 0.521, + "step": 9560 + }, + { + "epoch": 0.9888406695598264, + "grad_norm": 0.999030590057373, + "learning_rate": 1.567437388368398e-07, + "loss": 0.6173, + "step": 9570 + }, + { + "epoch": 0.9898739408968795, + "grad_norm": 2.4354379177093506, + "learning_rate": 1.2906332761647433e-07, + "loss": 0.6978, + "step": 9580 + }, + { + "epoch": 0.9909072122339326, + "grad_norm": 1.067911148071289, + "learning_rate": 1.0406945707391868e-07, + "loss": 0.5733, + "step": 9590 + }, + { + "epoch": 0.9919404835709857, + "grad_norm": 0.8200865387916565, + "learning_rate": 8.176239593610246e-08, + "loss": 0.602, + "step": 9600 + }, + { + "epoch": 0.9929737549080389, + "grad_norm": 0.7436734437942505, + "learning_rate": 6.214238404214645e-08, + "loss": 0.6507, + "step": 9610 + }, + { + "epoch": 0.994007026245092, + "grad_norm": 1.0331168174743652, + "learning_rate": 4.520963234083686e-08, + "loss": 0.6224, + "step": 9620 + }, + { + "epoch": 0.9950402975821451, + "grad_norm": 1.3714491128921509, + "learning_rate": 3.09643228882106e-08, + "loss": 0.6278, + "step": 9630 + }, + { + "epoch": 0.9960735689191982, + "grad_norm": 1.478288173675537, + "learning_rate": 1.940660884577894e-08, + "loss": 0.6508, + "step": 9640 + }, + { + "epoch": 0.9971068402562513, + "grad_norm": 1.7626447677612305, + "learning_rate": 1.053661447877885e-08, + "loss": 0.5287, + "step": 9650 + }, + { + "epoch": 0.9981401115933044, + "grad_norm": 1.4299039840698242, + "learning_rate": 4.354435154813042e-09, + "loss": 0.6035, + "step": 9660 + }, + { + "epoch": 0.9991733829303575, + "grad_norm": 0.8753514885902405, + "learning_rate": 8.601373429339976e-10, + "loss": 0.6028, + "step": 9670 + }, + { + "epoch": 1.0, + "eval_loss": 2.729660987854004, + "eval_runtime": 5409.1379, + "eval_samples_per_second": 9.159, + "eval_steps_per_second": 0.286, + "step": 9678 + }, + { + "epoch": 1.0, + "step": 9678, + "total_flos": 0.0, + "train_loss": 0.9993063533914609, + "train_runtime": 82593.2232, + "train_samples_per_second": 7.499, + "train_steps_per_second": 0.117 + } + ], + "logging_steps": 10, + "max_steps": 9678, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}